//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

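// Reduce the element count so each resulting piece is at most 64 bits wide;
// e.g. a v4s32 (128-bit) value would be handled as two v2s32 pieces.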
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

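  // fneg and fabs are cheap on AMDGPU (they are usually folded into source
  // modifiers), so they are kept legal for the full set of FP types.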
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
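  // 64-bit integer sources have no single conversion instruction; they are
  // custom-lowered (for s64 results) or expanded (for s32 results) below.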
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .scalarize(0)
    .alwaysLegal();

  // TODO: Clamp mask to pointer sizes
  getActionDefinitionsBuilder(G_PTRMASK)
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

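    // Without 16-bit instructions, min/max is only legal at 32 bits; narrower
    // types are promoted to s32 first.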
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

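// Return a 32-bit register holding the aperture (the high half of a 64-bit
// flat address) for the given LDS or scratch address space, read either from
// the aperture registers or from the queue pointer. Used when expanding an
// addrspacecast to the flat address space.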
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
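    // Casting to the 32-bit constant address space just keeps the low 32 bits
    // of the source pointer.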
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
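  // Adding and then subtracting 2^52 (with the sign of the input) rounds the
  // value to an integer in the FP unit; inputs whose magnitude is already at
  // least 2^52 are integers and are passed through by the select below.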
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

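  // Mask off the fraction bits that lie below the exponent: a negative
  // exponent leaves only the sign, and an exponent above 51 means the value is
  // already an integer.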
1698 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1699 1700 auto Shr = B.buildAShr(S64, FractMask, Exp); 1701 auto Not = B.buildNot(S64, Shr); 1702 auto Tmp0 = B.buildAnd(S64, Src, Not); 1703 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1704 1705 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1706 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1707 1708 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1709 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1710 return true; 1711 } 1712 1713 bool AMDGPULegalizerInfo::legalizeITOFP( 1714 MachineInstr &MI, MachineRegisterInfo &MRI, 1715 MachineIRBuilder &B, bool Signed) const { 1716 B.setInstr(MI); 1717 1718 Register Dst = MI.getOperand(0).getReg(); 1719 Register Src = MI.getOperand(1).getReg(); 1720 1721 const LLT S64 = LLT::scalar(64); 1722 const LLT S32 = LLT::scalar(32); 1723 1724 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1725 1726 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1727 1728 auto CvtHi = Signed ? 1729 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1730 B.buildUITOFP(S64, Unmerge.getReg(1)); 1731 1732 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1733 1734 auto ThirtyTwo = B.buildConstant(S32, 32); 1735 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1736 .addUse(CvtHi.getReg(0)) 1737 .addUse(ThirtyTwo.getReg(0)); 1738 1739 // TODO: Should this propagate fast-math-flags? 1740 B.buildFAdd(Dst, LdExp, CvtLo); 1741 MI.eraseFromParent(); 1742 return true; 1743 } 1744 1745 // TODO: Copied from DAG implementation. Verify logic and document how this 1746 // actually works. 1747 bool AMDGPULegalizerInfo::legalizeFPTOI( 1748 MachineInstr &MI, MachineRegisterInfo &MRI, 1749 MachineIRBuilder &B, bool Signed) const { 1750 B.setInstr(MI); 1751 1752 Register Dst = MI.getOperand(0).getReg(); 1753 Register Src = MI.getOperand(1).getReg(); 1754 1755 const LLT S64 = LLT::scalar(64); 1756 const LLT S32 = LLT::scalar(32); 1757 1758 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1759 1760 unsigned Flags = MI.getFlags(); 1761 1762 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1763 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1764 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1765 1766 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1767 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1768 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1769 1770 auto Hi = Signed ? 
1771 B.buildFPTOSI(S32, FloorMul) : 1772 B.buildFPTOUI(S32, FloorMul); 1773 auto Lo = B.buildFPTOUI(S32, Fma); 1774 1775 B.buildMerge(Dst, { Lo, Hi }); 1776 MI.eraseFromParent(); 1777 1778 return true; 1779 } 1780 1781 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1782 MachineInstr &MI, MachineRegisterInfo &MRI, 1783 MachineIRBuilder &B) const { 1784 MachineFunction &MF = B.getMF(); 1785 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1786 1787 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1788 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1789 1790 // With ieee_mode disabled, the instructions have the correct behavior 1791 // already for G_FMINNUM/G_FMAXNUM 1792 if (!MFI->getMode().IEEE) 1793 return !IsIEEEOp; 1794 1795 if (IsIEEEOp) 1796 return true; 1797 1798 MachineIRBuilder HelperBuilder(MI); 1799 GISelObserverWrapper DummyObserver; 1800 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1801 HelperBuilder.setInstr(MI); 1802 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1803 } 1804 1805 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1806 MachineInstr &MI, MachineRegisterInfo &MRI, 1807 MachineIRBuilder &B) const { 1808 // TODO: Should move some of this into LegalizerHelper. 1809 1810 // TODO: Promote dynamic indexing of s16 to s32 1811 1812 // FIXME: Artifact combiner probably should have replaced the truncated 1813 // constant before this, so we shouldn't need 1814 // getConstantVRegValWithLookThrough. 1815 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1816 MI.getOperand(2).getReg(), MRI); 1817 if (!IdxVal) // Dynamic case will be selected to register indexing. 1818 return true; 1819 1820 Register Dst = MI.getOperand(0).getReg(); 1821 Register Vec = MI.getOperand(1).getReg(); 1822 1823 LLT VecTy = MRI.getType(Vec); 1824 LLT EltTy = VecTy.getElementType(); 1825 assert(EltTy == MRI.getType(Dst)); 1826 1827 B.setInstr(MI); 1828 1829 if (IdxVal->Value < VecTy.getNumElements()) 1830 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1831 else 1832 B.buildUndef(Dst); 1833 1834 MI.eraseFromParent(); 1835 return true; 1836 } 1837 1838 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1839 MachineInstr &MI, MachineRegisterInfo &MRI, 1840 MachineIRBuilder &B) const { 1841 // TODO: Should move some of this into LegalizerHelper. 1842 1843 // TODO: Promote dynamic indexing of s16 to s32 1844 1845 // FIXME: Artifact combiner probably should have replaced the truncated 1846 // constant before this, so we shouldn't need 1847 // getConstantVRegValWithLookThrough. 1848 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1849 MI.getOperand(3).getReg(), MRI); 1850 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1851 return true; 1852 1853 Register Dst = MI.getOperand(0).getReg(); 1854 Register Vec = MI.getOperand(1).getReg(); 1855 Register Ins = MI.getOperand(2).getReg(); 1856 1857 LLT VecTy = MRI.getType(Vec); 1858 LLT EltTy = VecTy.getElementType(); 1859 assert(EltTy == MRI.getType(Ins)); 1860 1861 B.setInstr(MI); 1862 1863 if (IdxVal->Value < VecTy.getNumElements()) 1864 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1865 else 1866 B.buildUndef(Dst); 1867 1868 MI.eraseFromParent(); 1869 return true; 1870 } 1871 1872 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1873 MachineInstr &MI, MachineRegisterInfo &MRI, 1874 MachineIRBuilder &B) const { 1875 const LLT V2S16 = LLT::vector(2, 16); 1876 1877 Register Dst = MI.getOperand(0).getReg(); 1878 Register Src0 = MI.getOperand(1).getReg(); 1879 LLT DstTy = MRI.getType(Dst); 1880 LLT SrcTy = MRI.getType(Src0); 1881 1882 if (SrcTy == V2S16 && DstTy == V2S16 && 1883 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1884 return true; 1885 1886 MachineIRBuilder HelperBuilder(MI); 1887 GISelObserverWrapper DummyObserver; 1888 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1889 HelperBuilder.setInstr(MI); 1890 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1891 } 1892 1893 bool AMDGPULegalizerInfo::legalizeSinCos( 1894 MachineInstr &MI, MachineRegisterInfo &MRI, 1895 MachineIRBuilder &B) const { 1896 B.setInstr(MI); 1897 1898 Register DstReg = MI.getOperand(0).getReg(); 1899 Register SrcReg = MI.getOperand(1).getReg(); 1900 LLT Ty = MRI.getType(DstReg); 1901 unsigned Flags = MI.getFlags(); 1902 1903 Register TrigVal; 1904 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1905 if (ST.hasTrigReducedRange()) { 1906 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1907 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1908 .addUse(MulVal.getReg(0)) 1909 .setMIFlags(Flags).getReg(0); 1910 } else 1911 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1912 1913 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1914 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1915 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1916 .addUse(TrigVal) 1917 .setMIFlags(Flags); 1918 MI.eraseFromParent(); 1919 return true; 1920 } 1921 1922 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1923 Register DstReg, LLT PtrTy, 1924 MachineIRBuilder &B, const GlobalValue *GV, 1925 unsigned Offset, unsigned GAFlags) const { 1926 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1927 // to the following code sequence: 1928 // 1929 // For constant address space: 1930 // s_getpc_b64 s[0:1] 1931 // s_add_u32 s0, s0, $symbol 1932 // s_addc_u32 s1, s1, 0 1933 // 1934 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1935 // a fixup or relocation is emitted to replace $symbol with a literal 1936 // constant, which is a pc-relative offset from the encoding of the $symbol 1937 // operand to the global variable. 
1938 // 1939 // For global address space: 1940 // s_getpc_b64 s[0:1] 1941 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1942 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1943 // 1944 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1945 // fixups or relocations are emitted to replace $symbol@*@lo and 1946 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1947 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1948 // operand to the global variable. 1949 // 1950 // What we want here is an offset from the value returned by s_getpc 1951 // (which is the address of the s_add_u32 instruction) to the global 1952 // variable, but since the encoding of $symbol starts 4 bytes after the start 1953 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1954 // small. This requires us to add 4 to the global variable offset in order to 1955 // compute the correct address. 1956 1957 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1958 1959 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1960 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1961 1962 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1963 .addDef(PCReg); 1964 1965 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1966 if (GAFlags == SIInstrInfo::MO_NONE) 1967 MIB.addImm(0); 1968 else 1969 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1970 1971 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1972 1973 if (PtrTy.getSizeInBits() == 32) 1974 B.buildExtract(DstReg, PCReg, 0); 1975 return true; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, 1980 MachineIRBuilder &B) const { 1981 Register DstReg = MI.getOperand(0).getReg(); 1982 LLT Ty = MRI.getType(DstReg); 1983 unsigned AS = Ty.getAddressSpace(); 1984 1985 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1986 MachineFunction &MF = B.getMF(); 1987 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1988 B.setInstr(MI); 1989 1990 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1991 if (!MFI->isEntryFunction()) { 1992 const Function &Fn = MF.getFunction(); 1993 DiagnosticInfoUnsupported BadLDSDecl( 1994 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1995 DS_Warning); 1996 Fn.getContext().diagnose(BadLDSDecl); 1997 1998 // We currently don't have a way to correctly allocate LDS objects that 1999 // aren't directly associated with a kernel. We do force inlining of 2000 // functions that use local objects. However, if these dead functions are 2001 // not eliminated, we don't want a compile time error. Just emit a warning 2002 // and a trap, since there should be no callable path here. 2003 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2004 B.buildUndef(DstReg); 2005 MI.eraseFromParent(); 2006 return true; 2007 } 2008 2009 // TODO: We could emit code to handle the initialization somewhere. 
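// LDS globals are not backed by memory initialization: a global with no
// (or an undef) initializer is simply assigned a constant offset into the
// kernel's LDS block below, while one with a real initializer is diagnosed
// as unsupported.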
2010 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2011 const SITargetLowering *TLI = ST.getTargetLowering();
2012 if (!TLI->shouldUseLDSConstAddress(GV)) {
2013 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2014 return true; // Leave in place.
2015 }
2016
2017 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2018 MI.eraseFromParent();
2019 return true;
2020 }
2021
2022 const Function &Fn = MF.getFunction();
2023 DiagnosticInfoUnsupported BadInit(
2024 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2025 Fn.getContext().diagnose(BadInit);
2026 return true;
2027 }
2028
2029 const SITargetLowering *TLI = ST.getTargetLowering();
2030
2031 if (TLI->shouldEmitFixup(GV)) {
2032 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2033 MI.eraseFromParent();
2034 return true;
2035 }
2036
2037 if (TLI->shouldEmitPCReloc(GV)) {
2038 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2039 MI.eraseFromParent();
2040 return true;
2041 }
2042
2043 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2044 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2045
2046 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2047 MachinePointerInfo::getGOT(MF),
2048 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2049 MachineMemOperand::MOInvariant,
2050 8 /*Size*/, Align(8));
2051
2052 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2053
2054 if (Ty.getSizeInBits() == 32) {
2055 // Truncate if this is a 32-bit constant address.
2056 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2057 B.buildExtract(DstReg, Load, 0);
2058 } else
2059 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2060
2061 MI.eraseFromParent();
2062 return true;
2063 }
2064
2065 bool AMDGPULegalizerInfo::legalizeLoad(
2066 MachineInstr &MI, MachineRegisterInfo &MRI,
2067 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2068 B.setInstr(MI);
2069 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2070 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2071 Observer.changingInstr(MI);
2072 MI.getOperand(1).setReg(Cast.getReg(0));
2073 Observer.changedInstr(MI);
2074 return true;
2075 }
2076
2077 bool AMDGPULegalizerInfo::legalizeFMad(
2078 MachineInstr &MI, MachineRegisterInfo &MRI,
2079 MachineIRBuilder &B) const {
2080 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2081 assert(Ty.isScalar());
2082
2083 MachineFunction &MF = B.getMF();
2084 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2085
2086 // TODO: Always legal with future ftz flag.
2087 // FIXME: Do we need to check just the output?
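// G_FMAD is only kept legal when the corresponding denormals are flushed,
// since the underlying mad/mac instructions flush denormal results. With
// denormals enabled the operation is expanded by the LegalizerHelper into a
// separate multiply and add instead.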
2088 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2089 return true; 2090 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2091 return true; 2092 2093 MachineIRBuilder HelperBuilder(MI); 2094 GISelObserverWrapper DummyObserver; 2095 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2096 HelperBuilder.setInstr(MI); 2097 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2098 } 2099 2100 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2101 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2102 Register DstReg = MI.getOperand(0).getReg(); 2103 Register PtrReg = MI.getOperand(1).getReg(); 2104 Register CmpVal = MI.getOperand(2).getReg(); 2105 Register NewVal = MI.getOperand(3).getReg(); 2106 2107 assert(SITargetLowering::isFlatGlobalAddrSpace( 2108 MRI.getType(PtrReg).getAddressSpace()) && 2109 "this should not have been custom lowered"); 2110 2111 LLT ValTy = MRI.getType(CmpVal); 2112 LLT VecTy = LLT::vector(2, ValTy); 2113 2114 B.setInstr(MI); 2115 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2116 2117 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2118 .addDef(DstReg) 2119 .addUse(PtrReg) 2120 .addUse(PackedVal) 2121 .setMemRefs(MI.memoperands()); 2122 2123 MI.eraseFromParent(); 2124 return true; 2125 } 2126 2127 bool AMDGPULegalizerInfo::legalizeFlog( 2128 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2129 Register Dst = MI.getOperand(0).getReg(); 2130 Register Src = MI.getOperand(1).getReg(); 2131 LLT Ty = B.getMRI()->getType(Dst); 2132 unsigned Flags = MI.getFlags(); 2133 B.setInstr(MI); 2134 2135 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2136 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2137 2138 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2139 MI.eraseFromParent(); 2140 return true; 2141 } 2142 2143 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2144 MachineIRBuilder &B) const { 2145 Register Dst = MI.getOperand(0).getReg(); 2146 Register Src = MI.getOperand(1).getReg(); 2147 unsigned Flags = MI.getFlags(); 2148 LLT Ty = B.getMRI()->getType(Dst); 2149 B.setInstr(MI); 2150 2151 auto K = B.buildFConstant(Ty, numbers::log2e); 2152 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2153 B.buildFExp2(Dst, Mul, Flags); 2154 MI.eraseFromParent(); 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2159 MachineIRBuilder &B) const { 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register Src0 = MI.getOperand(1).getReg(); 2162 Register Src1 = MI.getOperand(2).getReg(); 2163 unsigned Flags = MI.getFlags(); 2164 LLT Ty = B.getMRI()->getType(Dst); 2165 B.setInstr(MI); 2166 const LLT S16 = LLT::scalar(16); 2167 const LLT S32 = LLT::scalar(32); 2168 2169 if (Ty == S32) { 2170 auto Log = B.buildFLog2(S32, Src0, Flags); 2171 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2172 .addUse(Log.getReg(0)) 2173 .addUse(Src1) 2174 .setMIFlags(Flags); 2175 B.buildFExp2(Dst, Mul, Flags); 2176 } else if (Ty == S16) { 2177 // There's no f16 fmul_legacy, so we need to convert for it. 
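// Same pow(x, y) == exp2(y * log2(x)) expansion as the f32 path above, but
// the intermediate multiply has to be widened to f32 for fmul_legacy.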
2178 auto Log = B.buildFLog2(S16, Src0, Flags); 2179 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2180 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2181 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2182 .addUse(Ext0.getReg(0)) 2183 .addUse(Ext1.getReg(0)) 2184 .setMIFlags(Flags); 2185 2186 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2187 } else 2188 return false; 2189 2190 MI.eraseFromParent(); 2191 return true; 2192 } 2193 2194 // Find a source register, ignoring any possible source modifiers. 2195 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2196 Register ModSrc = OrigSrc; 2197 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2198 ModSrc = SrcFNeg->getOperand(1).getReg(); 2199 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2200 ModSrc = SrcFAbs->getOperand(1).getReg(); 2201 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2202 ModSrc = SrcFAbs->getOperand(1).getReg(); 2203 return ModSrc; 2204 } 2205 2206 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2207 MachineRegisterInfo &MRI, 2208 MachineIRBuilder &B) const { 2209 B.setInstr(MI); 2210 2211 const LLT S1 = LLT::scalar(1); 2212 const LLT S64 = LLT::scalar(64); 2213 Register Dst = MI.getOperand(0).getReg(); 2214 Register OrigSrc = MI.getOperand(1).getReg(); 2215 unsigned Flags = MI.getFlags(); 2216 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2217 "this should not have been custom lowered"); 2218 2219 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2220 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2221 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2222 // V_FRACT bug is: 2223 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2224 // 2225 // Convert floor(x) to (x - fract(x)) 2226 2227 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2228 .addUse(OrigSrc) 2229 .setMIFlags(Flags); 2230 2231 // Give source modifier matching some assistance before obscuring a foldable 2232 // pattern. 2233 2234 // TODO: We can avoid the neg on the fract? The input sign to fract 2235 // shouldn't matter? 2236 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2237 2238 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2239 2240 Register Min = MRI.createGenericVirtualRegister(S64); 2241 2242 // We don't need to concern ourselves with the snan handling difference, so 2243 // use the one which will directly select. 2244 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2245 if (MFI->getMode().IEEE) 2246 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2247 else 2248 B.buildFMinNum(Min, Fract, Const, Flags); 2249 2250 Register CorrectedFract = Min; 2251 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2252 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2253 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2254 } 2255 2256 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2257 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2258 2259 MI.eraseFromParent(); 2260 return true; 2261 } 2262 2263 // Turn an illegal packed v2s16 build vector into bit operations. 2264 // TODO: This should probably be a bitcast action in LegalizerHelper. 
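// A sketch of the transform this performs (MIR shown for illustration only):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// becomes
//   %merge:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %merge:_(s32)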
2265 bool AMDGPULegalizerInfo::legalizeBuildVector( 2266 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2267 Register Dst = MI.getOperand(0).getReg(); 2268 const LLT S32 = LLT::scalar(32); 2269 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2270 2271 Register Src0 = MI.getOperand(1).getReg(); 2272 Register Src1 = MI.getOperand(2).getReg(); 2273 assert(MRI.getType(Src0) == LLT::scalar(16)); 2274 2275 B.setInstr(MI); 2276 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2277 B.buildBitcast(Dst, Merge); 2278 2279 MI.eraseFromParent(); 2280 return true; 2281 } 2282 2283 // Return the use branch instruction, otherwise null if the usage is invalid. 2284 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2285 MachineRegisterInfo &MRI, 2286 MachineInstr *&Br, 2287 MachineBasicBlock *&UncondBrTarget) { 2288 Register CondDef = MI.getOperand(0).getReg(); 2289 if (!MRI.hasOneNonDBGUse(CondDef)) 2290 return nullptr; 2291 2292 MachineBasicBlock *Parent = MI.getParent(); 2293 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2294 if (UseMI.getParent() != Parent || 2295 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2296 return nullptr; 2297 2298 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2299 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2300 if (Next == Parent->end()) { 2301 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2302 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2303 return nullptr; 2304 UncondBrTarget = &*NextMBB; 2305 } else { 2306 if (Next->getOpcode() != AMDGPU::G_BR) 2307 return nullptr; 2308 Br = &*Next; 2309 UncondBrTarget = Br->getOperand(0).getMBB(); 2310 } 2311 2312 return &UseMI; 2313 } 2314 2315 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2316 MachineRegisterInfo &MRI, 2317 Register LiveIn, 2318 Register PhyReg) const { 2319 assert(PhyReg.isPhysical() && "Physical register expected"); 2320 2321 // Insert the live-in copy, if required, by defining destination virtual 2322 // register. 2323 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2324 if (!MRI.getVRegDef(LiveIn)) {
2325 // FIXME: Should have scoped insert pt
2326 MachineBasicBlock &OrigInsBB = B.getMBB();
2327 auto OrigInsPt = B.getInsertPt();
2328
2329 MachineBasicBlock &EntryMBB = B.getMF().front();
2330 EntryMBB.addLiveIn(PhyReg);
2331 B.setInsertPt(EntryMBB, EntryMBB.begin());
2332 B.buildCopy(LiveIn, PhyReg);
2333
2334 B.setInsertPt(OrigInsBB, OrigInsPt);
2335 }
2336
2337 return LiveIn;
2338 }
2339
2340 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2341 MachineRegisterInfo &MRI,
2342 Register PhyReg, LLT Ty,
2343 bool InsertLiveInCopy) const {
2344 assert(PhyReg.isPhysical() && "Physical register expected");
2345
2346 // Get or create virtual live-in register
2347 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2348 if (!LiveIn) {
2349 LiveIn = MRI.createGenericVirtualRegister(Ty);
2350 MRI.addLiveIn(PhyReg, LiveIn);
2351 }
2352
2353 // When the copy that is actually required goes from a virtual register to a
2354 // physical register (and will be inserted later), the live-in copy from the
2355 // physical register to a virtual register is not needed here.
2356 if (!InsertLiveInCopy)
2357 return LiveIn;
2358
2359 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2360 }
2361
2362 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2363 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2364 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2365 const ArgDescriptor *Arg;
2366 const TargetRegisterClass *RC;
2367 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2368 if (!Arg) {
2369 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2370 return nullptr;
2371 }
2372 return Arg;
2373 }
2374
2375 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2376 const ArgDescriptor *Arg) const {
2377 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2378 return false; // TODO: Handle these
2379
2380 Register SrcReg = Arg->getRegister();
2381 assert(SrcReg.isPhysical() && "Physical register expected");
2382 assert(DstReg.isVirtual() && "Virtual register expected");
2383
2384 MachineRegisterInfo &MRI = *B.getMRI();
2385
2386 LLT Ty = MRI.getType(DstReg);
2387 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2388
2389 if (Arg->isMasked()) {
2390 // TODO: Should we try to emit this once in the entry block?
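// Masked arguments pack several values into one input register (for example
// the workitem IDs); shift the field down to bit 0 and mask away the rest.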
2391 const LLT S32 = LLT::scalar(32); 2392 const unsigned Mask = Arg->getMask(); 2393 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2394 2395 Register AndMaskSrc = LiveIn; 2396 2397 if (Shift != 0) { 2398 auto ShiftAmt = B.buildConstant(S32, Shift); 2399 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2400 } 2401 2402 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2403 } else { 2404 B.buildCopy(DstReg, LiveIn); 2405 } 2406 2407 return true; 2408 } 2409 2410 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2411 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2412 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2413 B.setInstr(MI); 2414 2415 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2416 if (!Arg) 2417 return false; 2418 2419 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2420 return false; 2421 2422 MI.eraseFromParent(); 2423 return true; 2424 } 2425 2426 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2427 MachineRegisterInfo &MRI, 2428 MachineIRBuilder &B) const { 2429 B.setInstr(MI); 2430 Register Dst = MI.getOperand(0).getReg(); 2431 LLT DstTy = MRI.getType(Dst); 2432 LLT S16 = LLT::scalar(16); 2433 LLT S32 = LLT::scalar(32); 2434 LLT S64 = LLT::scalar(64); 2435 2436 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2437 return true; 2438 2439 if (DstTy == S16) 2440 return legalizeFDIV16(MI, MRI, B); 2441 if (DstTy == S32) 2442 return legalizeFDIV32(MI, MRI, B); 2443 if (DstTy == S64) 2444 return legalizeFDIV64(MI, MRI, B); 2445 2446 return false; 2447 } 2448 2449 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2450 const LLT S32 = LLT::scalar(32); 2451 2452 auto Cvt0 = B.buildUITOFP(S32, Src); 2453 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2454 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2455 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2456 return B.buildFPTOUI(S32, Mul).getReg(0); 2457 } 2458 2459 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2460 Register DstReg, 2461 Register Num, 2462 Register Den, 2463 bool IsRem) const { 2464 const LLT S1 = LLT::scalar(1); 2465 const LLT S32 = LLT::scalar(32); 2466 2467 // RCP = URECIP(Den) = 2^32 / Den + e 2468 // e is rounding error. 2469 auto RCP = buildDivRCP(B, Den); 2470 2471 // RCP_LO = mul(RCP, Den) 2472 auto RCP_LO = B.buildMul(S32, RCP, Den); 2473 2474 // RCP_HI = mulhu (RCP, Den) */ 2475 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2476 2477 // NEG_RCP_LO = -RCP_LO 2478 auto Zero = B.buildConstant(S32, 0); 2479 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2480 2481 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2482 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2483 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2484 2485 // Calculate the rounding error from the URECIP instruction 2486 // E = mulhu(ABS_RCP_LO, RCP) 2487 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2488 2489 // RCP_A_E = RCP + E 2490 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2491 2492 // RCP_S_E = RCP - E 2493 auto RCP_S_E = B.buildSub(S32, RCP, E); 2494 2495 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2496 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2497
2498 // Quotient = mulhu(Tmp0, Num)
2499 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2500
2501 // Num_S_Remainder = Quotient * Den
2502 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2503
2504 // Remainder = Num - Num_S_Remainder
2505 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2506
2507 // Remainder_GE_Den = Remainder >= Den
2508 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2509
2510 // Remainder_GE_Zero = Num >= Num_S_Remainder
2511 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2512 Num, Num_S_Remainder);
2513
2514 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2515 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2516
2517 // Calculate Division result:
2518
2519 // Quotient_A_One = Quotient + 1
2520 auto One = B.buildConstant(S32, 1);
2521 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2522
2523 // Quotient_S_One = Quotient - 1
2524 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2525
2526 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2527 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2528
2529 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2530 if (IsRem) {
2531 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2532
2533 // Calculate Rem result:
2534 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2535
2536 // Remainder_A_Den = Remainder + Den
2537 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2538
2539 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2540 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2541
2542 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2543 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2544 } else {
2545 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2546 }
2547 }
2548
2549 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2550 MachineRegisterInfo &MRI,
2551 MachineIRBuilder &B) const {
2552 B.setInstr(MI);
2553 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2554 Register DstReg = MI.getOperand(0).getReg();
2555 Register Num = MI.getOperand(1).getReg();
2556 Register Den = MI.getOperand(2).getReg();
2557 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2558 MI.eraseFromParent();
2559 return true;
2560 }
2561
2562 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2563 //
2564 // Return lo, hi of result
2565 //
2566 // %cvt.lo = G_UITOFP Val.lo
2567 // %cvt.hi = G_UITOFP Val.hi
2568 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2569 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2570 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2571 // %mul2 = G_FMUL %mul1, 2**(-32)
2572 // %trunc = G_INTRINSIC_TRUNC %mul2
2573 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2574 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2575 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2576 Register Val) {
2577 const LLT S32 = LLT::scalar(32);
2578 auto Unmerge = B.buildUnmerge(S32, Val);
2579
2580 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2581 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2582
2583 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2584 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2585
2586 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2587 auto Mul1 =
2588 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2589
2590 // 2**(-32)
2591 auto Mul2 =
2592 B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2593 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2594 2595 // -(2**32) 2596 auto Mad2 = B.buildFMAD(S32, Trunc, 2597 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2598 2599 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2600 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2601 2602 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2603 } 2604 2605 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2606 MachineRegisterInfo &MRI, 2607 MachineIRBuilder &B) const { 2608 B.setInstr(MI); 2609 2610 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2611 const LLT S32 = LLT::scalar(32); 2612 const LLT S64 = LLT::scalar(64); 2613 const LLT S1 = LLT::scalar(1); 2614 Register Numer = MI.getOperand(1).getReg(); 2615 Register Denom = MI.getOperand(2).getReg(); 2616 Register RcpLo, RcpHi; 2617 2618 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2619 2620 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2621 2622 auto Zero64 = B.buildConstant(S64, 0); 2623 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2624 2625 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2626 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2627 2628 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2629 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2630 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2631 2632 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2633 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2634 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2635 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2636 2637 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2638 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2639 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2640 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2641 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2642 2643 auto Zero32 = B.buildConstant(S32, 0); 2644 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2645 auto Add2_HiC = 2646 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2647 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2648 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2649 2650 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2651 Register NumerLo = UnmergeNumer.getReg(0); 2652 Register NumerHi = UnmergeNumer.getReg(1); 2653 2654 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2655 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2656 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2657 Register Mul3_Lo = UnmergeMul3.getReg(0); 2658 Register Mul3_Hi = UnmergeMul3.getReg(1); 2659 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2660 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2661 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2662 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2663 2664 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2665 Register DenomLo = UnmergeDenom.getReg(0); 2666 Register DenomHi = UnmergeDenom.getReg(1); 2667 2668 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2669 auto C1 = B.buildSExt(S32, CmpHi); 2670 2671 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2672 auto C2 = B.buildSExt(S32, CmpLo); 2673 2674 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2675 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2676 2677 // TODO: Here and below portions of the code can be enclosed into if/endif. 
2678 // Currently control flow is unconditional and we have 4 selects after the
2679 // potential endif to substitute PHIs.
2680
2681 // if C3 != 0 ...
2682 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2683 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2684 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2685 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2686
2687 auto One64 = B.buildConstant(S64, 1);
2688 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2689
2690 auto C4 =
2691 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2692 auto C5 =
2693 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2694 auto C6 = B.buildSelect(
2695 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2696
2697 // if (C6 != 0)
2698 auto Add4 = B.buildAdd(S64, Add3, One64);
2699 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2700
2701 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2702 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2703 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2704
2705 // endif C6
2706 // endif C3
2707
2708 if (IsDiv) {
2709 auto Sel1 = B.buildSelect(
2710 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2711 B.buildSelect(MI.getOperand(0),
2712 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2713 } else {
2714 auto Sel2 = B.buildSelect(
2715 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2716 B.buildSelect(MI.getOperand(0),
2717 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2718 }
2719
2720 MI.eraseFromParent();
2721 return true;
2722 }
2723
2724 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2725 MachineRegisterInfo &MRI,
2726 MachineIRBuilder &B) const {
2727 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2728 if (Ty == LLT::scalar(32))
2729 return legalizeUDIV_UREM32(MI, MRI, B);
2730 if (Ty == LLT::scalar(64))
2731 return legalizeUDIV_UREM64(MI, MRI, B);
2732 return false;
2733 }
2734
2735 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2736 MachineRegisterInfo &MRI,
2737 MachineIRBuilder &B) const {
2738 B.setInstr(MI);
2739 const LLT S32 = LLT::scalar(32);
2740
2741 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2742 Register DstReg = MI.getOperand(0).getReg();
2743 Register LHS = MI.getOperand(1).getReg();
2744 Register RHS = MI.getOperand(2).getReg();
2745
2746 auto ThirtyOne = B.buildConstant(S32, 31);
2747 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2748 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2749
2750 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2751 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2752
2753 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2754 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2755
2756 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2757 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2758
2759 if (IsRem) {
2760 auto RSign = LHSign; // Remainder sign is the same as LHS
2761 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2762 B.buildSub(DstReg, UDivRem, RSign);
2763 } else {
2764 auto DSign = B.buildXor(S32, LHSign, RHSign);
2765 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2766 B.buildSub(DstReg, UDivRem, DSign);
2767 }
2768
2769 MI.eraseFromParent();
2770 return true;
2771 }
2772
2773 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2774 MachineRegisterInfo &MRI,
2775 MachineIRBuilder &B) const {
2776 if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2777 return legalizeSDIV_SREM32(MI, MRI, B); 2778 return false; 2779 } 2780 2781 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2782 MachineRegisterInfo &MRI, 2783 MachineIRBuilder &B) const { 2784 Register Res = MI.getOperand(0).getReg(); 2785 Register LHS = MI.getOperand(1).getReg(); 2786 Register RHS = MI.getOperand(2).getReg(); 2787 2788 uint16_t Flags = MI.getFlags(); 2789 2790 LLT ResTy = MRI.getType(Res); 2791 LLT S32 = LLT::scalar(32); 2792 LLT S64 = LLT::scalar(64); 2793 2794 const MachineFunction &MF = B.getMF(); 2795 bool Unsafe = 2796 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2797 2798 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2799 return false; 2800 2801 if (!Unsafe && ResTy == S32 && 2802 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2803 return false; 2804 2805 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2806 // 1 / x -> RCP(x) 2807 if (CLHS->isExactlyValue(1.0)) { 2808 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2809 .addUse(RHS) 2810 .setMIFlags(Flags); 2811 2812 MI.eraseFromParent(); 2813 return true; 2814 } 2815 2816 // -1 / x -> RCP( FNEG(x) ) 2817 if (CLHS->isExactlyValue(-1.0)) { 2818 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2819 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2820 .addUse(FNeg.getReg(0)) 2821 .setMIFlags(Flags); 2822 2823 MI.eraseFromParent(); 2824 return true; 2825 } 2826 } 2827 2828 // x / y -> x * (1.0 / y) 2829 if (Unsafe) { 2830 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2831 .addUse(RHS) 2832 .setMIFlags(Flags); 2833 B.buildFMul(Res, LHS, RCP, Flags); 2834 2835 MI.eraseFromParent(); 2836 return true; 2837 } 2838 2839 return false; 2840 } 2841 2842 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2843 MachineRegisterInfo &MRI, 2844 MachineIRBuilder &B) const { 2845 B.setInstr(MI); 2846 Register Res = MI.getOperand(0).getReg(); 2847 Register LHS = MI.getOperand(1).getReg(); 2848 Register RHS = MI.getOperand(2).getReg(); 2849 2850 uint16_t Flags = MI.getFlags(); 2851 2852 LLT S16 = LLT::scalar(16); 2853 LLT S32 = LLT::scalar(32); 2854 2855 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2856 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2857 2858 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2859 .addUse(RHSExt.getReg(0)) 2860 .setMIFlags(Flags); 2861 2862 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2863 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2864 2865 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2866 .addUse(RDst.getReg(0)) 2867 .addUse(RHS) 2868 .addUse(LHS) 2869 .setMIFlags(Flags); 2870 2871 MI.eraseFromParent(); 2872 return true; 2873 } 2874 2875 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2876 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2877 static void toggleSPDenormMode(bool Enable, 2878 MachineIRBuilder &B, 2879 const GCNSubtarget &ST, 2880 AMDGPU::SIModeRegisterDefaults Mode) { 2881 // Set SP denorm mode to this value. 2882 unsigned SPDenormMode = 2883 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2884 2885 if (ST.hasDenormModeInst()) { 2886 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
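// In the value given to S_DENORM_MODE the FP32 control occupies bits [1:0]
// and the shared FP64/FP16 control occupies bits [3:2], hence the shift by 2
// below.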
2887 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2888 2889 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2890 B.buildInstr(AMDGPU::S_DENORM_MODE) 2891 .addImm(NewDenormModeValue); 2892 2893 } else { 2894 // Select FP32 bit field in mode register. 2895 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2896 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2897 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2898 2899 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2900 .addImm(SPDenormMode) 2901 .addImm(SPDenormModeBitField); 2902 } 2903 } 2904 2905 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2906 MachineRegisterInfo &MRI, 2907 MachineIRBuilder &B) const { 2908 B.setInstr(MI); 2909 Register Res = MI.getOperand(0).getReg(); 2910 Register LHS = MI.getOperand(1).getReg(); 2911 Register RHS = MI.getOperand(2).getReg(); 2912 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2913 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2914 2915 uint16_t Flags = MI.getFlags(); 2916 2917 LLT S32 = LLT::scalar(32); 2918 LLT S1 = LLT::scalar(1); 2919 2920 auto One = B.buildFConstant(S32, 1.0f); 2921 2922 auto DenominatorScaled = 2923 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2924 .addUse(LHS) 2925 .addUse(RHS) 2926 .addImm(0) 2927 .setMIFlags(Flags); 2928 auto NumeratorScaled = 2929 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2930 .addUse(LHS) 2931 .addUse(RHS) 2932 .addImm(1) 2933 .setMIFlags(Flags); 2934 2935 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2936 .addUse(DenominatorScaled.getReg(0)) 2937 .setMIFlags(Flags); 2938 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2939 2940 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2941 // aren't modeled as reading it. 
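// The code below is the usual scaled reciprocal refinement: div_scale
// pre-conditions the operands, rcp supplies an approximate 1/RHS, a chain of
// FMAs performs Newton-Raphson style refinement of the quotient, and
// div_fmas/div_fixup undo the scaling and handle the special cases.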
2942 if (!Mode.allFP32Denormals()) 2943 toggleSPDenormMode(true, B, ST, Mode); 2944 2945 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2946 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2947 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2948 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2949 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2950 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2951 2952 if (!Mode.allFP32Denormals()) 2953 toggleSPDenormMode(false, B, ST, Mode); 2954 2955 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2956 .addUse(Fma4.getReg(0)) 2957 .addUse(Fma1.getReg(0)) 2958 .addUse(Fma3.getReg(0)) 2959 .addUse(NumeratorScaled.getReg(1)) 2960 .setMIFlags(Flags); 2961 2962 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2963 .addUse(Fmas.getReg(0)) 2964 .addUse(RHS) 2965 .addUse(LHS) 2966 .setMIFlags(Flags); 2967 2968 MI.eraseFromParent(); 2969 return true; 2970 } 2971 2972 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2973 MachineRegisterInfo &MRI, 2974 MachineIRBuilder &B) const { 2975 B.setInstr(MI); 2976 Register Res = MI.getOperand(0).getReg(); 2977 Register LHS = MI.getOperand(1).getReg(); 2978 Register RHS = MI.getOperand(2).getReg(); 2979 2980 uint16_t Flags = MI.getFlags(); 2981 2982 LLT S64 = LLT::scalar(64); 2983 LLT S1 = LLT::scalar(1); 2984 2985 auto One = B.buildFConstant(S64, 1.0); 2986 2987 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2988 .addUse(LHS) 2989 .addUse(RHS) 2990 .addImm(0) 2991 .setMIFlags(Flags); 2992 2993 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2994 2995 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2996 .addUse(DivScale0.getReg(0)) 2997 .setMIFlags(Flags); 2998 2999 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3000 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3001 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3002 3003 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3004 .addUse(LHS) 3005 .addUse(RHS) 3006 .addImm(1) 3007 .setMIFlags(Flags); 3008 3009 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3010 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3011 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3012 3013 Register Scale; 3014 if (!ST.hasUsableDivScaleConditionOutput()) { 3015 // Workaround a hardware bug on SI where the condition output from div_scale 3016 // is not usable. 
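// Instead, recover the flag by comparing the high 32 bits of the original
// operands against the corresponding div_scale results and combining the
// two tests.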
3017 3018 LLT S32 = LLT::scalar(32); 3019 3020 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3021 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3022 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3023 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3024 3025 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3026 Scale1Unmerge.getReg(1)); 3027 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3028 Scale0Unmerge.getReg(1)); 3029 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3030 } else { 3031 Scale = DivScale1.getReg(1); 3032 } 3033 3034 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3035 .addUse(Fma4.getReg(0)) 3036 .addUse(Fma3.getReg(0)) 3037 .addUse(Mul.getReg(0)) 3038 .addUse(Scale) 3039 .setMIFlags(Flags); 3040 3041 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3042 .addUse(Fmas.getReg(0)) 3043 .addUse(RHS) 3044 .addUse(LHS) 3045 .setMIFlags(Flags); 3046 3047 MI.eraseFromParent(); 3048 return true; 3049 } 3050 3051 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3052 MachineRegisterInfo &MRI, 3053 MachineIRBuilder &B) const { 3054 B.setInstr(MI); 3055 Register Res = MI.getOperand(0).getReg(); 3056 Register LHS = MI.getOperand(2).getReg(); 3057 Register RHS = MI.getOperand(3).getReg(); 3058 uint16_t Flags = MI.getFlags(); 3059 3060 LLT S32 = LLT::scalar(32); 3061 LLT S1 = LLT::scalar(1); 3062 3063 auto Abs = B.buildFAbs(S32, RHS, Flags); 3064 const APFloat C0Val(1.0f); 3065 3066 auto C0 = B.buildConstant(S32, 0x6f800000); 3067 auto C1 = B.buildConstant(S32, 0x2f800000); 3068 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3069 3070 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3071 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3072 3073 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3074 3075 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3076 .addUse(Mul0.getReg(0)) 3077 .setMIFlags(Flags); 3078 3079 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3080 3081 B.buildFMul(Res, Sel, Mul1, Flags); 3082 3083 MI.eraseFromParent(); 3084 return true; 3085 } 3086 3087 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3088 MachineRegisterInfo &MRI, 3089 MachineIRBuilder &B) const { 3090 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3091 if (!MFI->isEntryFunction()) { 3092 return legalizePreloadedArgIntrin(MI, MRI, B, 3093 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3094 } 3095 3096 B.setInstr(MI); 3097 3098 uint64_t Offset = 3099 ST.getTargetLowering()->getImplicitParameterOffset( 3100 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3101 Register DstReg = MI.getOperand(0).getReg(); 3102 LLT DstTy = MRI.getType(DstReg); 3103 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3104 3105 const ArgDescriptor *Arg; 3106 const TargetRegisterClass *RC; 3107 std::tie(Arg, RC) 3108 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3109 if (!Arg) 3110 return false; 3111 3112 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3113 if (!loadInputValue(KernargPtrReg, B, Arg)) 3114 return false; 3115 3116 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3117 MI.eraseFromParent(); 3118 return true; 3119 } 3120 3121 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3122 MachineRegisterInfo &MRI, 3123 MachineIRBuilder &B, 3124 unsigned AddrSpace) const { 3125 B.setInstr(MI); 3126 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3127 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3128 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3129 MI.eraseFromParent(); 3130 return true; 3131 } 3132 3133 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3134 // offset (the offset that is included in bounds checking and swizzling, to be 3135 // split between the instruction's voffset and immoffset fields) and soffset 3136 // (the offset that is excluded from bounds checking and swizzling, to go in 3137 // the instruction's soffset field). This function takes the first kind of 3138 // offset and figures out how to split it between voffset and immoffset. 3139 std::tuple<Register, unsigned, unsigned> 3140 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3141 Register OrigOffset) const { 3142 const unsigned MaxImm = 4095; 3143 Register BaseReg; 3144 unsigned TotalConstOffset; 3145 MachineInstr *OffsetDef; 3146 const LLT S32 = LLT::scalar(32); 3147 3148 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3149 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3150 3151 unsigned ImmOffset = TotalConstOffset; 3152 3153 // If the immediate value is too big for the immoffset field, put the value 3154 // and -4096 into the immoffset field so that the value that is copied/added 3155 // for the voffset field is a multiple of 4096, and it stands more chance 3156 // of being CSEd with the copy/add for another similar load/store. 3157 // However, do not do that rounding down to a multiple of 4096 if that is a 3158 // negative number, as it appears to be illegal to have a negative offset 3159 // in the vgpr, even if adding the immediate offset makes it positive. 3160 unsigned Overflow = ImmOffset & ~MaxImm; 3161 ImmOffset -= Overflow; 3162 if ((int32_t)Overflow < 0) { 3163 Overflow += ImmOffset; 3164 ImmOffset = 0; 3165 } 3166 3167 if (Overflow != 0) { 3168 if (!BaseReg) { 3169 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3170 } else { 3171 auto OverflowVal = B.buildConstant(S32, Overflow); 3172 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3173 } 3174 } 3175 3176 if (!BaseReg) 3177 BaseReg = B.buildConstant(S32, 0).getReg(0); 3178 3179 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3180 } 3181 3182 /// Handle register layout difference for f16 images for some subtargets. 3183 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3184 MachineRegisterInfo &MRI, 3185 Register Reg) const { 3186 if (!ST.hasUnpackedD16VMem()) 3187 return Reg; 3188 3189 const LLT S16 = LLT::scalar(16); 3190 const LLT S32 = LLT::scalar(32); 3191 LLT StoreVT = MRI.getType(Reg); 3192 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3193 3194 auto Unmerge = B.buildUnmerge(S16, Reg); 3195 3196 SmallVector<Register, 4> WideRegs; 3197 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3198 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3199 3200 int NumElts = StoreVT.getNumElements(); 3201 3202 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3203 } 3204 3205 Register AMDGPULegalizerInfo::fixStoreSourceType( 3206 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3207 MachineRegisterInfo *MRI = B.getMRI(); 3208 LLT Ty = MRI->getType(VData); 3209 3210 const LLT S16 = LLT::scalar(16); 3211 3212 // Fixup illegal register types for i8 stores. 
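// Widen sub-dword sources with an anyext; the byte/short buffer store
// variants only write the low bits, so the extended bits are don't-care.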
3213 if (Ty == LLT::scalar(8) || Ty == S16) { 3214 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3215 return AnyExt; 3216 } 3217 3218 if (Ty.isVector()) { 3219 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3220 if (IsFormat) 3221 return handleD16VData(B, *MRI, VData); 3222 } 3223 } 3224 3225 return VData; 3226 } 3227 3228 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3229 MachineRegisterInfo &MRI, 3230 MachineIRBuilder &B, 3231 bool IsTyped, 3232 bool IsFormat) const { 3233 B.setInstr(MI); 3234 3235 Register VData = MI.getOperand(1).getReg(); 3236 LLT Ty = MRI.getType(VData); 3237 LLT EltTy = Ty.getScalarType(); 3238 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3239 const LLT S32 = LLT::scalar(32); 3240 3241 VData = fixStoreSourceType(B, VData, IsFormat); 3242 Register RSrc = MI.getOperand(2).getReg(); 3243 3244 MachineMemOperand *MMO = *MI.memoperands_begin(); 3245 const int MemSize = MMO->getSize(); 3246 3247 unsigned ImmOffset; 3248 unsigned TotalOffset; 3249 3250 // The typed intrinsics add an immediate after the registers. 3251 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3252 3253 // The struct intrinsic variants add one additional operand over raw. 3254 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3255 Register VIndex; 3256 int OpOffset = 0; 3257 if (HasVIndex) { 3258 VIndex = MI.getOperand(3).getReg(); 3259 OpOffset = 1; 3260 } 3261 3262 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3263 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3264 3265 unsigned Format = 0; 3266 if (IsTyped) { 3267 Format = MI.getOperand(5 + OpOffset).getImm(); 3268 ++OpOffset; 3269 } 3270 3271 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3272 3273 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3274 if (TotalOffset != 0) 3275 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3276 3277 unsigned Opc; 3278 if (IsTyped) { 3279 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3280 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3281 } else if (IsFormat) { 3282 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3283 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3284 } else { 3285 switch (MemSize) { 3286 case 1: 3287 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3288 break; 3289 case 2: 3290 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3291 break; 3292 default: 3293 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3294 break; 3295 } 3296 } 3297 3298 if (!VIndex) 3299 VIndex = B.buildConstant(S32, 0).getReg(0); 3300 3301 auto MIB = B.buildInstr(Opc) 3302 .addUse(VData) // vdata 3303 .addUse(RSrc) // rsrc 3304 .addUse(VIndex) // vindex 3305 .addUse(VOffset) // voffset 3306 .addUse(SOffset) // soffset 3307 .addImm(ImmOffset); // offset(imm) 3308 3309 if (IsTyped) 3310 MIB.addImm(Format); 3311 3312 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3313 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3314 .addMemOperand(MMO); 3315 3316 MI.eraseFromParent(); 3317 return true; 3318 } 3319 3320 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3321 MachineRegisterInfo &MRI, 3322 MachineIRBuilder &B, 3323 bool IsFormat, 3324 bool IsTyped) const { 3325 B.setInstr(MI); 3326 3327 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3328 MachineMemOperand *MMO = *MI.memoperands_begin(); 3329 const int MemSize = MMO->getSize(); 3330 const LLT S32 = LLT::scalar(32); 3331 3332 Register Dst = MI.getOperand(0).getReg(); 3333 Register RSrc = MI.getOperand(2).getReg(); 3334 3335 // The typed intrinsics add an immediate after the registers. 3336 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3337 3338 // The struct intrinsic variants add one additional operand over raw. 3339 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3340 Register VIndex; 3341 int OpOffset = 0; 3342 if (HasVIndex) { 3343 VIndex = MI.getOperand(3).getReg(); 3344 OpOffset = 1; 3345 } 3346 3347 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3348 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3349 3350 unsigned Format = 0; 3351 if (IsTyped) { 3352 Format = MI.getOperand(5 + OpOffset).getImm(); 3353 ++OpOffset; 3354 } 3355 3356 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3357 unsigned ImmOffset; 3358 unsigned TotalOffset; 3359 3360 LLT Ty = MRI.getType(Dst); 3361 LLT EltTy = Ty.getScalarType(); 3362 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3363 const bool Unpacked = ST.hasUnpackedD16VMem(); 3364 3365 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3366 if (TotalOffset != 0) 3367 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3368 3369 unsigned Opc; 3370 3371 if (IsTyped) { 3372 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3373 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3374 } else if (IsFormat) { 3375 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3376 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3377 } else { 3378 switch (MemSize) { 3379 case 1: 3380 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3381 break; 3382 case 2: 3383 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3384 break; 3385 default: 3386 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3387 break; 3388 } 3389 } 3390 3391 Register LoadDstReg; 3392 3393 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3394 LLT UnpackedTy = Ty.changeElementSize(32); 3395 3396 if (IsExtLoad) 3397 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3398 else if (Unpacked && IsD16 && Ty.isVector()) 3399 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3400 else 3401 LoadDstReg = Dst; 3402 3403 if (!VIndex) 3404 VIndex = B.buildConstant(S32, 0).getReg(0); 3405 3406 auto MIB = B.buildInstr(Opc) 3407 .addDef(LoadDstReg) // vdata 3408 .addUse(RSrc) // rsrc 3409 .addUse(VIndex) // vindex 3410 .addUse(VOffset) // voffset 3411 .addUse(SOffset) // soffset 3412 .addImm(ImmOffset); // offset(imm) 3413 3414 if (IsTyped) 3415 MIB.addImm(Format); 3416 3417 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3418 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3419 .addMemOperand(MMO); 3420 3421 if (LoadDstReg != Dst) { 3422 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3423 3424 // Widen result for extending loads was widened. 
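    // A plain extending load only needs a truncate back to the original
    // narrow type. For unpacked D16 vector results, each 32-bit element holds
    // one 16-bit component, so unmerge, truncate each piece, and re-merge
    // instead.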
3425 if (IsExtLoad) 3426 B.buildTrunc(Dst, LoadDstReg); 3427 else { 3428 // Repack to original 16-bit vector result 3429 // FIXME: G_TRUNC should work, but legalization currently fails 3430 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3431 SmallVector<Register, 4> Repack; 3432 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3433 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3434 B.buildMerge(Dst, Repack); 3435 } 3436 } 3437 3438 MI.eraseFromParent(); 3439 return true; 3440 } 3441 3442 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3443 MachineIRBuilder &B, 3444 bool IsInc) const { 3445 B.setInstr(MI); 3446 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3447 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3448 B.buildInstr(Opc) 3449 .addDef(MI.getOperand(0).getReg()) 3450 .addUse(MI.getOperand(2).getReg()) 3451 .addUse(MI.getOperand(3).getReg()) 3452 .cloneMemRefs(MI); 3453 MI.eraseFromParent(); 3454 return true; 3455 } 3456 3457 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3458 switch (IntrID) { 3459 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3460 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3461 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3462 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3463 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3464 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3465 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3466 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3467 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3468 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3469 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3471 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3472 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3473 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3474 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3475 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3476 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3477 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3478 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3479 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3480 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3481 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3482 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3483 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3484 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3486 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3487 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3488 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3489 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3490 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3491 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3492 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3493 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3494 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3495 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3496 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3497 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3498 default: 3499 llvm_unreachable("unhandled atomic opcode"); 3500 } 3501 } 3502 3503 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3504 MachineIRBuilder &B, 3505 Intrinsic::ID IID) const { 3506 B.setInstr(MI); 3507 3508 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3509 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3510 3511 Register Dst = MI.getOperand(0).getReg(); 3512 Register VData = 
      MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn the set of s16 typed address registers of \p MI into dword sized
/// values, appending the packed v2s16 registers to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16-bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ?
    -1 : getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
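    //
    // In the packed layout each s32 piece produced below holds two 16-bit
    // components, so a bitcast to v2s16 recovers them; in the unpacked layout
    // each s32 piece holds a single component in its low bits, so a trunc to
    // s16 is sufficient.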
3974 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3975 for (Register &Reg : ResultRegs) 3976 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3977 } else if (ST.hasUnpackedD16VMem()) { 3978 for (Register &Reg : ResultRegs) 3979 Reg = B.buildTrunc(S16, Reg).getReg(0); 3980 } 3981 } 3982 3983 auto padWithUndef = [&](LLT Ty, int NumElts) { 3984 if (NumElts == 0) 3985 return; 3986 Register Undef = B.buildUndef(Ty).getReg(0); 3987 for (int I = 0; I != NumElts; ++I) 3988 ResultRegs.push_back(Undef); 3989 }; 3990 3991 // Pad out any elements eliminated due to the dmask. 3992 LLT ResTy = MRI->getType(ResultRegs[0]); 3993 if (!ResTy.isVector()) { 3994 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3995 B.buildBuildVector(DstReg, ResultRegs); 3996 return true; 3997 } 3998 3999 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4000 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4001 4002 // Deal with the one annoying legal case. 4003 const LLT V3S16 = LLT::vector(3, 16); 4004 if (Ty == V3S16) { 4005 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4006 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4007 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4008 return true; 4009 } 4010 4011 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4012 B.buildConcatVectors(DstReg, ResultRegs); 4013 return true; 4014 } 4015 4016 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4017 MachineInstr &MI, MachineIRBuilder &B, 4018 GISelChangeObserver &Observer) const { 4019 Register Dst = MI.getOperand(0).getReg(); 4020 LLT Ty = B.getMRI()->getType(Dst); 4021 unsigned Size = Ty.getSizeInBits(); 4022 MachineFunction &MF = B.getMF(); 4023 4024 Observer.changingInstr(MI); 4025 4026 // FIXME: We don't really need this intermediate instruction. The intrinsic 4027 // should be fixed to have a memory operand. Since it's readnone, we're not 4028 // allowed to add one. 4029 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4030 MI.RemoveOperand(1); // Remove intrinsic ID 4031 4032 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4033 // TODO: Should this use datalayout alignment? 4034 const unsigned MemSize = (Size + 7) / 8; 4035 const Align MemAlign(4); 4036 MachineMemOperand *MMO = MF.getMachineMemOperand( 4037 MachinePointerInfo(), 4038 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4039 MachineMemOperand::MOInvariant, 4040 MemSize, MemAlign); 4041 MI.addMemOperand(MF, MMO); 4042 4043 // There are no 96-bit result scalar loads, but widening to 128-bit should 4044 // always be legal. We may need to restore this to a 96-bit result if it turns 4045 // out this needs to be converted to a vector load during RegBankSelect. 4046 if (!isPowerOf2_32(Size)) { 4047 LegalizerHelper Helper(MF, *this, Observer, B); 4048 B.setInstr(MI); 4049 4050 if (Ty.isVector()) 4051 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4052 else 4053 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4054 } 4055 4056 Observer.changedInstr(MI); 4057 return true; 4058 } 4059 4060 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4061 MachineRegisterInfo &MRI, 4062 MachineIRBuilder &B) const { 4063 B.setInstr(MI); 4064 4065 // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction 4066 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4067 !ST.isTrapHandlerEnabled()) { 4068 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4069 } else { 4070 // Pass queue pointer to trap handler as input, and insert trap instruction 4071 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4072 const ArgDescriptor *Arg = 4073 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4074 if (!Arg) 4075 return false; 4076 MachineRegisterInfo &MRI = *B.getMRI(); 4077 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4078 Register LiveIn = getLiveInRegister( 4079 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4080 /*InsertLiveInCopy=*/false); 4081 if (!loadInputValue(LiveIn, B, Arg)) 4082 return false; 4083 B.buildCopy(SGPR01, LiveIn); 4084 B.buildInstr(AMDGPU::S_TRAP) 4085 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4086 .addReg(SGPR01, RegState::Implicit); 4087 } 4088 4089 MI.eraseFromParent(); 4090 return true; 4091 } 4092 4093 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4094 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4095 B.setInstr(MI); 4096 4097 // Is non-HSA path or trap-handler disabled? then, report a warning 4098 // accordingly 4099 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4100 !ST.isTrapHandlerEnabled()) { 4101 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4102 "debugtrap handler not supported", 4103 MI.getDebugLoc(), DS_Warning); 4104 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4105 Ctx.diagnose(NoTrap); 4106 } else { 4107 // Insert debug-trap instruction 4108 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4109 } 4110 4111 MI.eraseFromParent(); 4112 return true; 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4116 MachineIRBuilder &B, 4117 GISelChangeObserver &Observer) const { 4118 MachineRegisterInfo &MRI = *B.getMRI(); 4119 4120 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4121 auto IntrID = MI.getIntrinsicID(); 4122 switch (IntrID) { 4123 case Intrinsic::amdgcn_if: 4124 case Intrinsic::amdgcn_else: { 4125 MachineInstr *Br = nullptr; 4126 MachineBasicBlock *UncondBrTarget = nullptr; 4127 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4128 const SIRegisterInfo *TRI 4129 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4130 4131 B.setInstr(*BrCond); 4132 Register Def = MI.getOperand(1).getReg(); 4133 Register Use = MI.getOperand(3).getReg(); 4134 4135 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4136 if (IntrID == Intrinsic::amdgcn_if) { 4137 B.buildInstr(AMDGPU::SI_IF) 4138 .addDef(Def) 4139 .addUse(Use) 4140 .addMBB(UncondBrTarget); 4141 } else { 4142 B.buildInstr(AMDGPU::SI_ELSE) 4143 .addDef(Def) 4144 .addUse(Use) 4145 .addMBB(UncondBrTarget) 4146 .addImm(0); 4147 } 4148 4149 if (Br) { 4150 Br->getOperand(0).setMBB(CondBrTarget); 4151 } else { 4152 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4153 // since we're swapping branch targets it needs to be reinserted. 
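        // SI_IF/SI_ELSE branch to UncondBrTarget themselves once no lanes
        // remain active, so the rebuilt unconditional branch has to target
        // the original conditional successor instead.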
4154 // FIXME: IRTranslator should probably not do this 4155 B.buildBr(*CondBrTarget); 4156 } 4157 4158 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4159 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4160 MI.eraseFromParent(); 4161 BrCond->eraseFromParent(); 4162 return true; 4163 } 4164 4165 return false; 4166 } 4167 case Intrinsic::amdgcn_loop: { 4168 MachineInstr *Br = nullptr; 4169 MachineBasicBlock *UncondBrTarget = nullptr; 4170 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4171 const SIRegisterInfo *TRI 4172 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4173 4174 B.setInstr(*BrCond); 4175 4176 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4177 Register Reg = MI.getOperand(2).getReg(); 4178 B.buildInstr(AMDGPU::SI_LOOP) 4179 .addUse(Reg) 4180 .addMBB(UncondBrTarget); 4181 4182 if (Br) 4183 Br->getOperand(0).setMBB(CondBrTarget); 4184 else 4185 B.buildBr(*CondBrTarget); 4186 4187 MI.eraseFromParent(); 4188 BrCond->eraseFromParent(); 4189 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4190 return true; 4191 } 4192 4193 return false; 4194 } 4195 case Intrinsic::amdgcn_kernarg_segment_ptr: 4196 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4197 B.setInstr(MI); 4198 // This only makes sense to call in a kernel, so just lower to null. 4199 B.buildConstant(MI.getOperand(0).getReg(), 0); 4200 MI.eraseFromParent(); 4201 return true; 4202 } 4203 4204 return legalizePreloadedArgIntrin( 4205 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4206 case Intrinsic::amdgcn_implicitarg_ptr: 4207 return legalizeImplicitArgPtr(MI, MRI, B); 4208 case Intrinsic::amdgcn_workitem_id_x: 4209 return legalizePreloadedArgIntrin(MI, MRI, B, 4210 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4211 case Intrinsic::amdgcn_workitem_id_y: 4212 return legalizePreloadedArgIntrin(MI, MRI, B, 4213 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4214 case Intrinsic::amdgcn_workitem_id_z: 4215 return legalizePreloadedArgIntrin(MI, MRI, B, 4216 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4217 case Intrinsic::amdgcn_workgroup_id_x: 4218 return legalizePreloadedArgIntrin(MI, MRI, B, 4219 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4220 case Intrinsic::amdgcn_workgroup_id_y: 4221 return legalizePreloadedArgIntrin(MI, MRI, B, 4222 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4223 case Intrinsic::amdgcn_workgroup_id_z: 4224 return legalizePreloadedArgIntrin(MI, MRI, B, 4225 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4226 case Intrinsic::amdgcn_dispatch_ptr: 4227 return legalizePreloadedArgIntrin(MI, MRI, B, 4228 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4229 case Intrinsic::amdgcn_queue_ptr: 4230 return legalizePreloadedArgIntrin(MI, MRI, B, 4231 AMDGPUFunctionArgInfo::QUEUE_PTR); 4232 case Intrinsic::amdgcn_implicit_buffer_ptr: 4233 return legalizePreloadedArgIntrin( 4234 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4235 case Intrinsic::amdgcn_dispatch_id: 4236 return legalizePreloadedArgIntrin(MI, MRI, B, 4237 AMDGPUFunctionArgInfo::DISPATCH_ID); 4238 case Intrinsic::amdgcn_fdiv_fast: 4239 return legalizeFDIVFastIntrin(MI, MRI, B); 4240 case Intrinsic::amdgcn_is_shared: 4241 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4242 case Intrinsic::amdgcn_is_private: 4243 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4244 case Intrinsic::amdgcn_wavefrontsize: { 4245 B.setInstr(MI); 4246 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4247 MI.eraseFromParent(); 4248 return true; 4249 } 4250 case 
Intrinsic::amdgcn_s_buffer_load: 4251 return legalizeSBufferLoad(MI, B, Observer); 4252 case Intrinsic::amdgcn_raw_buffer_store: 4253 case Intrinsic::amdgcn_struct_buffer_store: 4254 return legalizeBufferStore(MI, MRI, B, false, false); 4255 case Intrinsic::amdgcn_raw_buffer_store_format: 4256 case Intrinsic::amdgcn_struct_buffer_store_format: 4257 return legalizeBufferStore(MI, MRI, B, false, true); 4258 case Intrinsic::amdgcn_raw_tbuffer_store: 4259 case Intrinsic::amdgcn_struct_tbuffer_store: 4260 return legalizeBufferStore(MI, MRI, B, true, true); 4261 case Intrinsic::amdgcn_raw_buffer_load: 4262 case Intrinsic::amdgcn_struct_buffer_load: 4263 return legalizeBufferLoad(MI, MRI, B, false, false); 4264 case Intrinsic::amdgcn_raw_buffer_load_format: 4265 case Intrinsic::amdgcn_struct_buffer_load_format: 4266 return legalizeBufferLoad(MI, MRI, B, true, false); 4267 case Intrinsic::amdgcn_raw_tbuffer_load: 4268 case Intrinsic::amdgcn_struct_tbuffer_load: 4269 return legalizeBufferLoad(MI, MRI, B, true, true); 4270 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4271 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4272 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4273 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4274 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4275 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4276 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4277 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4278 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4279 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4280 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4281 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4282 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4283 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4284 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4285 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4286 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4287 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4288 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4289 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4290 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4291 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4292 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4293 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4294 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4295 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4296 return legalizeBufferAtomic(MI, B, IntrID); 4297 case Intrinsic::amdgcn_atomic_inc: 4298 return legalizeAtomicIncDec(MI, B, true); 4299 case Intrinsic::amdgcn_atomic_dec: 4300 return legalizeAtomicIncDec(MI, B, false); 4301 case Intrinsic::trap: 4302 return legalizeTrapIntrinsic(MI, MRI, B); 4303 case Intrinsic::debugtrap: 4304 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4305 default: { 4306 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4307 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4308 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4309 return true; 4310 } 4311 } 4312 4313 return true; 4314 } 4315