1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/ADT/ScopeExit.h" 28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 31 #include "llvm/CodeGen/TargetOpcodes.h" 32 #include "llvm/CodeGen/ValueTypes.h" 33 #include "llvm/IR/DerivedTypes.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Type.h" 36 #include "llvm/Support/Debug.h" 37 38 #define DEBUG_TYPE "amdgpu-legalinfo" 39 40 using namespace llvm; 41 using namespace LegalizeActions; 42 using namespace LegalizeMutations; 43 using namespace LegalityPredicates; 44 using namespace MIPatternMatch; 45 46 // Round the number of elements to the next power of two elements 47 static LLT getPow2VectorType(LLT Ty) { 48 unsigned NElts = Ty.getNumElements(); 49 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 50 return Ty.changeNumElements(Pow2NElts); 51 } 52 53 // Round the number of bits to the next power of two bits 54 static LLT getPow2ScalarType(LLT Ty) { 55 unsigned Bits = Ty.getSizeInBits(); 56 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 57 return LLT::scalar(Pow2Bits); 58 } 59 60 static LegalityPredicate isMultiple32(unsigned TypeIdx, 61 unsigned MaxSize = 1024) { 62 return [=](const LegalityQuery &Query) { 63 const LLT Ty = Query.Types[TypeIdx]; 64 const LLT EltTy = Ty.getScalarType(); 65 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 66 }; 67 } 68 69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 70 return [=](const LegalityQuery &Query) { 71 return Query.Types[TypeIdx].getSizeInBits() == Size; 72 }; 73 } 74 75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 76 return [=](const LegalityQuery &Query) { 77 const LLT Ty = Query.Types[TypeIdx]; 78 return Ty.isVector() && 79 Ty.getNumElements() % 2 != 0 && 80 Ty.getElementType().getSizeInBits() < 32 && 81 Ty.getSizeInBits() % 32 != 0; 82 }; 83 } 84 85 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 86 return [=](const LegalityQuery &Query) { 87 const LLT Ty = Query.Types[TypeIdx]; 88 const LLT EltTy = Ty.getScalarType(); 89 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 90 }; 91 } 92 93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 94 return [=](const LegalityQuery &Query) { 95 const LLT Ty = Query.Types[TypeIdx]; 96 const LLT EltTy = Ty.getElementType(); 97 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 98 }; 99 } 100 101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 const LLT EltTy = Ty.getElementType(); 105 unsigned Size = Ty.getSizeInBits(); 106 unsigned Pieces = (Size + 63) / 64; 107 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 108 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 109 }; 110 } 111 112 // Increase the number of vector elements to reach the next multiple of 32-bit 113 // type. 114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 115 return [=](const LegalityQuery &Query) { 116 const LLT Ty = Query.Types[TypeIdx]; 117 118 const LLT EltTy = Ty.getElementType(); 119 const int Size = Ty.getSizeInBits(); 120 const int EltSize = EltTy.getSizeInBits(); 121 const int NextMul32 = (Size + 31) / 32; 122 123 assert(EltSize < 32); 124 125 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 126 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 127 }; 128 } 129 130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 131 return [=](const LegalityQuery &Query) { 132 const LLT QueryTy = Query.Types[TypeIdx]; 133 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 134 }; 135 } 136 137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 138 return [=](const LegalityQuery &Query) { 139 const LLT QueryTy = Query.Types[TypeIdx]; 140 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 141 }; 142 } 143 144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT QueryTy = Query.Types[TypeIdx]; 147 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 148 }; 149 } 150 151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 152 // v2s16. 153 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 if (Ty.isVector()) { 157 const int EltSize = Ty.getElementType().getSizeInBits(); 158 return EltSize == 32 || EltSize == 64 || 159 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 160 EltSize == 128 || EltSize == 256; 161 } 162 163 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 164 }; 165 } 166 167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getElementType() == Type; 171 }; 172 } 173 174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 if (!QueryTy.isVector()) 178 return false; 179 const LLT EltTy = QueryTy.getElementType(); 180 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 181 }; 182 } 183 184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 const LLT Ty = Query.Types[TypeIdx]; 187 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 188 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 189 }; 190 } 191 192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 193 return [=](const LegalityQuery &Query) { 194 return Query.Types[TypeIdx0].getSizeInBits() < 195 Query.Types[TypeIdx1].getSizeInBits(); 196 }; 197 } 198 199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 200 return [=](const LegalityQuery &Query) { 201 return Query.Types[TypeIdx0].getSizeInBits() > 202 Query.Types[TypeIdx1].getSizeInBits(); 203 }; 204 } 205 206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 207 const GCNTargetMachine &TM) 208 : ST(ST_) { 209 using namespace TargetOpcode; 210 211 auto GetAddrSpacePtr = [&TM](unsigned AS) { 212 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 213 }; 214 215 const LLT S1 = LLT::scalar(1); 216 const LLT S16 = LLT::scalar(16); 217 const LLT S32 = LLT::scalar(32); 218 const LLT S64 = LLT::scalar(64); 219 const LLT S128 = LLT::scalar(128); 220 const LLT S256 = LLT::scalar(256); 221 const LLT S512 = LLT::scalar(512); 222 const LLT S1024 = LLT::scalar(1024); 223 224 const LLT V2S16 = LLT::vector(2, 16); 225 const LLT V4S16 = LLT::vector(4, 16); 226 227 const LLT V2S32 = LLT::vector(2, 32); 228 const LLT V3S32 = LLT::vector(3, 32); 229 const LLT V4S32 = LLT::vector(4, 32); 230 const LLT V5S32 = LLT::vector(5, 32); 231 const LLT V6S32 = LLT::vector(6, 32); 232 const LLT V7S32 = LLT::vector(7, 32); 233 const LLT V8S32 = LLT::vector(8, 32); 234 const LLT V9S32 = LLT::vector(9, 32); 235 const LLT V10S32 = LLT::vector(10, 32); 236 const LLT V11S32 = LLT::vector(11, 32); 237 const LLT V12S32 = LLT::vector(12, 32); 238 const LLT V13S32 = LLT::vector(13, 32); 239 const LLT V14S32 = LLT::vector(14, 32); 240 const LLT V15S32 = LLT::vector(15, 32); 241 const LLT V16S32 = LLT::vector(16, 32); 242 const LLT V32S32 = LLT::vector(32, 32); 243 244 const LLT V2S64 = LLT::vector(2, 64); 245 const LLT V3S64 = LLT::vector(3, 64); 246 const LLT V4S64 = LLT::vector(4, 64); 247 const LLT V5S64 = LLT::vector(5, 64); 248 const LLT V6S64 = LLT::vector(6, 64); 249 const LLT V7S64 = LLT::vector(7, 64); 250 const LLT V8S64 = LLT::vector(8, 64); 251 const LLT V16S64 = LLT::vector(16, 64); 252 253 std::initializer_list<LLT> AllS32Vectors = 254 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 255 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 256 std::initializer_list<LLT> AllS64Vectors = 257 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 258 259 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 260 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 261 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 262 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 263 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 264 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 265 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 266 267 const LLT CodePtr = FlatPtr; 268 269 const std::initializer_list<LLT> AddrSpaces64 = { 270 GlobalPtr, ConstantPtr, FlatPtr 271 }; 272 273 const std::initializer_list<LLT> AddrSpaces32 = { 274 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 275 }; 276 277 const std::initializer_list<LLT> FPTypesBase = { 278 S32, S64 279 }; 280 281 const std::initializer_list<LLT> FPTypes16 = { 282 S32, S64, S16 283 }; 284 285 const std::initializer_list<LLT> FPTypesPK16 = { 286 S32, S64, S16, V2S16 287 }; 288 289 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 290 291 setAction({G_BRCOND, S1}, Legal); // VCC branches 292 setAction({G_BRCOND, S32}, Legal); // SCC branches 293 294 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 295 // elements for v3s16 296 getActionDefinitionsBuilder(G_PHI) 297 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 298 .legalFor(AllS32Vectors) 299 .legalFor(AllS64Vectors) 300 .legalFor(AddrSpaces64) 301 .legalFor(AddrSpaces32) 302 .clampScalar(0, S32, S256) 303 .widenScalarToNextPow2(0, 32) 304 .clampMaxNumElements(0, S32, 16) 305 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 306 .legalIf(isPointer(0)); 307 308 if (ST.hasVOP3PInsts()) { 309 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 310 .legalFor({S32, S16, V2S16}) 311 .clampScalar(0, S16, S32) 312 .clampMaxNumElements(0, S16, 2) 313 .scalarize(0) 314 .widenScalarToNextPow2(0, 32); 315 } else if (ST.has16BitInsts()) { 316 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 317 .legalFor({S32, S16}) 318 .clampScalar(0, S16, S32) 319 .scalarize(0) 320 .widenScalarToNextPow2(0, 32); 321 } else { 322 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 323 .legalFor({S32}) 324 .clampScalar(0, S32, S32) 325 .scalarize(0); 326 } 327 328 // FIXME: Not really legal. Placeholder for custom lowering. 329 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 330 .customFor({S32, S64}) 331 .clampScalar(0, S32, S64) 332 .widenScalarToNextPow2(0, 32) 333 .scalarize(0); 334 335 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 336 .legalFor({S32}) 337 .clampScalar(0, S32, S32) 338 .scalarize(0); 339 340 // Report legal for any types we can handle anywhere. For the cases only legal 341 // on the SALU, RegBankSelect will be able to re-legalize. 342 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 343 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 344 .clampScalar(0, S32, S64) 345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 346 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 347 .widenScalarToNextPow2(0) 348 .scalarize(0); 349 350 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 351 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 352 .legalFor({{S32, S1}, {S32, S32}}) 353 .minScalar(0, S32) 354 // TODO: .scalarize(0) 355 .lower(); 356 357 getActionDefinitionsBuilder(G_BITCAST) 358 // Don't worry about the size constraint. 359 .legalIf(all(isRegisterType(0), isRegisterType(1))) 360 .lower(); 361 362 363 getActionDefinitionsBuilder(G_CONSTANT) 364 .legalFor({S1, S32, S64, S16, GlobalPtr, 365 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 366 .clampScalar(0, S32, S64) 367 .widenScalarToNextPow2(0) 368 .legalIf(isPointer(0)); 369 370 getActionDefinitionsBuilder(G_FCONSTANT) 371 .legalFor({S32, S64, S16}) 372 .clampScalar(0, S16, S64); 373 374 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 375 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 376 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampScalarOrElt(0, S32, S1024) 379 .legalIf(isMultiple32(0)) 380 .widenScalarToNextPow2(0, 32) 381 .clampMaxNumElements(0, S32, 16); 382 383 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 384 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 385 .unsupportedFor({PrivatePtr}) 386 .custom(); 387 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 388 389 auto &FPOpActions = getActionDefinitionsBuilder( 390 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 391 .legalFor({S32, S64}); 392 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 393 .customFor({S32, S64}); 394 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 395 .customFor({S32, S64}); 396 397 if (ST.has16BitInsts()) { 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.legalFor({S16, V2S16}); 400 else 401 FPOpActions.legalFor({S16}); 402 403 TrigActions.customFor({S16}); 404 FDIVActions.customFor({S16}); 405 } 406 407 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 408 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 409 410 if (ST.hasVOP3PInsts()) { 411 MinNumMaxNum.customFor(FPTypesPK16) 412 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 413 .clampMaxNumElements(0, S16, 2) 414 .clampScalar(0, S16, S64) 415 .scalarize(0); 416 } else if (ST.has16BitInsts()) { 417 MinNumMaxNum.customFor(FPTypes16) 418 .clampScalar(0, S16, S64) 419 .scalarize(0); 420 } else { 421 MinNumMaxNum.customFor(FPTypesBase) 422 .clampScalar(0, S32, S64) 423 .scalarize(0); 424 } 425 426 if (ST.hasVOP3PInsts()) 427 FPOpActions.clampMaxNumElements(0, S16, 2); 428 429 FPOpActions 430 .scalarize(0) 431 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 432 433 TrigActions 434 .scalarize(0) 435 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 436 437 FDIVActions 438 .scalarize(0) 439 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 440 441 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 442 .legalFor(FPTypesPK16) 443 .clampMaxNumElements(0, S16, 2) 444 .scalarize(0) 445 .clampScalar(0, S16, S64); 446 447 if (ST.has16BitInsts()) { 448 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 449 .legalFor({S32, S64, S16}) 450 .scalarize(0) 451 .clampScalar(0, S16, S64); 452 } else { 453 getActionDefinitionsBuilder(G_FSQRT) 454 .legalFor({S32, S64}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 if (ST.hasFractBug()) { 459 getActionDefinitionsBuilder(G_FFLOOR) 460 .customFor({S64}) 461 .legalFor({S32, S64}) 462 .scalarize(0) 463 .clampScalar(0, S32, S64); 464 } else { 465 getActionDefinitionsBuilder(G_FFLOOR) 466 .legalFor({S32, S64}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 } 470 } 471 472 getActionDefinitionsBuilder(G_FPTRUNC) 473 .legalFor({{S32, S64}, {S16, S32}}) 474 .scalarize(0) 475 .lower(); 476 477 getActionDefinitionsBuilder(G_FPEXT) 478 .legalFor({{S64, S32}, {S32, S16}}) 479 .lowerFor({{S64, S16}}) // FIXME: Implement 480 .scalarize(0); 481 482 getActionDefinitionsBuilder(G_FSUB) 483 // Use actual fsub instruction 484 .legalFor({S32}) 485 // Must use fadd + fneg 486 .lowerFor({S64, S16, V2S16}) 487 .scalarize(0) 488 .clampScalar(0, S32, S64); 489 490 // Whether this is legal depends on the floating point mode for the function. 491 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 492 if (ST.hasMadF16()) 493 FMad.customFor({S32, S16}); 494 else 495 FMad.customFor({S32}); 496 FMad.scalarize(0) 497 .lower(); 498 499 // TODO: Do we need to clamp maximum bitwidth? 500 getActionDefinitionsBuilder(G_TRUNC) 501 .legalIf(isScalar(0)) 502 .legalFor({{V2S16, V2S32}}) 503 .clampMaxNumElements(0, S16, 2) 504 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 505 // situations (like an invalid implicit use), we don't want to infinite loop 506 // in the legalizer. 507 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 508 .alwaysLegal(); 509 510 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 511 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 512 {S32, S1}, {S64, S1}, {S16, S1}}) 513 .scalarize(0) 514 .clampScalar(0, S32, S64) 515 .widenScalarToNextPow2(1, 32); 516 517 // TODO: Split s1->s64 during regbankselect for VALU. 518 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 519 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 520 .lowerFor({{S32, S64}}) 521 .lowerIf(typeIs(1, S1)) 522 .customFor({{S64, S64}}); 523 if (ST.has16BitInsts()) 524 IToFP.legalFor({{S16, S16}}); 525 IToFP.clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(1); 528 529 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 530 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 531 .customFor({{S64, S64}}); 532 if (ST.has16BitInsts()) 533 FPToI.legalFor({{S16, S16}}); 534 else 535 FPToI.minScalar(1, S32); 536 537 FPToI.minScalar(0, S32) 538 .scalarize(0) 539 .lower(); 540 541 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 542 .scalarize(0) 543 .lower(); 544 545 if (ST.has16BitInsts()) { 546 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 547 .legalFor({S16, S32, S64}) 548 .clampScalar(0, S16, S64) 549 .scalarize(0); 550 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 551 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 552 .legalFor({S32, S64}) 553 .clampScalar(0, S32, S64) 554 .scalarize(0); 555 } else { 556 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 557 .legalFor({S32}) 558 .customFor({S64}) 559 .clampScalar(0, S32, S64) 560 .scalarize(0); 561 } 562 563 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 564 .scalarize(0) 565 .alwaysLegal(); 566 567 auto &CmpBuilder = 568 getActionDefinitionsBuilder(G_ICMP) 569 // The compare output type differs based on the register bank of the output, 570 // so make both s1 and s32 legal. 571 // 572 // Scalar compares producing output in scc will be promoted to s32, as that 573 // is the allocatable register type that will be needed for the copy from 574 // scc. This will be promoted during RegBankSelect, and we assume something 575 // before that won't try to use s32 result types. 576 // 577 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 578 // bank. 579 .legalForCartesianProduct( 580 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 581 .legalForCartesianProduct( 582 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 583 if (ST.has16BitInsts()) { 584 CmpBuilder.legalFor({{S1, S16}}); 585 } 586 587 CmpBuilder 588 .widenScalarToNextPow2(1) 589 .clampScalar(1, S32, S64) 590 .scalarize(0) 591 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 592 593 getActionDefinitionsBuilder(G_FCMP) 594 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 595 .widenScalarToNextPow2(1) 596 .clampScalar(1, S32, S64) 597 .scalarize(0); 598 599 // FIXME: fpow has a selection pattern that should move to custom lowering. 600 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 601 if (ST.has16BitInsts()) 602 Exp2Ops.legalFor({S32, S16}); 603 else 604 Exp2Ops.legalFor({S32}); 605 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 606 Exp2Ops.scalarize(0); 607 608 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 609 if (ST.has16BitInsts()) 610 ExpOps.customFor({{S32}, {S16}}); 611 else 612 ExpOps.customFor({S32}); 613 ExpOps.clampScalar(0, MinScalarFPTy, S32) 614 .scalarize(0); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 617 getActionDefinitionsBuilder(G_CTPOP) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 // The hardware instructions return a different result on 0 than the generic 626 // instructions expect. The hardware produces -1, but these produce the 627 // bitwidth. 628 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 629 .scalarize(0) 630 .clampScalar(0, S32, S32) 631 .clampScalar(1, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .widenScalarToNextPow2(1, 32) 634 .lower(); 635 636 // The 64-bit versions produce 32-bit results, but only on the SALU. 637 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 638 .legalFor({{S32, S32}, {S32, S64}}) 639 .clampScalar(0, S32, S32) 640 .clampScalar(1, S32, S64) 641 .scalarize(0) 642 .widenScalarToNextPow2(0, 32) 643 .widenScalarToNextPow2(1, 32); 644 645 getActionDefinitionsBuilder(G_BITREVERSE) 646 .legalFor({S32}) 647 .clampScalar(0, S32, S32) 648 .scalarize(0); 649 650 if (ST.has16BitInsts()) { 651 getActionDefinitionsBuilder(G_BSWAP) 652 .legalFor({S16, S32, V2S16}) 653 .clampMaxNumElements(0, S16, 2) 654 // FIXME: Fixing non-power-of-2 before clamp is workaround for 655 // narrowScalar limitation. 656 .widenScalarToNextPow2(0) 657 .clampScalar(0, S16, S32) 658 .scalarize(0); 659 660 if (ST.hasVOP3PInsts()) { 661 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 662 .legalFor({S32, S16, V2S16}) 663 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 664 .clampMaxNumElements(0, S16, 2) 665 .minScalar(0, S16) 666 .widenScalarToNextPow2(0) 667 .scalarize(0) 668 .lower(); 669 } else { 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32, S16}) 672 .widenScalarToNextPow2(0) 673 .minScalar(0, S16) 674 .scalarize(0) 675 .lower(); 676 } 677 } else { 678 // TODO: Should have same legality without v_perm_b32 679 getActionDefinitionsBuilder(G_BSWAP) 680 .legalFor({S32}) 681 .lowerIf(narrowerThan(0, 32)) 682 // FIXME: Fixing non-power-of-2 before clamp is workaround for 683 // narrowScalar limitation. 684 .widenScalarToNextPow2(0) 685 .maxScalar(0, S32) 686 .scalarize(0) 687 .lower(); 688 689 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 690 .legalFor({S32}) 691 .minScalar(0, S32) 692 .widenScalarToNextPow2(0) 693 .scalarize(0) 694 .lower(); 695 } 696 697 getActionDefinitionsBuilder(G_INTTOPTR) 698 // List the common cases 699 .legalForCartesianProduct(AddrSpaces64, {S64}) 700 .legalForCartesianProduct(AddrSpaces32, {S32}) 701 .scalarize(0) 702 // Accept any address space as long as the size matches 703 .legalIf(sameSize(0, 1)) 704 .widenScalarIf(smallerThan(1, 0), 705 [](const LegalityQuery &Query) { 706 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 707 }) 708 .narrowScalarIf(greaterThan(1, 0), 709 [](const LegalityQuery &Query) { 710 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 711 }); 712 713 getActionDefinitionsBuilder(G_PTRTOINT) 714 // List the common cases 715 .legalForCartesianProduct(AddrSpaces64, {S64}) 716 .legalForCartesianProduct(AddrSpaces32, {S32}) 717 .scalarize(0) 718 // Accept any address space as long as the size matches 719 .legalIf(sameSize(0, 1)) 720 .widenScalarIf(smallerThan(0, 1), 721 [](const LegalityQuery &Query) { 722 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 723 }) 724 .narrowScalarIf( 725 greaterThan(0, 1), 726 [](const LegalityQuery &Query) { 727 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 728 }); 729 730 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 731 .scalarize(0) 732 .custom(); 733 734 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 735 // handle some operations by just promoting the register during 736 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 737 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 738 switch (AS) { 739 // FIXME: Private element size. 740 case AMDGPUAS::PRIVATE_ADDRESS: 741 return 32; 742 // FIXME: Check subtarget 743 case AMDGPUAS::LOCAL_ADDRESS: 744 return ST.useDS128() ? 128 : 64; 745 746 // Treat constant and global as identical. SMRD loads are sometimes usable 747 // for global loads (ideally constant address space should be eliminated) 748 // depending on the context. Legality cannot be context dependent, but 749 // RegBankSelect can split the load as necessary depending on the pointer 750 // register bank/uniformity and if the memory is invariant or not written in 751 // a kernel. 752 case AMDGPUAS::CONSTANT_ADDRESS: 753 case AMDGPUAS::GLOBAL_ADDRESS: 754 return IsLoad ? 512 : 128; 755 default: 756 return 128; 757 } 758 }; 759 760 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 761 bool IsLoad) -> bool { 762 const LLT DstTy = Query.Types[0]; 763 764 // Split vector extloads. 765 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 766 unsigned Align = Query.MMODescrs[0].AlignInBits; 767 768 if (MemSize < DstTy.getSizeInBits()) 769 MemSize = std::max(MemSize, Align); 770 771 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 772 return true; 773 774 const LLT PtrTy = Query.Types[1]; 775 unsigned AS = PtrTy.getAddressSpace(); 776 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 777 return true; 778 779 // Catch weird sized loads that don't evenly divide into the access sizes 780 // TODO: May be able to widen depending on alignment etc. 781 unsigned NumRegs = (MemSize + 31) / 32; 782 if (NumRegs == 3) { 783 if (!ST.hasDwordx3LoadStores()) 784 return true; 785 } else { 786 // If the alignment allows, these should have been widened. 787 if (!isPowerOf2_32(NumRegs)) 788 return true; 789 } 790 791 if (Align < MemSize) { 792 const SITargetLowering *TLI = ST.getTargetLowering(); 793 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 794 } 795 796 return false; 797 }; 798 799 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 800 unsigned Size = Query.Types[0].getSizeInBits(); 801 if (isPowerOf2_32(Size)) 802 return false; 803 804 if (Size == 96 && ST.hasDwordx3LoadStores()) 805 return false; 806 807 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 808 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 809 return false; 810 811 unsigned Align = Query.MMODescrs[0].AlignInBits; 812 unsigned RoundedSize = NextPowerOf2(Size); 813 return (Align >= RoundedSize); 814 }; 815 816 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 817 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 818 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 819 820 // TODO: Refine based on subtargets which support unaligned access or 128-bit 821 // LDS 822 // TODO: Unsupported flat for SI. 823 824 for (unsigned Op : {G_LOAD, G_STORE}) { 825 const bool IsStore = Op == G_STORE; 826 827 auto &Actions = getActionDefinitionsBuilder(Op); 828 // Whitelist the common cases. 829 // TODO: Loads to s16 on gfx9 830 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 831 {V2S32, GlobalPtr, 64, GlobalAlign32}, 832 {V4S32, GlobalPtr, 128, GlobalAlign32}, 833 {S128, GlobalPtr, 128, GlobalAlign32}, 834 {S64, GlobalPtr, 64, GlobalAlign32}, 835 {V2S64, GlobalPtr, 128, GlobalAlign32}, 836 {V2S16, GlobalPtr, 32, GlobalAlign32}, 837 {S32, GlobalPtr, 8, GlobalAlign8}, 838 {S32, GlobalPtr, 16, GlobalAlign16}, 839 840 {S32, LocalPtr, 32, 32}, 841 {S64, LocalPtr, 64, 32}, 842 {V2S32, LocalPtr, 64, 32}, 843 {S32, LocalPtr, 8, 8}, 844 {S32, LocalPtr, 16, 16}, 845 {V2S16, LocalPtr, 32, 32}, 846 847 {S32, PrivatePtr, 32, 32}, 848 {S32, PrivatePtr, 8, 8}, 849 {S32, PrivatePtr, 16, 16}, 850 {V2S16, PrivatePtr, 32, 32}, 851 852 {S32, FlatPtr, 32, GlobalAlign32}, 853 {S32, FlatPtr, 16, GlobalAlign16}, 854 {S32, FlatPtr, 8, GlobalAlign8}, 855 {V2S16, FlatPtr, 32, GlobalAlign32}, 856 857 {S32, ConstantPtr, 32, GlobalAlign32}, 858 {V2S32, ConstantPtr, 64, GlobalAlign32}, 859 {V4S32, ConstantPtr, 128, GlobalAlign32}, 860 {S64, ConstantPtr, 64, GlobalAlign32}, 861 {S128, ConstantPtr, 128, GlobalAlign32}, 862 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 863 Actions 864 .customIf(typeIs(1, Constant32Ptr)) 865 // Widen suitably aligned loads by loading extra elements. 866 .moreElementsIf([=](const LegalityQuery &Query) { 867 const LLT Ty = Query.Types[0]; 868 return Op == G_LOAD && Ty.isVector() && 869 shouldWidenLoadResult(Query); 870 }, moreElementsToNextPow2(0)) 871 .widenScalarIf([=](const LegalityQuery &Query) { 872 const LLT Ty = Query.Types[0]; 873 return Op == G_LOAD && !Ty.isVector() && 874 shouldWidenLoadResult(Query); 875 }, widenScalarOrEltToNextPow2(0)) 876 .narrowScalarIf( 877 [=](const LegalityQuery &Query) -> bool { 878 return !Query.Types[0].isVector() && 879 needToSplitMemOp(Query, Op == G_LOAD); 880 }, 881 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 882 const LLT DstTy = Query.Types[0]; 883 const LLT PtrTy = Query.Types[1]; 884 885 const unsigned DstSize = DstTy.getSizeInBits(); 886 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 887 888 // Split extloads. 889 if (DstSize > MemSize) 890 return std::make_pair(0, LLT::scalar(MemSize)); 891 892 if (!isPowerOf2_32(DstSize)) { 893 // We're probably decomposing an odd sized store. Try to split 894 // to the widest type. TODO: Account for alignment. As-is it 895 // should be OK, since the new parts will be further legalized. 896 unsigned FloorSize = PowerOf2Floor(DstSize); 897 return std::make_pair(0, LLT::scalar(FloorSize)); 898 } 899 900 if (DstSize > 32 && (DstSize % 32 != 0)) { 901 // FIXME: Need a way to specify non-extload of larger size if 902 // suitably aligned. 903 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 904 } 905 906 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 907 Op == G_LOAD); 908 if (MemSize > MaxSize) 909 return std::make_pair(0, LLT::scalar(MaxSize)); 910 911 unsigned Align = Query.MMODescrs[0].AlignInBits; 912 return std::make_pair(0, LLT::scalar(Align)); 913 }) 914 .fewerElementsIf( 915 [=](const LegalityQuery &Query) -> bool { 916 return Query.Types[0].isVector() && 917 needToSplitMemOp(Query, Op == G_LOAD); 918 }, 919 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 920 const LLT DstTy = Query.Types[0]; 921 const LLT PtrTy = Query.Types[1]; 922 923 LLT EltTy = DstTy.getElementType(); 924 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 925 Op == G_LOAD); 926 927 // FIXME: Handle widened to power of 2 results better. This ends 928 // up scalarizing. 929 // FIXME: 3 element stores scalarized on SI 930 931 // Split if it's too large for the address space. 932 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 933 unsigned NumElts = DstTy.getNumElements(); 934 unsigned EltSize = EltTy.getSizeInBits(); 935 936 if (MaxSize % EltSize == 0) { 937 return std::make_pair( 938 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 939 } 940 941 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 942 943 // FIXME: Refine when odd breakdowns handled 944 // The scalars will need to be re-legalized. 945 if (NumPieces == 1 || NumPieces >= NumElts || 946 NumElts % NumPieces != 0) 947 return std::make_pair(0, EltTy); 948 949 return std::make_pair(0, 950 LLT::vector(NumElts / NumPieces, EltTy)); 951 } 952 953 // FIXME: We could probably handle weird extending loads better. 954 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 955 if (DstTy.getSizeInBits() > MemSize) 956 return std::make_pair(0, EltTy); 957 958 unsigned EltSize = EltTy.getSizeInBits(); 959 unsigned DstSize = DstTy.getSizeInBits(); 960 if (!isPowerOf2_32(DstSize)) { 961 // We're probably decomposing an odd sized store. Try to split 962 // to the widest type. TODO: Account for alignment. As-is it 963 // should be OK, since the new parts will be further legalized. 964 unsigned FloorSize = PowerOf2Floor(DstSize); 965 return std::make_pair( 966 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 967 } 968 969 // Need to split because of alignment. 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 if (EltSize > Align && 972 (EltSize / Align < DstTy.getNumElements())) { 973 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 974 } 975 976 // May need relegalization for the scalars. 977 return std::make_pair(0, EltTy); 978 }) 979 .minScalar(0, S32); 980 981 if (IsStore) 982 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 983 984 // TODO: Need a bitcast lower option? 985 Actions 986 .legalIf([=](const LegalityQuery &Query) { 987 const LLT Ty0 = Query.Types[0]; 988 unsigned Size = Ty0.getSizeInBits(); 989 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 990 unsigned Align = Query.MMODescrs[0].AlignInBits; 991 992 // FIXME: Widening store from alignment not valid. 993 if (MemSize < Size) 994 MemSize = std::max(MemSize, Align); 995 996 // No extending vector loads. 997 if (Size > MemSize && Ty0.isVector()) 998 return false; 999 1000 switch (MemSize) { 1001 case 8: 1002 case 16: 1003 return Size == 32; 1004 case 32: 1005 case 64: 1006 case 128: 1007 return true; 1008 case 96: 1009 return ST.hasDwordx3LoadStores(); 1010 case 256: 1011 case 512: 1012 return true; 1013 default: 1014 return false; 1015 } 1016 }) 1017 .widenScalarToNextPow2(0) 1018 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1019 } 1020 1021 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1022 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1023 {S32, GlobalPtr, 16, 2 * 8}, 1024 {S32, LocalPtr, 8, 8}, 1025 {S32, LocalPtr, 16, 16}, 1026 {S32, PrivatePtr, 8, 8}, 1027 {S32, PrivatePtr, 16, 16}, 1028 {S32, ConstantPtr, 8, 8}, 1029 {S32, ConstantPtr, 16, 2 * 8}}); 1030 if (ST.hasFlatAddressSpace()) { 1031 ExtLoads.legalForTypesWithMemDesc( 1032 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1033 } 1034 1035 ExtLoads.clampScalar(0, S32, S32) 1036 .widenScalarToNextPow2(0) 1037 .unsupportedIfMemSizeNotPow2() 1038 .lower(); 1039 1040 auto &Atomics = getActionDefinitionsBuilder( 1041 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1042 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1043 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1044 G_ATOMICRMW_UMIN}) 1045 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1046 {S64, GlobalPtr}, {S64, LocalPtr}}); 1047 if (ST.hasFlatAddressSpace()) { 1048 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1049 } 1050 1051 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1052 .legalFor({{S32, LocalPtr}}); 1053 1054 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1055 // demarshalling 1056 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1057 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1058 {S32, FlatPtr}, {S64, FlatPtr}}) 1059 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1060 {S32, RegionPtr}, {S64, RegionPtr}}); 1061 // TODO: Pointer types, any 32-bit or 64-bit vector 1062 1063 // Condition should be s32 for scalar, s1 for vector. 1064 getActionDefinitionsBuilder(G_SELECT) 1065 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1066 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1067 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1068 .clampScalar(0, S16, S64) 1069 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1070 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1071 .scalarize(1) 1072 .clampMaxNumElements(0, S32, 2) 1073 .clampMaxNumElements(0, LocalPtr, 2) 1074 .clampMaxNumElements(0, PrivatePtr, 2) 1075 .scalarize(0) 1076 .widenScalarToNextPow2(0) 1077 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1078 1079 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1080 // be more flexible with the shift amount type. 1081 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1082 .legalFor({{S32, S32}, {S64, S32}}); 1083 if (ST.has16BitInsts()) { 1084 if (ST.hasVOP3PInsts()) { 1085 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1086 .clampMaxNumElements(0, S16, 2); 1087 } else 1088 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1089 1090 // TODO: Support 16-bit shift amounts 1091 Shifts.clampScalar(1, S32, S32); 1092 Shifts.clampScalar(0, S16, S64); 1093 Shifts.widenScalarToNextPow2(0, 16); 1094 } else { 1095 // Make sure we legalize the shift amount type first, as the general 1096 // expansion for the shifted type will produce much worse code if it hasn't 1097 // been truncated already. 1098 Shifts.clampScalar(1, S32, S32); 1099 Shifts.clampScalar(0, S32, S64); 1100 Shifts.widenScalarToNextPow2(0, 32); 1101 } 1102 Shifts.scalarize(0); 1103 1104 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1105 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1106 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1107 unsigned IdxTypeIdx = 2; 1108 1109 getActionDefinitionsBuilder(Op) 1110 .customIf([=](const LegalityQuery &Query) { 1111 const LLT EltTy = Query.Types[EltTypeIdx]; 1112 const LLT VecTy = Query.Types[VecTypeIdx]; 1113 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1114 return (EltTy.getSizeInBits() == 16 || 1115 EltTy.getSizeInBits() % 32 == 0) && 1116 VecTy.getSizeInBits() % 32 == 0 && 1117 VecTy.getSizeInBits() <= 1024 && 1118 IdxTy.getSizeInBits() == 32; 1119 }) 1120 .clampScalar(EltTypeIdx, S32, S64) 1121 .clampScalar(VecTypeIdx, S32, S64) 1122 .clampScalar(IdxTypeIdx, S32, S32); 1123 } 1124 1125 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1126 .unsupportedIf([=](const LegalityQuery &Query) { 1127 const LLT &EltTy = Query.Types[1].getElementType(); 1128 return Query.Types[0] != EltTy; 1129 }); 1130 1131 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1132 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1133 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1134 1135 // FIXME: Doesn't handle extract of illegal sizes. 1136 getActionDefinitionsBuilder(Op) 1137 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1138 // FIXME: Multiples of 16 should not be legal. 1139 .legalIf([=](const LegalityQuery &Query) { 1140 const LLT BigTy = Query.Types[BigTyIdx]; 1141 const LLT LitTy = Query.Types[LitTyIdx]; 1142 return (BigTy.getSizeInBits() % 32 == 0) && 1143 (LitTy.getSizeInBits() % 16 == 0); 1144 }) 1145 .widenScalarIf( 1146 [=](const LegalityQuery &Query) { 1147 const LLT BigTy = Query.Types[BigTyIdx]; 1148 return (BigTy.getScalarSizeInBits() < 16); 1149 }, 1150 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1151 .widenScalarIf( 1152 [=](const LegalityQuery &Query) { 1153 const LLT LitTy = Query.Types[LitTyIdx]; 1154 return (LitTy.getScalarSizeInBits() < 16); 1155 }, 1156 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1157 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1158 .widenScalarToNextPow2(BigTyIdx, 32); 1159 1160 } 1161 1162 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1163 .legalForCartesianProduct(AllS32Vectors, {S32}) 1164 .legalForCartesianProduct(AllS64Vectors, {S64}) 1165 .clampNumElements(0, V16S32, V32S32) 1166 .clampNumElements(0, V2S64, V16S64) 1167 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1168 1169 if (ST.hasScalarPackInsts()) { 1170 BuildVector 1171 // FIXME: Should probably widen s1 vectors straight to s32 1172 .minScalarOrElt(0, S16) 1173 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1174 .minScalar(1, S32); 1175 1176 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1177 .legalFor({V2S16, S32}) 1178 .lower(); 1179 BuildVector.minScalarOrElt(0, S32); 1180 } else { 1181 BuildVector.customFor({V2S16, S16}); 1182 BuildVector.minScalarOrElt(0, S32); 1183 1184 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1185 .customFor({V2S16, S32}) 1186 .lower(); 1187 } 1188 1189 BuildVector.legalIf(isRegisterType(0)); 1190 1191 // FIXME: Clamp maximum size 1192 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1193 .legalIf(isRegisterType(0)); 1194 1195 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1196 // pre-legalize. 1197 if (ST.hasVOP3PInsts()) { 1198 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1199 .customFor({V2S16, V2S16}) 1200 .lower(); 1201 } else 1202 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1203 1204 // Merge/Unmerge 1205 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1206 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1207 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1208 1209 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1210 const LLT Ty = Query.Types[TypeIdx]; 1211 if (Ty.isVector()) { 1212 const LLT &EltTy = Ty.getElementType(); 1213 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1214 return true; 1215 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1216 return true; 1217 } 1218 return false; 1219 }; 1220 1221 auto &Builder = getActionDefinitionsBuilder(Op) 1222 // Try to widen to s16 first for small types. 1223 // TODO: Only do this on targets with legal s16 shifts 1224 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1225 1226 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1227 .lowerFor({{S16, V2S16}}) 1228 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1229 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1230 elementTypeIs(1, S16)), 1231 changeTo(1, V2S16)) 1232 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1233 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1234 // valid. 1235 .clampScalar(LitTyIdx, S32, S512) 1236 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1237 // Break up vectors with weird elements into scalars 1238 .fewerElementsIf( 1239 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1240 scalarize(0)) 1241 .fewerElementsIf( 1242 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1243 scalarize(1)) 1244 .clampScalar(BigTyIdx, S32, S1024); 1245 1246 if (Op == G_MERGE_VALUES) { 1247 Builder.widenScalarIf( 1248 // TODO: Use 16-bit shifts if legal for 8-bit values? 1249 [=](const LegalityQuery &Query) { 1250 const LLT Ty = Query.Types[LitTyIdx]; 1251 return Ty.getSizeInBits() < 32; 1252 }, 1253 changeTo(LitTyIdx, S32)); 1254 } 1255 1256 Builder.widenScalarIf( 1257 [=](const LegalityQuery &Query) { 1258 const LLT Ty = Query.Types[BigTyIdx]; 1259 return !isPowerOf2_32(Ty.getSizeInBits()) && 1260 Ty.getSizeInBits() % 16 != 0; 1261 }, 1262 [=](const LegalityQuery &Query) { 1263 // Pick the next power of 2, or a multiple of 64 over 128. 1264 // Whichever is smaller. 1265 const LLT &Ty = Query.Types[BigTyIdx]; 1266 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1267 if (NewSizeInBits >= 256) { 1268 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1269 if (RoundedTo < NewSizeInBits) 1270 NewSizeInBits = RoundedTo; 1271 } 1272 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1273 }) 1274 .legalIf([=](const LegalityQuery &Query) { 1275 const LLT &BigTy = Query.Types[BigTyIdx]; 1276 const LLT &LitTy = Query.Types[LitTyIdx]; 1277 1278 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1279 return false; 1280 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1281 return false; 1282 1283 return BigTy.getSizeInBits() % 16 == 0 && 1284 LitTy.getSizeInBits() % 16 == 0 && 1285 BigTy.getSizeInBits() <= 1024; 1286 }) 1287 // Any vectors left are the wrong size. Scalarize them. 1288 .scalarize(0) 1289 .scalarize(1); 1290 } 1291 1292 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1293 // RegBankSelect. 1294 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1295 .legalFor({{S32}, {S64}}); 1296 1297 if (ST.hasVOP3PInsts()) { 1298 SextInReg.lowerFor({{V2S16}}) 1299 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1300 // get more vector shift opportunities, since we'll get those when 1301 // expanded. 1302 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1303 } else if (ST.has16BitInsts()) { 1304 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1305 } else { 1306 // Prefer to promote to s32 before lowering if we don't have 16-bit 1307 // shifts. This avoid a lot of intermediate truncate and extend operations. 1308 SextInReg.lowerFor({{S32}, {S64}}); 1309 } 1310 1311 SextInReg 1312 .scalarize(0) 1313 .clampScalar(0, S32, S64) 1314 .lower(); 1315 1316 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1317 .legalFor({S64}); 1318 1319 getActionDefinitionsBuilder({ 1320 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1321 G_FCOPYSIGN, 1322 1323 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1324 G_READ_REGISTER, 1325 G_WRITE_REGISTER, 1326 1327 G_SADDO, G_SSUBO, 1328 1329 // TODO: Implement 1330 G_FMINIMUM, G_FMAXIMUM 1331 }).lower(); 1332 1333 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1334 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1335 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1336 .unsupported(); 1337 1338 computeTables(); 1339 verify(*ST.getInstrInfo()); 1340 } 1341 1342 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1343 MachineRegisterInfo &MRI, 1344 MachineIRBuilder &B, 1345 GISelChangeObserver &Observer) const { 1346 switch (MI.getOpcode()) { 1347 case TargetOpcode::G_ADDRSPACE_CAST: 1348 return legalizeAddrSpaceCast(MI, MRI, B); 1349 case TargetOpcode::G_FRINT: 1350 return legalizeFrint(MI, MRI, B); 1351 case TargetOpcode::G_FCEIL: 1352 return legalizeFceil(MI, MRI, B); 1353 case TargetOpcode::G_INTRINSIC_TRUNC: 1354 return legalizeIntrinsicTrunc(MI, MRI, B); 1355 case TargetOpcode::G_SITOFP: 1356 return legalizeITOFP(MI, MRI, B, true); 1357 case TargetOpcode::G_UITOFP: 1358 return legalizeITOFP(MI, MRI, B, false); 1359 case TargetOpcode::G_FPTOSI: 1360 return legalizeFPTOI(MI, MRI, B, true); 1361 case TargetOpcode::G_FPTOUI: 1362 return legalizeFPTOI(MI, MRI, B, false); 1363 case TargetOpcode::G_FMINNUM: 1364 case TargetOpcode::G_FMAXNUM: 1365 case TargetOpcode::G_FMINNUM_IEEE: 1366 case TargetOpcode::G_FMAXNUM_IEEE: 1367 return legalizeMinNumMaxNum(MI, MRI, B); 1368 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1369 return legalizeExtractVectorElt(MI, MRI, B); 1370 case TargetOpcode::G_INSERT_VECTOR_ELT: 1371 return legalizeInsertVectorElt(MI, MRI, B); 1372 case TargetOpcode::G_SHUFFLE_VECTOR: 1373 return legalizeShuffleVector(MI, MRI, B); 1374 case TargetOpcode::G_FSIN: 1375 case TargetOpcode::G_FCOS: 1376 return legalizeSinCos(MI, MRI, B); 1377 case TargetOpcode::G_GLOBAL_VALUE: 1378 return legalizeGlobalValue(MI, MRI, B); 1379 case TargetOpcode::G_LOAD: 1380 return legalizeLoad(MI, MRI, B, Observer); 1381 case TargetOpcode::G_FMAD: 1382 return legalizeFMad(MI, MRI, B); 1383 case TargetOpcode::G_FDIV: 1384 return legalizeFDIV(MI, MRI, B); 1385 case TargetOpcode::G_UDIV: 1386 case TargetOpcode::G_UREM: 1387 return legalizeUDIV_UREM(MI, MRI, B); 1388 case TargetOpcode::G_SDIV: 1389 case TargetOpcode::G_SREM: 1390 return legalizeSDIV_SREM(MI, MRI, B); 1391 case TargetOpcode::G_ATOMIC_CMPXCHG: 1392 return legalizeAtomicCmpXChg(MI, MRI, B); 1393 case TargetOpcode::G_FLOG: 1394 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1395 case TargetOpcode::G_FLOG10: 1396 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1397 case TargetOpcode::G_FEXP: 1398 return legalizeFExp(MI, B); 1399 case TargetOpcode::G_FPOW: 1400 return legalizeFPow(MI, B); 1401 case TargetOpcode::G_FFLOOR: 1402 return legalizeFFloor(MI, MRI, B); 1403 case TargetOpcode::G_BUILD_VECTOR: 1404 return legalizeBuildVector(MI, MRI, B); 1405 default: 1406 return false; 1407 } 1408 1409 llvm_unreachable("expected switch to return"); 1410 } 1411 1412 Register AMDGPULegalizerInfo::getSegmentAperture( 1413 unsigned AS, 1414 MachineRegisterInfo &MRI, 1415 MachineIRBuilder &B) const { 1416 MachineFunction &MF = B.getMF(); 1417 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1418 const LLT S32 = LLT::scalar(32); 1419 1420 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1421 1422 if (ST.hasApertureRegs()) { 1423 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1424 // getreg. 1425 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1426 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1427 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1428 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1429 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1430 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1431 unsigned Encoding = 1432 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1433 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1434 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1435 1436 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1437 1438 B.buildInstr(AMDGPU::S_GETREG_B32) 1439 .addDef(GetReg) 1440 .addImm(Encoding); 1441 MRI.setType(GetReg, S32); 1442 1443 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1444 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1445 } 1446 1447 Register QueuePtr = MRI.createGenericVirtualRegister( 1448 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1449 1450 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1451 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1452 return Register(); 1453 1454 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1455 // private_segment_aperture_base_hi. 1456 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1457 1458 // TODO: can we be smarter about machine pointer info? 1459 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1460 MachineMemOperand *MMO = MF.getMachineMemOperand( 1461 PtrInfo, 1462 MachineMemOperand::MOLoad | 1463 MachineMemOperand::MODereferenceable | 1464 MachineMemOperand::MOInvariant, 1465 4, 1466 MinAlign(64, StructOffset)); 1467 1468 Register LoadAddr; 1469 1470 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1471 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1472 } 1473 1474 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1475 MachineInstr &MI, MachineRegisterInfo &MRI, 1476 MachineIRBuilder &B) const { 1477 MachineFunction &MF = B.getMF(); 1478 1479 B.setInstr(MI); 1480 1481 const LLT S32 = LLT::scalar(32); 1482 Register Dst = MI.getOperand(0).getReg(); 1483 Register Src = MI.getOperand(1).getReg(); 1484 1485 LLT DstTy = MRI.getType(Dst); 1486 LLT SrcTy = MRI.getType(Src); 1487 unsigned DestAS = DstTy.getAddressSpace(); 1488 unsigned SrcAS = SrcTy.getAddressSpace(); 1489 1490 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1491 // vector element. 1492 assert(!DstTy.isVector()); 1493 1494 const AMDGPUTargetMachine &TM 1495 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1496 1497 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1498 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1499 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1500 return true; 1501 } 1502 1503 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1504 // Truncate. 1505 B.buildExtract(Dst, Src, 0); 1506 MI.eraseFromParent(); 1507 return true; 1508 } 1509 1510 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1511 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1512 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1513 1514 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1515 // another. Merge operands are required to be the same type, but creating an 1516 // extra ptrtoint would be kind of pointless. 1517 auto HighAddr = B.buildConstant( 1518 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1519 B.buildMerge(Dst, {Src, HighAddr}); 1520 MI.eraseFromParent(); 1521 return true; 1522 } 1523 1524 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1525 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1526 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1527 unsigned NullVal = TM.getNullPointerValue(DestAS); 1528 1529 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1530 auto FlatNull = B.buildConstant(SrcTy, 0); 1531 1532 // Extract low 32-bits of the pointer. 1533 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1534 1535 auto CmpRes = 1536 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1537 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1538 1539 MI.eraseFromParent(); 1540 return true; 1541 } 1542 1543 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1544 return false; 1545 1546 if (!ST.hasFlatAddressSpace()) 1547 return false; 1548 1549 auto SegmentNull = 1550 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1551 auto FlatNull = 1552 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1553 1554 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1555 if (!ApertureReg.isValid()) 1556 return false; 1557 1558 auto CmpRes = 1559 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1560 1561 // Coerce the type of the low half of the result so we can use merge_values. 1562 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1563 1564 // TODO: Should we allow mismatched types but matching sizes in merges to 1565 // avoid the ptrtoint? 1566 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1567 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1568 1569 MI.eraseFromParent(); 1570 return true; 1571 } 1572 1573 bool AMDGPULegalizerInfo::legalizeFrint( 1574 MachineInstr &MI, MachineRegisterInfo &MRI, 1575 MachineIRBuilder &B) const { 1576 B.setInstr(MI); 1577 1578 Register Src = MI.getOperand(1).getReg(); 1579 LLT Ty = MRI.getType(Src); 1580 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1581 1582 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1583 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1584 1585 auto C1 = B.buildFConstant(Ty, C1Val); 1586 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1587 1588 // TODO: Should this propagate fast-math-flags? 1589 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1590 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1591 1592 auto C2 = B.buildFConstant(Ty, C2Val); 1593 auto Fabs = B.buildFAbs(Ty, Src); 1594 1595 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1596 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1597 return true; 1598 } 1599 1600 bool AMDGPULegalizerInfo::legalizeFceil( 1601 MachineInstr &MI, MachineRegisterInfo &MRI, 1602 MachineIRBuilder &B) const { 1603 B.setInstr(MI); 1604 1605 const LLT S1 = LLT::scalar(1); 1606 const LLT S64 = LLT::scalar(64); 1607 1608 Register Src = MI.getOperand(1).getReg(); 1609 assert(MRI.getType(Src) == S64); 1610 1611 // result = trunc(src) 1612 // if (src > 0.0 && src != result) 1613 // result += 1.0 1614 1615 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1616 1617 const auto Zero = B.buildFConstant(S64, 0.0); 1618 const auto One = B.buildFConstant(S64, 1.0); 1619 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1620 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1621 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1622 auto Add = B.buildSelect(S64, And, One, Zero); 1623 1624 // TODO: Should this propagate fast-math-flags? 1625 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1626 return true; 1627 } 1628 1629 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1630 MachineIRBuilder &B) { 1631 const unsigned FractBits = 52; 1632 const unsigned ExpBits = 11; 1633 LLT S32 = LLT::scalar(32); 1634 1635 auto Const0 = B.buildConstant(S32, FractBits - 32); 1636 auto Const1 = B.buildConstant(S32, ExpBits); 1637 1638 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1639 .addUse(Const0.getReg(0)) 1640 .addUse(Const1.getReg(0)); 1641 1642 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1643 } 1644 1645 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1646 MachineInstr &MI, MachineRegisterInfo &MRI, 1647 MachineIRBuilder &B) const { 1648 B.setInstr(MI); 1649 1650 const LLT S1 = LLT::scalar(1); 1651 const LLT S32 = LLT::scalar(32); 1652 const LLT S64 = LLT::scalar(64); 1653 1654 Register Src = MI.getOperand(1).getReg(); 1655 assert(MRI.getType(Src) == S64); 1656 1657 // TODO: Should this use extract since the low half is unused? 1658 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1659 Register Hi = Unmerge.getReg(1); 1660 1661 // Extract the upper half, since this is where we will find the sign and 1662 // exponent. 1663 auto Exp = extractF64Exponent(Hi, B); 1664 1665 const unsigned FractBits = 52; 1666 1667 // Extract the sign bit. 1668 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1669 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1670 1671 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1672 1673 const auto Zero32 = B.buildConstant(S32, 0); 1674 1675 // Extend back to 64-bits. 1676 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1677 1678 auto Shr = B.buildAShr(S64, FractMask, Exp); 1679 auto Not = B.buildNot(S64, Shr); 1680 auto Tmp0 = B.buildAnd(S64, Src, Not); 1681 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1682 1683 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1684 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1685 1686 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1687 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1688 return true; 1689 } 1690 1691 bool AMDGPULegalizerInfo::legalizeITOFP( 1692 MachineInstr &MI, MachineRegisterInfo &MRI, 1693 MachineIRBuilder &B, bool Signed) const { 1694 B.setInstr(MI); 1695 1696 Register Dst = MI.getOperand(0).getReg(); 1697 Register Src = MI.getOperand(1).getReg(); 1698 1699 const LLT S64 = LLT::scalar(64); 1700 const LLT S32 = LLT::scalar(32); 1701 1702 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1703 1704 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1705 1706 auto CvtHi = Signed ? 1707 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1708 B.buildUITOFP(S64, Unmerge.getReg(1)); 1709 1710 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1711 1712 auto ThirtyTwo = B.buildConstant(S32, 32); 1713 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1714 .addUse(CvtHi.getReg(0)) 1715 .addUse(ThirtyTwo.getReg(0)); 1716 1717 // TODO: Should this propagate fast-math-flags? 1718 B.buildFAdd(Dst, LdExp, CvtLo); 1719 MI.eraseFromParent(); 1720 return true; 1721 } 1722 1723 // TODO: Copied from DAG implementation. Verify logic and document how this 1724 // actually works. 1725 bool AMDGPULegalizerInfo::legalizeFPTOI( 1726 MachineInstr &MI, MachineRegisterInfo &MRI, 1727 MachineIRBuilder &B, bool Signed) const { 1728 B.setInstr(MI); 1729 1730 Register Dst = MI.getOperand(0).getReg(); 1731 Register Src = MI.getOperand(1).getReg(); 1732 1733 const LLT S64 = LLT::scalar(64); 1734 const LLT S32 = LLT::scalar(32); 1735 1736 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1737 1738 unsigned Flags = MI.getFlags(); 1739 1740 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1741 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1742 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1743 1744 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1745 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1746 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1747 1748 auto Hi = Signed ? 1749 B.buildFPTOSI(S32, FloorMul) : 1750 B.buildFPTOUI(S32, FloorMul); 1751 auto Lo = B.buildFPTOUI(S32, Fma); 1752 1753 B.buildMerge(Dst, { Lo, Hi }); 1754 MI.eraseFromParent(); 1755 1756 return true; 1757 } 1758 1759 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1760 MachineInstr &MI, MachineRegisterInfo &MRI, 1761 MachineIRBuilder &B) const { 1762 MachineFunction &MF = B.getMF(); 1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1764 1765 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1766 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1767 1768 // With ieee_mode disabled, the instructions have the correct behavior 1769 // already for G_FMINNUM/G_FMAXNUM 1770 if (!MFI->getMode().IEEE) 1771 return !IsIEEEOp; 1772 1773 if (IsIEEEOp) 1774 return true; 1775 1776 MachineIRBuilder HelperBuilder(MI); 1777 GISelObserverWrapper DummyObserver; 1778 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1779 HelperBuilder.setInstr(MI); 1780 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1781 } 1782 1783 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1784 MachineInstr &MI, MachineRegisterInfo &MRI, 1785 MachineIRBuilder &B) const { 1786 // TODO: Should move some of this into LegalizerHelper. 1787 1788 // TODO: Promote dynamic indexing of s16 to s32 1789 1790 // FIXME: Artifact combiner probably should have replaced the truncated 1791 // constant before this, so we shouldn't need 1792 // getConstantVRegValWithLookThrough. 1793 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1794 MI.getOperand(2).getReg(), MRI); 1795 if (!IdxVal) // Dynamic case will be selected to register indexing. 1796 return true; 1797 1798 Register Dst = MI.getOperand(0).getReg(); 1799 Register Vec = MI.getOperand(1).getReg(); 1800 1801 LLT VecTy = MRI.getType(Vec); 1802 LLT EltTy = VecTy.getElementType(); 1803 assert(EltTy == MRI.getType(Dst)); 1804 1805 B.setInstr(MI); 1806 1807 if (IdxVal->Value < VecTy.getNumElements()) 1808 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1809 else 1810 B.buildUndef(Dst); 1811 1812 MI.eraseFromParent(); 1813 return true; 1814 } 1815 1816 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1817 MachineInstr &MI, MachineRegisterInfo &MRI, 1818 MachineIRBuilder &B) const { 1819 // TODO: Should move some of this into LegalizerHelper. 1820 1821 // TODO: Promote dynamic indexing of s16 to s32 1822 1823 // FIXME: Artifact combiner probably should have replaced the truncated 1824 // constant before this, so we shouldn't need 1825 // getConstantVRegValWithLookThrough. 1826 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1827 MI.getOperand(3).getReg(), MRI); 1828 if (!IdxVal) // Dynamic case will be selected to register indexing. 1829 return true; 1830 1831 Register Dst = MI.getOperand(0).getReg(); 1832 Register Vec = MI.getOperand(1).getReg(); 1833 Register Ins = MI.getOperand(2).getReg(); 1834 1835 LLT VecTy = MRI.getType(Vec); 1836 LLT EltTy = VecTy.getElementType(); 1837 assert(EltTy == MRI.getType(Ins)); 1838 1839 B.setInstr(MI); 1840 1841 if (IdxVal->Value < VecTy.getNumElements()) 1842 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1843 else 1844 B.buildUndef(Dst); 1845 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 const LLT V2S16 = LLT::vector(2, 16); 1854 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src0 = MI.getOperand(1).getReg(); 1857 LLT DstTy = MRI.getType(Dst); 1858 LLT SrcTy = MRI.getType(Src0); 1859 1860 if (SrcTy == V2S16 && DstTy == V2S16 && 1861 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1862 return true; 1863 1864 MachineIRBuilder HelperBuilder(MI); 1865 GISelObserverWrapper DummyObserver; 1866 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1867 HelperBuilder.setInstr(MI); 1868 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1869 } 1870 1871 bool AMDGPULegalizerInfo::legalizeSinCos( 1872 MachineInstr &MI, MachineRegisterInfo &MRI, 1873 MachineIRBuilder &B) const { 1874 B.setInstr(MI); 1875 1876 Register DstReg = MI.getOperand(0).getReg(); 1877 Register SrcReg = MI.getOperand(1).getReg(); 1878 LLT Ty = MRI.getType(DstReg); 1879 unsigned Flags = MI.getFlags(); 1880 1881 Register TrigVal; 1882 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1883 if (ST.hasTrigReducedRange()) { 1884 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1885 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1886 .addUse(MulVal.getReg(0)) 1887 .setMIFlags(Flags).getReg(0); 1888 } else 1889 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1890 1891 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1892 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1893 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1894 .addUse(TrigVal) 1895 .setMIFlags(Flags); 1896 MI.eraseFromParent(); 1897 return true; 1898 } 1899 1900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1901 Register DstReg, LLT PtrTy, 1902 MachineIRBuilder &B, const GlobalValue *GV, 1903 unsigned Offset, unsigned GAFlags) const { 1904 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1905 // to the following code sequence: 1906 // 1907 // For constant address space: 1908 // s_getpc_b64 s[0:1] 1909 // s_add_u32 s0, s0, $symbol 1910 // s_addc_u32 s1, s1, 0 1911 // 1912 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1913 // a fixup or relocation is emitted to replace $symbol with a literal 1914 // constant, which is a pc-relative offset from the encoding of the $symbol 1915 // operand to the global variable. 1916 // 1917 // For global address space: 1918 // s_getpc_b64 s[0:1] 1919 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1920 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1921 // 1922 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1923 // fixups or relocations are emitted to replace $symbol@*@lo and 1924 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1925 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1926 // operand to the global variable. 1927 // 1928 // What we want here is an offset from the value returned by s_getpc 1929 // (which is the address of the s_add_u32 instruction) to the global 1930 // variable, but since the encoding of $symbol starts 4 bytes after the start 1931 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1932 // small. This requires us to add 4 to the global variable offset in order to 1933 // compute the correct address. 1934 1935 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1936 1937 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1938 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1939 1940 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1941 .addDef(PCReg); 1942 1943 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1944 if (GAFlags == SIInstrInfo::MO_NONE) 1945 MIB.addImm(0); 1946 else 1947 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1948 1949 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1950 1951 if (PtrTy.getSizeInBits() == 32) 1952 B.buildExtract(DstReg, PCReg, 0); 1953 return true; 1954 } 1955 1956 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1957 MachineInstr &MI, MachineRegisterInfo &MRI, 1958 MachineIRBuilder &B) const { 1959 Register DstReg = MI.getOperand(0).getReg(); 1960 LLT Ty = MRI.getType(DstReg); 1961 unsigned AS = Ty.getAddressSpace(); 1962 1963 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1964 MachineFunction &MF = B.getMF(); 1965 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1966 B.setInstr(MI); 1967 1968 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1969 if (!MFI->isEntryFunction()) { 1970 const Function &Fn = MF.getFunction(); 1971 DiagnosticInfoUnsupported BadLDSDecl( 1972 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1973 DS_Warning); 1974 Fn.getContext().diagnose(BadLDSDecl); 1975 1976 // We currently don't have a way to correctly allocate LDS objects that 1977 // aren't directly associated with a kernel. We do force inlining of 1978 // functions that use local objects. However, if these dead functions are 1979 // not eliminated, we don't want a compile time error. Just emit a warning 1980 // and a trap, since there should be no callable path here. 1981 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1982 B.buildUndef(DstReg); 1983 MI.eraseFromParent(); 1984 return true; 1985 } 1986 1987 // TODO: We could emit code to handle the initialization somewhere. 1988 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1989 const SITargetLowering *TLI = ST.getTargetLowering(); 1990 if (!TLI->shouldUseLDSConstAddress(GV)) { 1991 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1992 return true; // Leave in place; 1993 } 1994 1995 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1996 MI.eraseFromParent(); 1997 return true; 1998 } 1999 2000 const Function &Fn = MF.getFunction(); 2001 DiagnosticInfoUnsupported BadInit( 2002 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2003 Fn.getContext().diagnose(BadInit); 2004 return true; 2005 } 2006 2007 const SITargetLowering *TLI = ST.getTargetLowering(); 2008 2009 if (TLI->shouldEmitFixup(GV)) { 2010 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2011 MI.eraseFromParent(); 2012 return true; 2013 } 2014 2015 if (TLI->shouldEmitPCReloc(GV)) { 2016 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2017 MI.eraseFromParent(); 2018 return true; 2019 } 2020 2021 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2022 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2023 2024 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2025 MachinePointerInfo::getGOT(MF), 2026 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2027 MachineMemOperand::MOInvariant, 2028 8 /*Size*/, 8 /*Align*/); 2029 2030 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2031 2032 if (Ty.getSizeInBits() == 32) { 2033 // Truncate if this is a 32-bit constant adrdess. 2034 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2035 B.buildExtract(DstReg, Load, 0); 2036 } else 2037 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2038 2039 MI.eraseFromParent(); 2040 return true; 2041 } 2042 2043 bool AMDGPULegalizerInfo::legalizeLoad( 2044 MachineInstr &MI, MachineRegisterInfo &MRI, 2045 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2046 B.setInstr(MI); 2047 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2048 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2049 Observer.changingInstr(MI); 2050 MI.getOperand(1).setReg(Cast.getReg(0)); 2051 Observer.changedInstr(MI); 2052 return true; 2053 } 2054 2055 bool AMDGPULegalizerInfo::legalizeFMad( 2056 MachineInstr &MI, MachineRegisterInfo &MRI, 2057 MachineIRBuilder &B) const { 2058 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2059 assert(Ty.isScalar()); 2060 2061 MachineFunction &MF = B.getMF(); 2062 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2063 2064 // TODO: Always legal with future ftz flag. 2065 // FIXME: Do we need just output? 2066 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2067 return true; 2068 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2069 return true; 2070 2071 MachineIRBuilder HelperBuilder(MI); 2072 GISelObserverWrapper DummyObserver; 2073 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2074 HelperBuilder.setMBB(*MI.getParent()); 2075 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2076 } 2077 2078 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2079 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2080 Register DstReg = MI.getOperand(0).getReg(); 2081 Register PtrReg = MI.getOperand(1).getReg(); 2082 Register CmpVal = MI.getOperand(2).getReg(); 2083 Register NewVal = MI.getOperand(3).getReg(); 2084 2085 assert(SITargetLowering::isFlatGlobalAddrSpace( 2086 MRI.getType(PtrReg).getAddressSpace()) && 2087 "this should not have been custom lowered"); 2088 2089 LLT ValTy = MRI.getType(CmpVal); 2090 LLT VecTy = LLT::vector(2, ValTy); 2091 2092 B.setInstr(MI); 2093 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2094 2095 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2096 .addDef(DstReg) 2097 .addUse(PtrReg) 2098 .addUse(PackedVal) 2099 .setMemRefs(MI.memoperands()); 2100 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 bool AMDGPULegalizerInfo::legalizeFlog( 2106 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2107 Register Dst = MI.getOperand(0).getReg(); 2108 Register Src = MI.getOperand(1).getReg(); 2109 LLT Ty = B.getMRI()->getType(Dst); 2110 unsigned Flags = MI.getFlags(); 2111 B.setInstr(MI); 2112 2113 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2114 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2115 2116 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2117 MI.eraseFromParent(); 2118 return true; 2119 } 2120 2121 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2122 MachineIRBuilder &B) const { 2123 Register Dst = MI.getOperand(0).getReg(); 2124 Register Src = MI.getOperand(1).getReg(); 2125 unsigned Flags = MI.getFlags(); 2126 LLT Ty = B.getMRI()->getType(Dst); 2127 B.setInstr(MI); 2128 2129 auto K = B.buildFConstant(Ty, numbers::log2e); 2130 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2131 B.buildFExp2(Dst, Mul, Flags); 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2137 MachineIRBuilder &B) const { 2138 Register Dst = MI.getOperand(0).getReg(); 2139 Register Src0 = MI.getOperand(1).getReg(); 2140 Register Src1 = MI.getOperand(2).getReg(); 2141 unsigned Flags = MI.getFlags(); 2142 LLT Ty = B.getMRI()->getType(Dst); 2143 B.setInstr(MI); 2144 const LLT S16 = LLT::scalar(16); 2145 const LLT S32 = LLT::scalar(32); 2146 2147 if (Ty == S32) { 2148 auto Log = B.buildFLog2(S32, Src0, Flags); 2149 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2150 .addUse(Log.getReg(0)) 2151 .addUse(Src1) 2152 .setMIFlags(Flags); 2153 B.buildFExp2(Dst, Mul, Flags); 2154 } else if (Ty == S16) { 2155 // There's no f16 fmul_legacy, so we need to convert for it. 2156 auto Log = B.buildFLog2(S16, Src0, Flags); 2157 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2158 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2159 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2160 .addUse(Ext0.getReg(0)) 2161 .addUse(Ext1.getReg(0)) 2162 .setMIFlags(Flags); 2163 2164 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2165 } else 2166 return false; 2167 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 // Find a source register, ignoring any possible source modifiers. 2173 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2174 Register ModSrc = OrigSrc; 2175 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2176 ModSrc = SrcFNeg->getOperand(1).getReg(); 2177 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2178 ModSrc = SrcFAbs->getOperand(1).getReg(); 2179 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2180 ModSrc = SrcFAbs->getOperand(1).getReg(); 2181 return ModSrc; 2182 } 2183 2184 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2185 MachineRegisterInfo &MRI, 2186 MachineIRBuilder &B) const { 2187 B.setInstr(MI); 2188 2189 const LLT S1 = LLT::scalar(1); 2190 const LLT S64 = LLT::scalar(64); 2191 Register Dst = MI.getOperand(0).getReg(); 2192 Register OrigSrc = MI.getOperand(1).getReg(); 2193 unsigned Flags = MI.getFlags(); 2194 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2195 "this should not have been custom lowered"); 2196 2197 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2198 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2199 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2200 // V_FRACT bug is: 2201 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2202 // 2203 // Convert floor(x) to (x - fract(x)) 2204 2205 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2206 .addUse(OrigSrc) 2207 .setMIFlags(Flags); 2208 2209 // Give source modifier matching some assistance before obscuring a foldable 2210 // pattern. 2211 2212 // TODO: We can avoid the neg on the fract? The input sign to fract 2213 // shouldn't matter? 2214 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2215 2216 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2217 2218 Register Min = MRI.createGenericVirtualRegister(S64); 2219 2220 // We don't need to concern ourselves with the snan handling difference, so 2221 // use the one which will directly select. 2222 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2223 if (MFI->getMode().IEEE) 2224 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2225 else 2226 B.buildFMinNum(Min, Fract, Const, Flags); 2227 2228 Register CorrectedFract = Min; 2229 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2230 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2231 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2232 } 2233 2234 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2235 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2236 2237 MI.eraseFromParent(); 2238 return true; 2239 } 2240 2241 // Turn an illegal packed v2s16 build vector into bit operations. 2242 // TODO: This should probably be a bitcast action in LegalizerHelper. 2243 bool AMDGPULegalizerInfo::legalizeBuildVector( 2244 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2245 Register Dst = MI.getOperand(0).getReg(); 2246 const LLT S32 = LLT::scalar(32); 2247 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2248 2249 Register Src0 = MI.getOperand(1).getReg(); 2250 Register Src1 = MI.getOperand(2).getReg(); 2251 assert(MRI.getType(Src0) == LLT::scalar(16)); 2252 2253 B.setInstr(MI); 2254 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2255 B.buildBitcast(Dst, Merge); 2256 2257 MI.eraseFromParent(); 2258 return true; 2259 } 2260 2261 // Return the use branch instruction, otherwise null if the usage is invalid. 2262 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2263 MachineRegisterInfo &MRI, 2264 MachineInstr *&Br) { 2265 Register CondDef = MI.getOperand(0).getReg(); 2266 if (!MRI.hasOneNonDBGUse(CondDef)) 2267 return nullptr; 2268 2269 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2270 if (UseMI.getParent() != MI.getParent() || 2271 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2272 return nullptr; 2273 2274 // Make sure the cond br is followed by a G_BR 2275 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2276 if (Next != MI.getParent()->end()) { 2277 if (Next->getOpcode() != AMDGPU::G_BR) 2278 return nullptr; 2279 Br = &*Next; 2280 } 2281 2282 return &UseMI; 2283 } 2284 2285 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2286 MachineRegisterInfo &MRI, 2287 Register LiveIn, 2288 Register PhyReg) const { 2289 assert(PhyReg.isPhysical() && "Physical register expected"); 2290 2291 // Insert the live-in copy, if required, by defining destination virtual 2292 // register. 2293 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2294 if (!MRI.getVRegDef(LiveIn)) { 2295 // FIXME: Should have scoped insert pt 2296 MachineBasicBlock &OrigInsBB = B.getMBB(); 2297 auto OrigInsPt = B.getInsertPt(); 2298 2299 MachineBasicBlock &EntryMBB = B.getMF().front(); 2300 EntryMBB.addLiveIn(PhyReg); 2301 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2302 B.buildCopy(LiveIn, PhyReg); 2303 2304 B.setInsertPt(OrigInsBB, OrigInsPt); 2305 } 2306 2307 return LiveIn; 2308 } 2309 2310 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2311 MachineRegisterInfo &MRI, 2312 Register PhyReg, LLT Ty, 2313 bool InsertLiveInCopy) const { 2314 assert(PhyReg.isPhysical() && "Physical register expected"); 2315 2316 // Get or create virtual live-in regester 2317 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2318 if (!LiveIn) { 2319 LiveIn = MRI.createGenericVirtualRegister(Ty); 2320 MRI.addLiveIn(PhyReg, LiveIn); 2321 } 2322 2323 // When the actual true copy required is from virtual register to physical 2324 // register (to be inserted later), live-in copy insertion from physical 2325 // to register virtual register is not required 2326 if (!InsertLiveInCopy) 2327 return LiveIn; 2328 2329 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2330 } 2331 2332 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2333 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2334 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2335 const ArgDescriptor *Arg; 2336 const TargetRegisterClass *RC; 2337 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2338 if (!Arg) { 2339 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2340 return nullptr; 2341 } 2342 return Arg; 2343 } 2344 2345 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2346 const ArgDescriptor *Arg) const { 2347 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2348 return false; // TODO: Handle these 2349 2350 Register SrcReg = Arg->getRegister(); 2351 assert(SrcReg.isPhysical() && "Physical register expected"); 2352 assert(DstReg.isVirtual() && "Virtual register expected"); 2353 2354 MachineRegisterInfo &MRI = *B.getMRI(); 2355 2356 LLT Ty = MRI.getType(DstReg); 2357 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2358 2359 if (Arg->isMasked()) { 2360 // TODO: Should we try to emit this once in the entry block? 2361 const LLT S32 = LLT::scalar(32); 2362 const unsigned Mask = Arg->getMask(); 2363 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2364 2365 Register AndMaskSrc = LiveIn; 2366 2367 if (Shift != 0) { 2368 auto ShiftAmt = B.buildConstant(S32, Shift); 2369 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2370 } 2371 2372 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2373 } else { 2374 B.buildCopy(DstReg, LiveIn); 2375 } 2376 2377 return true; 2378 } 2379 2380 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2381 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2382 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2383 B.setInstr(MI); 2384 2385 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2386 if (!Arg) 2387 return false; 2388 2389 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2390 return false; 2391 2392 MI.eraseFromParent(); 2393 return true; 2394 } 2395 2396 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2397 MachineRegisterInfo &MRI, 2398 MachineIRBuilder &B) const { 2399 B.setInstr(MI); 2400 Register Dst = MI.getOperand(0).getReg(); 2401 LLT DstTy = MRI.getType(Dst); 2402 LLT S16 = LLT::scalar(16); 2403 LLT S32 = LLT::scalar(32); 2404 LLT S64 = LLT::scalar(64); 2405 2406 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2407 return true; 2408 2409 if (DstTy == S16) 2410 return legalizeFDIV16(MI, MRI, B); 2411 if (DstTy == S32) 2412 return legalizeFDIV32(MI, MRI, B); 2413 if (DstTy == S64) 2414 return legalizeFDIV64(MI, MRI, B); 2415 2416 return false; 2417 } 2418 2419 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2420 const LLT S32 = LLT::scalar(32); 2421 2422 auto Cvt0 = B.buildUITOFP(S32, Src); 2423 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2424 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2425 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2426 return B.buildFPTOUI(S32, Mul).getReg(0); 2427 } 2428 2429 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2430 Register DstReg, 2431 Register Num, 2432 Register Den, 2433 bool IsRem) const { 2434 const LLT S1 = LLT::scalar(1); 2435 const LLT S32 = LLT::scalar(32); 2436 2437 // RCP = URECIP(Den) = 2^32 / Den + e 2438 // e is rounding error. 2439 auto RCP = buildDivRCP(B, Den); 2440 2441 // RCP_LO = mul(RCP, Den) 2442 auto RCP_LO = B.buildMul(S32, RCP, Den); 2443 2444 // RCP_HI = mulhu (RCP, Den) */ 2445 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2446 2447 // NEG_RCP_LO = -RCP_LO 2448 auto Zero = B.buildConstant(S32, 0); 2449 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2450 2451 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2452 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2453 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2454 2455 // Calculate the rounding error from the URECIP instruction 2456 // E = mulhu(ABS_RCP_LO, RCP) 2457 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2458 2459 // RCP_A_E = RCP + E 2460 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2461 2462 // RCP_S_E = RCP - E 2463 auto RCP_S_E = B.buildSub(S32, RCP, E); 2464 2465 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 2466 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2467 2468 // Quotient = mulhu(Tmp0, Num)stmp 2469 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2470 2471 // Num_S_Remainder = Quotient * Den 2472 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2473 2474 // Remainder = Num - Num_S_Remainder 2475 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2476 2477 // Remainder_GE_Den = Remainder >= Den 2478 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2479 2480 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2481 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2482 Num, Num_S_Remainder); 2483 2484 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2485 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2486 2487 // Calculate Division result: 2488 2489 // Quotient_A_One = Quotient + 1 2490 auto One = B.buildConstant(S32, 1); 2491 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2492 2493 // Quotient_S_One = Quotient - 1 2494 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2495 2496 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2497 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2498 2499 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2500 if (IsRem) { 2501 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2502 2503 // Calculate Rem result: 2504 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2505 2506 // Remainder_A_Den = Remainder + Den 2507 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2508 2509 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2510 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2511 2512 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) 2513 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2514 } else { 2515 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2516 } 2517 } 2518 2519 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2520 MachineRegisterInfo &MRI, 2521 MachineIRBuilder &B) const { 2522 B.setInstr(MI); 2523 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2524 Register DstReg = MI.getOperand(0).getReg(); 2525 Register Num = MI.getOperand(1).getReg(); 2526 Register Den = MI.getOperand(2).getReg(); 2527 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2528 MI.eraseFromParent(); 2529 return true; 2530 } 2531 2532 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2533 MachineRegisterInfo &MRI, 2534 MachineIRBuilder &B) const { 2535 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2536 return legalizeUDIV_UREM32(MI, MRI, B); 2537 return false; 2538 } 2539 2540 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2541 MachineRegisterInfo &MRI, 2542 MachineIRBuilder &B) const { 2543 B.setInstr(MI); 2544 const LLT S32 = LLT::scalar(32); 2545 2546 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2547 Register DstReg = MI.getOperand(0).getReg(); 2548 Register LHS = MI.getOperand(1).getReg(); 2549 Register RHS = MI.getOperand(2).getReg(); 2550 2551 auto ThirtyOne = B.buildConstant(S32, 31); 2552 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2553 auto RHSign = B.buildAShr(S32, LHS, ThirtyOne); 2554 2555 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2556 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2557 2558 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2559 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2560 2561 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2562 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2563 2564 if (IsRem) { 2565 auto RSign = LHSign; // Remainder sign is the same as LHS 2566 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2567 B.buildSub(DstReg, UDivRem, RSign); 2568 } else { 2569 auto DSign = B.buildXor(S32, LHSign, RHSign); 2570 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2571 B.buildSub(DstReg, UDivRem, DSign); 2572 } 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2579 MachineRegisterInfo &MRI, 2580 MachineIRBuilder &B) const { 2581 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2582 return legalizeSDIV_SREM32(MI, MRI, B); 2583 return false; 2584 } 2585 2586 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2587 MachineRegisterInfo &MRI, 2588 MachineIRBuilder &B) const { 2589 Register Res = MI.getOperand(0).getReg(); 2590 Register LHS = MI.getOperand(1).getReg(); 2591 Register RHS = MI.getOperand(2).getReg(); 2592 2593 uint16_t Flags = MI.getFlags(); 2594 2595 LLT ResTy = MRI.getType(Res); 2596 LLT S32 = LLT::scalar(32); 2597 LLT S64 = LLT::scalar(64); 2598 2599 const MachineFunction &MF = B.getMF(); 2600 bool Unsafe = 2601 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2602 2603 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2604 return false; 2605 2606 if (!Unsafe && ResTy == S32 && 2607 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2608 return false; 2609 2610 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2611 // 1 / x -> RCP(x) 2612 if (CLHS->isExactlyValue(1.0)) { 2613 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2614 .addUse(RHS) 2615 .setMIFlags(Flags); 2616 2617 MI.eraseFromParent(); 2618 return true; 2619 } 2620 2621 // -1 / x -> RCP( FNEG(x) ) 2622 if (CLHS->isExactlyValue(-1.0)) { 2623 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2624 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2625 .addUse(FNeg.getReg(0)) 2626 .setMIFlags(Flags); 2627 2628 MI.eraseFromParent(); 2629 return true; 2630 } 2631 } 2632 2633 // x / y -> x * (1.0 / y) 2634 if (Unsafe) { 2635 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2636 .addUse(RHS) 2637 .setMIFlags(Flags); 2638 B.buildFMul(Res, LHS, RCP, Flags); 2639 2640 MI.eraseFromParent(); 2641 return true; 2642 } 2643 2644 return false; 2645 } 2646 2647 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2648 MachineRegisterInfo &MRI, 2649 MachineIRBuilder &B) const { 2650 B.setInstr(MI); 2651 Register Res = MI.getOperand(0).getReg(); 2652 Register LHS = MI.getOperand(1).getReg(); 2653 Register RHS = MI.getOperand(2).getReg(); 2654 2655 uint16_t Flags = MI.getFlags(); 2656 2657 LLT S16 = LLT::scalar(16); 2658 LLT S32 = LLT::scalar(32); 2659 2660 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2661 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2662 2663 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2664 .addUse(RHSExt.getReg(0)) 2665 .setMIFlags(Flags); 2666 2667 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2668 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2669 2670 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2671 .addUse(RDst.getReg(0)) 2672 .addUse(RHS) 2673 .addUse(LHS) 2674 .setMIFlags(Flags); 2675 2676 MI.eraseFromParent(); 2677 return true; 2678 } 2679 2680 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2681 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2682 static void toggleSPDenormMode(bool Enable, 2683 MachineIRBuilder &B, 2684 const GCNSubtarget &ST, 2685 AMDGPU::SIModeRegisterDefaults Mode) { 2686 // Set SP denorm mode to this value. 2687 unsigned SPDenormMode = 2688 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2689 2690 if (ST.hasDenormModeInst()) { 2691 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2692 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2693 2694 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2695 B.buildInstr(AMDGPU::S_DENORM_MODE) 2696 .addImm(NewDenormModeValue); 2697 2698 } else { 2699 // Select FP32 bit field in mode register. 2700 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2701 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2702 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2703 2704 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2705 .addImm(SPDenormMode) 2706 .addImm(SPDenormModeBitField); 2707 } 2708 } 2709 2710 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2711 MachineRegisterInfo &MRI, 2712 MachineIRBuilder &B) const { 2713 B.setInstr(MI); 2714 Register Res = MI.getOperand(0).getReg(); 2715 Register LHS = MI.getOperand(1).getReg(); 2716 Register RHS = MI.getOperand(2).getReg(); 2717 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2718 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2719 2720 uint16_t Flags = MI.getFlags(); 2721 2722 LLT S32 = LLT::scalar(32); 2723 LLT S1 = LLT::scalar(1); 2724 2725 auto One = B.buildFConstant(S32, 1.0f); 2726 2727 auto DenominatorScaled = 2728 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2729 .addUse(RHS) 2730 .addUse(LHS) 2731 .addImm(1) 2732 .setMIFlags(Flags); 2733 auto NumeratorScaled = 2734 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2735 .addUse(LHS) 2736 .addUse(RHS) 2737 .addImm(0) 2738 .setMIFlags(Flags); 2739 2740 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2741 .addUse(DenominatorScaled.getReg(0)) 2742 .setMIFlags(Flags); 2743 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2744 2745 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2746 // aren't modeled as reading it. 2747 if (!Mode.allFP32Denormals()) 2748 toggleSPDenormMode(true, B, ST, Mode); 2749 2750 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2751 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2752 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2753 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2754 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2755 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2756 2757 if (!Mode.allFP32Denormals()) 2758 toggleSPDenormMode(false, B, ST, Mode); 2759 2760 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2761 .addUse(Fma4.getReg(0)) 2762 .addUse(Fma1.getReg(0)) 2763 .addUse(Fma3.getReg(0)) 2764 .addUse(NumeratorScaled.getReg(1)) 2765 .setMIFlags(Flags); 2766 2767 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2768 .addUse(Fmas.getReg(0)) 2769 .addUse(RHS) 2770 .addUse(LHS) 2771 .setMIFlags(Flags); 2772 2773 MI.eraseFromParent(); 2774 return true; 2775 } 2776 2777 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2778 MachineRegisterInfo &MRI, 2779 MachineIRBuilder &B) const { 2780 B.setInstr(MI); 2781 Register Res = MI.getOperand(0).getReg(); 2782 Register LHS = MI.getOperand(1).getReg(); 2783 Register RHS = MI.getOperand(2).getReg(); 2784 2785 uint16_t Flags = MI.getFlags(); 2786 2787 LLT S64 = LLT::scalar(64); 2788 LLT S1 = LLT::scalar(1); 2789 2790 auto One = B.buildFConstant(S64, 1.0); 2791 2792 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2793 .addUse(LHS) 2794 .addUse(RHS) 2795 .addImm(1) 2796 .setMIFlags(Flags); 2797 2798 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2799 2800 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2801 .addUse(DivScale0.getReg(0)) 2802 .setMIFlags(Flags); 2803 2804 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2805 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2806 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2807 2808 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2809 .addUse(LHS) 2810 .addUse(RHS) 2811 .addImm(0) 2812 .setMIFlags(Flags); 2813 2814 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2815 auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); 2816 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2817 2818 Register Scale; 2819 if (!ST.hasUsableDivScaleConditionOutput()) { 2820 // Workaround a hardware bug on SI where the condition output from div_scale 2821 // is not usable. 2822 2823 LLT S32 = LLT::scalar(32); 2824 2825 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2826 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2827 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2828 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2829 2830 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2831 Scale1Unmerge.getReg(1)); 2832 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2833 Scale0Unmerge.getReg(1)); 2834 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2835 } else { 2836 Scale = DivScale1.getReg(1); 2837 } 2838 2839 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2840 .addUse(Fma4.getReg(0)) 2841 .addUse(Fma3.getReg(0)) 2842 .addUse(Mul.getReg(0)) 2843 .addUse(Scale) 2844 .setMIFlags(Flags); 2845 2846 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2847 .addUse(Fmas.getReg(0)) 2848 .addUse(RHS) 2849 .addUse(LHS) 2850 .setMIFlags(Flags); 2851 2852 MI.eraseFromParent(); 2853 return true; 2854 } 2855 2856 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2857 MachineRegisterInfo &MRI, 2858 MachineIRBuilder &B) const { 2859 B.setInstr(MI); 2860 Register Res = MI.getOperand(0).getReg(); 2861 Register LHS = MI.getOperand(2).getReg(); 2862 Register RHS = MI.getOperand(3).getReg(); 2863 uint16_t Flags = MI.getFlags(); 2864 2865 LLT S32 = LLT::scalar(32); 2866 LLT S1 = LLT::scalar(1); 2867 2868 auto Abs = B.buildFAbs(S32, RHS, Flags); 2869 const APFloat C0Val(1.0f); 2870 2871 auto C0 = B.buildConstant(S32, 0x6f800000); 2872 auto C1 = B.buildConstant(S32, 0x2f800000); 2873 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2874 2875 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2876 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2877 2878 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2879 2880 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2881 .addUse(Mul0.getReg(0)) 2882 .setMIFlags(Flags); 2883 2884 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2885 2886 B.buildFMul(Res, Sel, Mul1, Flags); 2887 2888 MI.eraseFromParent(); 2889 return true; 2890 } 2891 2892 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2893 MachineRegisterInfo &MRI, 2894 MachineIRBuilder &B) const { 2895 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2896 if (!MFI->isEntryFunction()) { 2897 return legalizePreloadedArgIntrin(MI, MRI, B, 2898 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2899 } 2900 2901 B.setInstr(MI); 2902 2903 uint64_t Offset = 2904 ST.getTargetLowering()->getImplicitParameterOffset( 2905 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2906 Register DstReg = MI.getOperand(0).getReg(); 2907 LLT DstTy = MRI.getType(DstReg); 2908 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2909 2910 const ArgDescriptor *Arg; 2911 const TargetRegisterClass *RC; 2912 std::tie(Arg, RC) 2913 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2914 if (!Arg) 2915 return false; 2916 2917 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2918 if (!loadInputValue(KernargPtrReg, B, Arg)) 2919 return false; 2920 2921 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2922 MI.eraseFromParent(); 2923 return true; 2924 } 2925 2926 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2927 MachineRegisterInfo &MRI, 2928 MachineIRBuilder &B, 2929 unsigned AddrSpace) const { 2930 B.setInstr(MI); 2931 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2932 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2933 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2934 MI.eraseFromParent(); 2935 return true; 2936 } 2937 2938 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2939 // offset (the offset that is included in bounds checking and swizzling, to be 2940 // split between the instruction's voffset and immoffset fields) and soffset 2941 // (the offset that is excluded from bounds checking and swizzling, to go in 2942 // the instruction's soffset field). This function takes the first kind of 2943 // offset and figures out how to split it between voffset and immoffset. 2944 std::tuple<Register, unsigned, unsigned> 2945 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2946 Register OrigOffset) const { 2947 const unsigned MaxImm = 4095; 2948 Register BaseReg; 2949 unsigned TotalConstOffset; 2950 MachineInstr *OffsetDef; 2951 const LLT S32 = LLT::scalar(32); 2952 2953 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2954 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2955 2956 unsigned ImmOffset = TotalConstOffset; 2957 2958 // If the immediate value is too big for the immoffset field, put the value 2959 // and -4096 into the immoffset field so that the value that is copied/added 2960 // for the voffset field is a multiple of 4096, and it stands more chance 2961 // of being CSEd with the copy/add for another similar load/store. 2962 // However, do not do that rounding down to a multiple of 4096 if that is a 2963 // negative number, as it appears to be illegal to have a negative offset 2964 // in the vgpr, even if adding the immediate offset makes it positive. 2965 unsigned Overflow = ImmOffset & ~MaxImm; 2966 ImmOffset -= Overflow; 2967 if ((int32_t)Overflow < 0) { 2968 Overflow += ImmOffset; 2969 ImmOffset = 0; 2970 } 2971 2972 if (Overflow != 0) { 2973 if (!BaseReg) { 2974 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2975 } else { 2976 auto OverflowVal = B.buildConstant(S32, Overflow); 2977 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2978 } 2979 } 2980 2981 if (!BaseReg) 2982 BaseReg = B.buildConstant(S32, 0).getReg(0); 2983 2984 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2985 } 2986 2987 /// Handle register layout difference for f16 images for some subtargets. 2988 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2989 MachineRegisterInfo &MRI, 2990 Register Reg) const { 2991 if (!ST.hasUnpackedD16VMem()) 2992 return Reg; 2993 2994 const LLT S16 = LLT::scalar(16); 2995 const LLT S32 = LLT::scalar(32); 2996 LLT StoreVT = MRI.getType(Reg); 2997 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2998 2999 auto Unmerge = B.buildUnmerge(S16, Reg); 3000 3001 SmallVector<Register, 4> WideRegs; 3002 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3003 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3004 3005 int NumElts = StoreVT.getNumElements(); 3006 3007 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3008 } 3009 3010 Register AMDGPULegalizerInfo::fixStoreSourceType( 3011 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3012 MachineRegisterInfo *MRI = B.getMRI(); 3013 LLT Ty = MRI->getType(VData); 3014 3015 const LLT S16 = LLT::scalar(16); 3016 3017 // Fixup illegal register types for i8 stores. 3018 if (Ty == LLT::scalar(8) || Ty == S16) { 3019 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3020 return AnyExt; 3021 } 3022 3023 if (Ty.isVector()) { 3024 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3025 if (IsFormat) 3026 return handleD16VData(B, *MRI, VData); 3027 } 3028 } 3029 3030 return VData; 3031 } 3032 3033 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3034 MachineRegisterInfo &MRI, 3035 MachineIRBuilder &B, 3036 bool IsTyped, 3037 bool IsFormat) const { 3038 B.setInstr(MI); 3039 3040 Register VData = MI.getOperand(1).getReg(); 3041 LLT Ty = MRI.getType(VData); 3042 LLT EltTy = Ty.getScalarType(); 3043 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3044 const LLT S32 = LLT::scalar(32); 3045 3046 VData = fixStoreSourceType(B, VData, IsFormat); 3047 Register RSrc = MI.getOperand(2).getReg(); 3048 3049 MachineMemOperand *MMO = *MI.memoperands_begin(); 3050 const int MemSize = MMO->getSize(); 3051 3052 unsigned ImmOffset; 3053 unsigned TotalOffset; 3054 3055 // The typed intrinsics add an immediate after the registers. 3056 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3057 3058 // The struct intrinsic variants add one additional operand over raw. 3059 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3060 Register VIndex; 3061 int OpOffset = 0; 3062 if (HasVIndex) { 3063 VIndex = MI.getOperand(3).getReg(); 3064 OpOffset = 1; 3065 } 3066 3067 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3068 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3069 3070 unsigned Format = 0; 3071 if (IsTyped) { 3072 Format = MI.getOperand(5 + OpOffset).getImm(); 3073 ++OpOffset; 3074 } 3075 3076 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3077 3078 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3079 if (TotalOffset != 0) 3080 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3081 3082 unsigned Opc; 3083 if (IsTyped) { 3084 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3085 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3086 } else if (IsFormat) { 3087 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3088 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3089 } else { 3090 switch (MemSize) { 3091 case 1: 3092 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3093 break; 3094 case 2: 3095 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3096 break; 3097 default: 3098 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3099 break; 3100 } 3101 } 3102 3103 if (!VIndex) 3104 VIndex = B.buildConstant(S32, 0).getReg(0); 3105 3106 auto MIB = B.buildInstr(Opc) 3107 .addUse(VData) // vdata 3108 .addUse(RSrc) // rsrc 3109 .addUse(VIndex) // vindex 3110 .addUse(VOffset) // voffset 3111 .addUse(SOffset) // soffset 3112 .addImm(ImmOffset); // offset(imm) 3113 3114 if (IsTyped) 3115 MIB.addImm(Format); 3116 3117 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3118 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3119 .addMemOperand(MMO); 3120 3121 MI.eraseFromParent(); 3122 return true; 3123 } 3124 3125 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3126 MachineRegisterInfo &MRI, 3127 MachineIRBuilder &B, 3128 bool IsFormat, 3129 bool IsTyped) const { 3130 B.setInstr(MI); 3131 3132 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3133 MachineMemOperand *MMO = *MI.memoperands_begin(); 3134 const int MemSize = MMO->getSize(); 3135 const LLT S32 = LLT::scalar(32); 3136 3137 Register Dst = MI.getOperand(0).getReg(); 3138 Register RSrc = MI.getOperand(2).getReg(); 3139 3140 // The typed intrinsics add an immediate after the registers. 3141 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3142 3143 // The struct intrinsic variants add one additional operand over raw. 3144 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3145 Register VIndex; 3146 int OpOffset = 0; 3147 if (HasVIndex) { 3148 VIndex = MI.getOperand(3).getReg(); 3149 OpOffset = 1; 3150 } 3151 3152 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3153 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3154 3155 unsigned Format = 0; 3156 if (IsTyped) { 3157 Format = MI.getOperand(5 + OpOffset).getImm(); 3158 ++OpOffset; 3159 } 3160 3161 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3162 unsigned ImmOffset; 3163 unsigned TotalOffset; 3164 3165 LLT Ty = MRI.getType(Dst); 3166 LLT EltTy = Ty.getScalarType(); 3167 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3168 const bool Unpacked = ST.hasUnpackedD16VMem(); 3169 3170 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3171 if (TotalOffset != 0) 3172 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3173 3174 unsigned Opc; 3175 3176 if (IsTyped) { 3177 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3178 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3179 } else if (IsFormat) { 3180 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3181 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3182 } else { 3183 switch (MemSize) { 3184 case 1: 3185 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3186 break; 3187 case 2: 3188 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3189 break; 3190 default: 3191 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3192 break; 3193 } 3194 } 3195 3196 Register LoadDstReg; 3197 3198 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3199 LLT UnpackedTy = Ty.changeElementSize(32); 3200 3201 if (IsExtLoad) 3202 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3203 else if (Unpacked && IsD16 && Ty.isVector()) 3204 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3205 else 3206 LoadDstReg = Dst; 3207 3208 if (!VIndex) 3209 VIndex = B.buildConstant(S32, 0).getReg(0); 3210 3211 auto MIB = B.buildInstr(Opc) 3212 .addDef(LoadDstReg) // vdata 3213 .addUse(RSrc) // rsrc 3214 .addUse(VIndex) // vindex 3215 .addUse(VOffset) // voffset 3216 .addUse(SOffset) // soffset 3217 .addImm(ImmOffset); // offset(imm) 3218 3219 if (IsTyped) 3220 MIB.addImm(Format); 3221 3222 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3223 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3224 .addMemOperand(MMO); 3225 3226 if (LoadDstReg != Dst) { 3227 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3228 3229 // Widen result for extending loads was widened. 3230 if (IsExtLoad) 3231 B.buildTrunc(Dst, LoadDstReg); 3232 else { 3233 // Repack to original 16-bit vector result 3234 // FIXME: G_TRUNC should work, but legalization currently fails 3235 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3236 SmallVector<Register, 4> Repack; 3237 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3238 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3239 B.buildMerge(Dst, Repack); 3240 } 3241 } 3242 3243 MI.eraseFromParent(); 3244 return true; 3245 } 3246 3247 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3248 MachineIRBuilder &B, 3249 bool IsInc) const { 3250 B.setInstr(MI); 3251 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3252 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3253 B.buildInstr(Opc) 3254 .addDef(MI.getOperand(0).getReg()) 3255 .addUse(MI.getOperand(2).getReg()) 3256 .addUse(MI.getOperand(3).getReg()) 3257 .cloneMemRefs(MI); 3258 MI.eraseFromParent(); 3259 return true; 3260 } 3261 3262 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3263 switch (IntrID) { 3264 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3265 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3266 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3267 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3268 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3270 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3271 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3273 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3274 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3275 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3276 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3277 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3278 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3279 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3280 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3281 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3282 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3283 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3284 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3285 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3286 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3287 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3288 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3289 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3290 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3291 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3292 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3293 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3294 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3295 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3296 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3297 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3298 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3299 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3300 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3301 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3302 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3303 default: 3304 llvm_unreachable("unhandled atomic opcode"); 3305 } 3306 } 3307 3308 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3309 MachineIRBuilder &B, 3310 Intrinsic::ID IID) const { 3311 B.setInstr(MI); 3312 3313 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3314 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3315 3316 Register Dst = MI.getOperand(0).getReg(); 3317 Register VData = MI.getOperand(2).getReg(); 3318 3319 Register CmpVal; 3320 int OpOffset = 0; 3321 3322 if (IsCmpSwap) { 3323 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3324 ++OpOffset; 3325 } 3326 3327 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3328 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3329 3330 // The struct intrinsic variants add one additional operand over raw. 3331 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3332 Register VIndex; 3333 if (HasVIndex) { 3334 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3335 ++OpOffset; 3336 } 3337 3338 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3339 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3340 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3341 3342 MachineMemOperand *MMO = *MI.memoperands_begin(); 3343 3344 unsigned ImmOffset; 3345 unsigned TotalOffset; 3346 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3347 if (TotalOffset != 0) 3348 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3349 3350 if (!VIndex) 3351 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3352 3353 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3354 .addDef(Dst) 3355 .addUse(VData); // vdata 3356 3357 if (IsCmpSwap) 3358 MIB.addReg(CmpVal); 3359 3360 MIB.addUse(RSrc) // rsrc 3361 .addUse(VIndex) // vindex 3362 .addUse(VOffset) // voffset 3363 .addUse(SOffset) // soffset 3364 .addImm(ImmOffset) // offset(imm) 3365 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3366 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3367 .addMemOperand(MMO); 3368 3369 MI.eraseFromParent(); 3370 return true; 3371 } 3372 3373 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3374 /// vector with s16 typed elements. 3375 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3376 SmallVectorImpl<Register> &PackedAddrs, 3377 int AddrIdx, int DimIdx, int NumVAddrs, 3378 int NumGradients) { 3379 const LLT S16 = LLT::scalar(16); 3380 const LLT V2S16 = LLT::vector(2, 16); 3381 3382 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3383 Register AddrReg = MI.getOperand(I).getReg(); 3384 3385 if (I < DimIdx) { 3386 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3387 PackedAddrs.push_back(AddrReg); 3388 } else { 3389 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3390 // derivatives dx/dh and dx/dv are packed with undef. 3391 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3392 ((NumGradients / 2) % 2 == 1 && 3393 (I == DimIdx + (NumGradients / 2) - 1 || 3394 I == DimIdx + NumGradients - 1))) { 3395 PackedAddrs.push_back( 3396 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3397 .getReg(0)); 3398 } else { 3399 PackedAddrs.push_back( 3400 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3401 .getReg(0)); 3402 ++I; 3403 } 3404 } 3405 } 3406 } 3407 3408 /// Convert from separate vaddr components to a single vector address register, 3409 /// and replace the remaining operands with $noreg. 3410 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3411 int DimIdx, int NumVAddrs) { 3412 SmallVector<Register, 8> AddrRegs(NumVAddrs); 3413 for (int I = 0; I != NumVAddrs; ++I) { 3414 AddrRegs[I] = MI.getOperand(DimIdx + I).getReg(); 3415 assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32)); 3416 } 3417 3418 auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs); 3419 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3420 for (int I = 1; I != NumVAddrs; ++I) 3421 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3422 } 3423 3424 /// Return number of address arguments, and the number of gradients 3425 static std::pair<int, int> 3426 getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, 3427 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) { 3428 const AMDGPU::MIMGDimInfo *DimInfo 3429 = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim); 3430 3431 int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0; 3432 int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0; 3433 int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0; 3434 int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM; 3435 return {NumVAddr, NumGradients}; 3436 } 3437 3438 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode, 3439 int NumDefs) { 3440 assert(!BaseOpcode->Atomic); 3441 return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0); 3442 } 3443 3444 /// Return first address operand index in an image intrinsic. 3445 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode, 3446 int NumDefs) { 3447 if (BaseOpcode->Atomic) 3448 return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1); 3449 return getDMaskIdx(BaseOpcode, NumDefs) + 1; 3450 } 3451 3452 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3453 /// 3454 /// Depending on the subtarget, load/store with 16-bit element data need to be 3455 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3456 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3457 /// registers. 3458 /// 3459 /// We don't want to directly select image instructions just yet, but also want 3460 /// to exposes all register repacking to the legalizer/combiners. We also don't 3461 /// want a selected instrution entering RegBankSelect. In order to avoid 3462 /// defining a multitude of intermediate image instructions, directly hack on 3463 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3464 /// now unnecessary arguments with $noreg. 3465 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3466 MachineInstr &MI, MachineIRBuilder &B, 3467 GISelChangeObserver &Observer, 3468 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3469 B.setInstr(MI); 3470 3471 const int NumDefs = MI.getNumExplicitDefs(); 3472 bool IsTFE = NumDefs == 2; 3473 // We are only processing the operands of d16 image operations on subtargets 3474 // that use the unpacked register layout, or need to repack the TFE result. 3475 3476 // TODO: Do we need to guard against already legalized intrinsics? 3477 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3478 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3479 3480 MachineRegisterInfo *MRI = B.getMRI(); 3481 const LLT S32 = LLT::scalar(32); 3482 const LLT S16 = LLT::scalar(16); 3483 const LLT V2S16 = LLT::vector(2, 16); 3484 3485 // Index of first address argument 3486 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3487 3488 // Check for 16 bit addresses and pack if true. 3489 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3490 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3491 const bool IsA16 = AddrTy == S16; 3492 3493 int NumVAddrs, NumGradients; 3494 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3495 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3496 getDMaskIdx(BaseOpcode, NumDefs); 3497 unsigned DMask = 0; 3498 3499 int DMaskLanes = 0; 3500 if (!BaseOpcode->Atomic) { 3501 DMask = MI.getOperand(DMaskIdx).getImm(); 3502 if (BaseOpcode->Gather4) { 3503 DMaskLanes = 4; 3504 } else if (DMask != 0) { 3505 DMaskLanes = countPopulation(DMask); 3506 } else if (!IsTFE && !BaseOpcode->Store) { 3507 // If dmask is 0, this is a no-op load. This can be eliminated. 3508 B.buildUndef(MI.getOperand(0)); 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 } 3513 3514 Observer.changingInstr(MI); 3515 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3516 3517 unsigned NewOpcode = NumDefs == 0 ? 3518 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3519 3520 // Track that we legalized this 3521 MI.setDesc(B.getTII().get(NewOpcode)); 3522 3523 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3524 // dmask to be at least 1 otherwise the instruction will fail 3525 if (IsTFE && DMask == 0) { 3526 DMask = 0x1; 3527 DMaskLanes = 1; 3528 MI.getOperand(DMaskIdx).setImm(DMask); 3529 } 3530 3531 // If the register allocator cannot place the address registers contiguously 3532 // without introducing moves, then using the non-sequential address encoding 3533 // is always preferable, since it saves VALU instructions and is usually a 3534 // wash in terms of code size or even better. 3535 // 3536 // However, we currently have no way of hinting to the register allocator 3537 // that MIMG addresses should be placed contiguously when it is possible to 3538 // do so, so force non-NSA for the common 2-address case as a heuristic. 3539 // 3540 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3541 // allocation when possible. 3542 const bool UseNSA = NumVAddrs >= 3 && 3543 ST.hasFeature(AMDGPU::FeatureNSAEncoding); 3544 3545 // Rewrite the addressing register layout before doing anything else. 3546 if (IsA16) { 3547 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3548 // should be introduced. 3549 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3550 return false; 3551 3552 if (NumVAddrs > 1) { 3553 SmallVector<Register, 4> PackedRegs; 3554 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3555 NumGradients); 3556 3557 if (!UseNSA && PackedRegs.size() > 1) { 3558 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3559 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3560 PackedRegs[0] = Concat.getReg(0); 3561 PackedRegs.resize(1); 3562 } 3563 3564 const int NumPacked = PackedRegs.size(); 3565 for (int I = 0; I != NumVAddrs; ++I) { 3566 assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister); 3567 3568 if (I < NumPacked) 3569 MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]); 3570 else 3571 MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister); 3572 } 3573 } 3574 } else if (!UseNSA && NumVAddrs > 1) { 3575 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3576 } 3577 3578 if (BaseOpcode->Store) { // No TFE for stores? 3579 // TODO: Handle dmask trim 3580 Register VData = MI.getOperand(1).getReg(); 3581 LLT Ty = MRI->getType(VData); 3582 if (!Ty.isVector() || Ty.getElementType() != S16) 3583 return true; 3584 3585 B.setInstr(MI); 3586 3587 Register RepackedReg = handleD16VData(B, *MRI, VData); 3588 if (RepackedReg != VData) { 3589 MI.getOperand(1).setReg(RepackedReg); 3590 } 3591 3592 return true; 3593 } 3594 3595 Register DstReg = MI.getOperand(0).getReg(); 3596 LLT Ty = MRI->getType(DstReg); 3597 const LLT EltTy = Ty.getScalarType(); 3598 const bool IsD16 = Ty.getScalarType() == S16; 3599 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3600 3601 // Confirm that the return type is large enough for the dmask specified 3602 if (NumElts < DMaskLanes) 3603 return false; 3604 3605 if (NumElts > 4 || DMaskLanes > 4) 3606 return false; 3607 3608 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3609 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3610 3611 // The raw dword aligned data component of the load. The only legal cases 3612 // where this matters should be when using the packed D16 format, for 3613 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3614 LLT RoundedTy; 3615 3616 // S32 vector to to cover all data, plus TFE result element. 3617 LLT TFETy; 3618 3619 // Register type to use for each loaded component. Will be S32 or V2S16. 3620 LLT RegTy; 3621 3622 if (IsD16 && ST.hasUnpackedD16VMem()) { 3623 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3624 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3625 RegTy = S32; 3626 } else { 3627 unsigned EltSize = EltTy.getSizeInBits(); 3628 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3629 unsigned RoundedSize = 32 * RoundedElts; 3630 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3631 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3632 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3633 } 3634 3635 // The return type does not need adjustment. 3636 // TODO: Should we change s16 case to s32 or <2 x s16>? 3637 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3638 return true; 3639 3640 Register Dst1Reg; 3641 3642 // Insert after the instruction. 3643 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3644 3645 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3646 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3647 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3648 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3649 3650 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3651 3652 MI.getOperand(0).setReg(NewResultReg); 3653 3654 // In the IR, TFE is supposed to be used with a 2 element struct return 3655 // type. The intruction really returns these two values in one contiguous 3656 // register, with one additional dword beyond the loaded data. Rewrite the 3657 // return type to use a single register result. 3658 3659 if (IsTFE) { 3660 Dst1Reg = MI.getOperand(1).getReg(); 3661 if (MRI->getType(Dst1Reg) != S32) 3662 return false; 3663 3664 // TODO: Make sure the TFE operand bit is set. 3665 MI.RemoveOperand(1); 3666 3667 // Handle the easy case that requires no repack instructions. 3668 if (Ty == S32) { 3669 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3670 return true; 3671 } 3672 } 3673 3674 // Now figure out how to copy the new result register back into the old 3675 // result. 3676 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3677 3678 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3679 3680 if (ResultNumRegs == 1) { 3681 assert(!IsTFE); 3682 ResultRegs[0] = NewResultReg; 3683 } else { 3684 // We have to repack into a new vector of some kind. 3685 for (int I = 0; I != NumDataRegs; ++I) 3686 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3687 B.buildUnmerge(ResultRegs, NewResultReg); 3688 3689 // Drop the final TFE element to get the data part. The TFE result is 3690 // directly written to the right place already. 3691 if (IsTFE) 3692 ResultRegs.resize(NumDataRegs); 3693 } 3694 3695 // For an s16 scalar result, we form an s32 result with a truncate regardless 3696 // of packed vs. unpacked. 3697 if (IsD16 && !Ty.isVector()) { 3698 B.buildTrunc(DstReg, ResultRegs[0]); 3699 return true; 3700 } 3701 3702 // Avoid a build/concat_vector of 1 entry. 3703 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 3704 B.buildBitcast(DstReg, ResultRegs[0]); 3705 return true; 3706 } 3707 3708 assert(Ty.isVector()); 3709 3710 if (IsD16) { 3711 // For packed D16 results with TFE enabled, all the data components are 3712 // S32. Cast back to the expected type. 3713 // 3714 // TODO: We don't really need to use load s32 elements. We would only need one 3715 // cast for the TFE result if a multiple of v2s16 was used. 3716 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3717 for (Register &Reg : ResultRegs) 3718 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3719 } else if (ST.hasUnpackedD16VMem()) { 3720 for (Register &Reg : ResultRegs) 3721 Reg = B.buildTrunc(S16, Reg).getReg(0); 3722 } 3723 } 3724 3725 auto padWithUndef = [&](LLT Ty, int NumElts) { 3726 if (NumElts == 0) 3727 return; 3728 Register Undef = B.buildUndef(Ty).getReg(0); 3729 for (int I = 0; I != NumElts; ++I) 3730 ResultRegs.push_back(Undef); 3731 }; 3732 3733 // Pad out any elements eliminated due to the dmask. 3734 LLT ResTy = MRI->getType(ResultRegs[0]); 3735 if (!ResTy.isVector()) { 3736 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3737 B.buildBuildVector(DstReg, ResultRegs); 3738 return true; 3739 } 3740 3741 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3742 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3743 3744 // Deal with the one annoying legal case. 3745 const LLT V3S16 = LLT::vector(3, 16); 3746 if (Ty == V3S16) { 3747 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 3748 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 3749 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 3750 return true; 3751 } 3752 3753 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 3754 B.buildConcatVectors(DstReg, ResultRegs); 3755 return true; 3756 } 3757 3758 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3759 MachineInstr &MI, MachineIRBuilder &B, 3760 GISelChangeObserver &Observer) const { 3761 Register Dst = MI.getOperand(0).getReg(); 3762 LLT Ty = B.getMRI()->getType(Dst); 3763 unsigned Size = Ty.getSizeInBits(); 3764 MachineFunction &MF = B.getMF(); 3765 3766 Observer.changingInstr(MI); 3767 3768 // FIXME: We don't really need this intermediate instruction. The intrinsic 3769 // should be fixed to have a memory operand. Since it's readnone, we're not 3770 // allowed to add one. 3771 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3772 MI.RemoveOperand(1); // Remove intrinsic ID 3773 3774 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3775 // TODO: Should this use datalayout alignment? 3776 const unsigned MemSize = (Size + 7) / 8; 3777 const unsigned MemAlign = 4; 3778 MachineMemOperand *MMO = MF.getMachineMemOperand( 3779 MachinePointerInfo(), 3780 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3781 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3782 MI.addMemOperand(MF, MMO); 3783 3784 // There are no 96-bit result scalar loads, but widening to 128-bit should 3785 // always be legal. We may need to restore this to a 96-bit result if it turns 3786 // out this needs to be converted to a vector load during RegBankSelect. 3787 if (!isPowerOf2_32(Size)) { 3788 LegalizerHelper Helper(MF, *this, Observer, B); 3789 B.setInstr(MI); 3790 3791 if (Ty.isVector()) 3792 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3793 else 3794 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3795 } 3796 3797 Observer.changedInstr(MI); 3798 return true; 3799 } 3800 3801 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 3802 MachineRegisterInfo &MRI, 3803 MachineIRBuilder &B) const { 3804 B.setInstr(MI); 3805 3806 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 3807 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 3808 !ST.isTrapHandlerEnabled()) { 3809 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 3810 } else { 3811 // Pass queue pointer to trap handler as input, and insert trap instruction 3812 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 3813 const ArgDescriptor *Arg = 3814 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 3815 if (!Arg) 3816 return false; 3817 MachineRegisterInfo &MRI = *B.getMRI(); 3818 Register SGPR01(AMDGPU::SGPR0_SGPR1); 3819 Register LiveIn = getLiveInRegister( 3820 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 3821 /*InsertLiveInCopy=*/false); 3822 if (!loadInputValue(LiveIn, B, Arg)) 3823 return false; 3824 B.buildCopy(SGPR01, LiveIn); 3825 B.buildInstr(AMDGPU::S_TRAP) 3826 .addImm(GCNSubtarget::TrapIDLLVMTrap) 3827 .addReg(SGPR01, RegState::Implicit); 3828 } 3829 3830 MI.eraseFromParent(); 3831 return true; 3832 } 3833 3834 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 3835 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3836 B.setInstr(MI); 3837 3838 // Is non-HSA path or trap-handler disabled? then, report a warning 3839 // accordingly 3840 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 3841 !ST.isTrapHandlerEnabled()) { 3842 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 3843 "debugtrap handler not supported", 3844 MI.getDebugLoc(), DS_Warning); 3845 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 3846 Ctx.diagnose(NoTrap); 3847 } else { 3848 // Insert debug-trap instruction 3849 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 3850 } 3851 3852 MI.eraseFromParent(); 3853 return true; 3854 } 3855 3856 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3857 MachineIRBuilder &B, 3858 GISelChangeObserver &Observer) const { 3859 MachineRegisterInfo &MRI = *B.getMRI(); 3860 3861 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 3862 auto IntrID = MI.getIntrinsicID(); 3863 switch (IntrID) { 3864 case Intrinsic::amdgcn_if: 3865 case Intrinsic::amdgcn_else: { 3866 MachineInstr *Br = nullptr; 3867 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3868 const SIRegisterInfo *TRI 3869 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3870 3871 B.setInstr(*BrCond); 3872 Register Def = MI.getOperand(1).getReg(); 3873 Register Use = MI.getOperand(3).getReg(); 3874 3875 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3876 if (Br) 3877 BrTarget = Br->getOperand(0).getMBB(); 3878 3879 if (IntrID == Intrinsic::amdgcn_if) { 3880 B.buildInstr(AMDGPU::SI_IF) 3881 .addDef(Def) 3882 .addUse(Use) 3883 .addMBB(BrTarget); 3884 } else { 3885 B.buildInstr(AMDGPU::SI_ELSE) 3886 .addDef(Def) 3887 .addUse(Use) 3888 .addMBB(BrTarget) 3889 .addImm(0); 3890 } 3891 3892 if (Br) 3893 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3894 3895 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3896 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3897 MI.eraseFromParent(); 3898 BrCond->eraseFromParent(); 3899 return true; 3900 } 3901 3902 return false; 3903 } 3904 case Intrinsic::amdgcn_loop: { 3905 MachineInstr *Br = nullptr; 3906 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3907 const SIRegisterInfo *TRI 3908 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3909 3910 B.setInstr(*BrCond); 3911 3912 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3913 if (Br) 3914 BrTarget = Br->getOperand(0).getMBB(); 3915 3916 Register Reg = MI.getOperand(2).getReg(); 3917 B.buildInstr(AMDGPU::SI_LOOP) 3918 .addUse(Reg) 3919 .addMBB(BrTarget); 3920 3921 if (Br) 3922 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3923 3924 MI.eraseFromParent(); 3925 BrCond->eraseFromParent(); 3926 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3927 return true; 3928 } 3929 3930 return false; 3931 } 3932 case Intrinsic::amdgcn_kernarg_segment_ptr: 3933 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 3934 B.setInstr(MI); 3935 // This only makes sense to call in a kernel, so just lower to null. 3936 B.buildConstant(MI.getOperand(0).getReg(), 0); 3937 MI.eraseFromParent(); 3938 return true; 3939 } 3940 3941 return legalizePreloadedArgIntrin( 3942 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3943 case Intrinsic::amdgcn_implicitarg_ptr: 3944 return legalizeImplicitArgPtr(MI, MRI, B); 3945 case Intrinsic::amdgcn_workitem_id_x: 3946 return legalizePreloadedArgIntrin(MI, MRI, B, 3947 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3948 case Intrinsic::amdgcn_workitem_id_y: 3949 return legalizePreloadedArgIntrin(MI, MRI, B, 3950 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3951 case Intrinsic::amdgcn_workitem_id_z: 3952 return legalizePreloadedArgIntrin(MI, MRI, B, 3953 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3954 case Intrinsic::amdgcn_workgroup_id_x: 3955 return legalizePreloadedArgIntrin(MI, MRI, B, 3956 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3957 case Intrinsic::amdgcn_workgroup_id_y: 3958 return legalizePreloadedArgIntrin(MI, MRI, B, 3959 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3960 case Intrinsic::amdgcn_workgroup_id_z: 3961 return legalizePreloadedArgIntrin(MI, MRI, B, 3962 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3963 case Intrinsic::amdgcn_dispatch_ptr: 3964 return legalizePreloadedArgIntrin(MI, MRI, B, 3965 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3966 case Intrinsic::amdgcn_queue_ptr: 3967 return legalizePreloadedArgIntrin(MI, MRI, B, 3968 AMDGPUFunctionArgInfo::QUEUE_PTR); 3969 case Intrinsic::amdgcn_implicit_buffer_ptr: 3970 return legalizePreloadedArgIntrin( 3971 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3972 case Intrinsic::amdgcn_dispatch_id: 3973 return legalizePreloadedArgIntrin(MI, MRI, B, 3974 AMDGPUFunctionArgInfo::DISPATCH_ID); 3975 case Intrinsic::amdgcn_fdiv_fast: 3976 return legalizeFDIVFastIntrin(MI, MRI, B); 3977 case Intrinsic::amdgcn_is_shared: 3978 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3979 case Intrinsic::amdgcn_is_private: 3980 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3981 case Intrinsic::amdgcn_wavefrontsize: { 3982 B.setInstr(MI); 3983 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3984 MI.eraseFromParent(); 3985 return true; 3986 } 3987 case Intrinsic::amdgcn_s_buffer_load: 3988 return legalizeSBufferLoad(MI, B, Observer); 3989 case Intrinsic::amdgcn_raw_buffer_store: 3990 case Intrinsic::amdgcn_struct_buffer_store: 3991 return legalizeBufferStore(MI, MRI, B, false, false); 3992 case Intrinsic::amdgcn_raw_buffer_store_format: 3993 case Intrinsic::amdgcn_struct_buffer_store_format: 3994 return legalizeBufferStore(MI, MRI, B, false, true); 3995 case Intrinsic::amdgcn_raw_tbuffer_store: 3996 case Intrinsic::amdgcn_struct_tbuffer_store: 3997 return legalizeBufferStore(MI, MRI, B, true, true); 3998 case Intrinsic::amdgcn_raw_buffer_load: 3999 case Intrinsic::amdgcn_struct_buffer_load: 4000 return legalizeBufferLoad(MI, MRI, B, false, false); 4001 case Intrinsic::amdgcn_raw_buffer_load_format: 4002 case Intrinsic::amdgcn_struct_buffer_load_format: 4003 return legalizeBufferLoad(MI, MRI, B, true, false); 4004 case Intrinsic::amdgcn_raw_tbuffer_load: 4005 case Intrinsic::amdgcn_struct_tbuffer_load: 4006 return legalizeBufferLoad(MI, MRI, B, true, true); 4007 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4008 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4009 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4010 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4011 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4012 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4013 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4014 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4015 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4016 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4017 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4019 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4020 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4021 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4022 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4023 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4024 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4025 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4027 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4028 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4029 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4031 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4032 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4033 return legalizeBufferAtomic(MI, B, IntrID); 4034 case Intrinsic::amdgcn_atomic_inc: 4035 return legalizeAtomicIncDec(MI, B, true); 4036 case Intrinsic::amdgcn_atomic_dec: 4037 return legalizeAtomicIncDec(MI, B, false); 4038 case Intrinsic::trap: 4039 return legalizeTrapIntrinsic(MI, MRI, B); 4040 case Intrinsic::debugtrap: 4041 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4042 default: { 4043 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4044 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4045 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4046 return true; 4047 } 4048 } 4049 4050 return true; 4051 } 4052