//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1,
                                               EltTy));
  };
}

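// Reduce the number of elements so each piece fits in 64 bits; for example a
// 128-bit v4s32 is reduced to v2s32, and a 96-bit v3s32 also becomes v2s32.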
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
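// For example s32, s64, v2s32 and v2s16 qualify, while v3s16 (odd number of
// 16-bit elements) and s48 do not.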
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

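  // Pointer types for each address space; flat, global and constant pointers
  // are 64 bits wide, while local, region, private and constant-32bit
  // pointers are 32 bits (see the AddrSpaces64/AddrSpaces32 lists below).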
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
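    // Round the access size up to the number of 32-bit registers it occupies.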
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
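        // For example, a sufficiently aligned <3 x s32> result may be widened
        // to <4 x s32> when dwordx3 accesses are unavailable.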
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
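  // (An s32 condition corresponds to a compare result living in SCC, while an
  // s1 condition maps to VCC for divergent selects.)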
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
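  // The aperture is the high 32 bits of the 64-bit flat address range that
  // maps to the given LDS or scratch segment.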
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
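    // Casting to the 32-bit constant address space keeps only the low 32 bits
    // of the source pointer.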
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
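  // Adding and subtracting 2^52 (with the sign of Src) rounds Src to the
  // nearest integer, since doubles of magnitude >= 2^52 have no fractional
  // bits; the compare below keeps Src itself when it is already that large.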
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1749 B.buildFPTOSI(S32, FloorMul) : 1750 B.buildFPTOUI(S32, FloorMul); 1751 auto Lo = B.buildFPTOUI(S32, Fma); 1752 1753 B.buildMerge(Dst, { Lo, Hi }); 1754 MI.eraseFromParent(); 1755 1756 return true; 1757 } 1758 1759 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1760 MachineInstr &MI, MachineRegisterInfo &MRI, 1761 MachineIRBuilder &B) const { 1762 MachineFunction &MF = B.getMF(); 1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1764 1765 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1766 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1767 1768 // With ieee_mode disabled, the instructions have the correct behavior 1769 // already for G_FMINNUM/G_FMAXNUM 1770 if (!MFI->getMode().IEEE) 1771 return !IsIEEEOp; 1772 1773 if (IsIEEEOp) 1774 return true; 1775 1776 MachineIRBuilder HelperBuilder(MI); 1777 GISelObserverWrapper DummyObserver; 1778 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1779 HelperBuilder.setInstr(MI); 1780 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1781 } 1782 1783 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1784 MachineInstr &MI, MachineRegisterInfo &MRI, 1785 MachineIRBuilder &B) const { 1786 // TODO: Should move some of this into LegalizerHelper. 1787 1788 // TODO: Promote dynamic indexing of s16 to s32 1789 1790 // FIXME: Artifact combiner probably should have replaced the truncated 1791 // constant before this, so we shouldn't need 1792 // getConstantVRegValWithLookThrough. 1793 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1794 MI.getOperand(2).getReg(), MRI); 1795 if (!IdxVal) // Dynamic case will be selected to register indexing. 1796 return true; 1797 1798 Register Dst = MI.getOperand(0).getReg(); 1799 Register Vec = MI.getOperand(1).getReg(); 1800 1801 LLT VecTy = MRI.getType(Vec); 1802 LLT EltTy = VecTy.getElementType(); 1803 assert(EltTy == MRI.getType(Dst)); 1804 1805 B.setInstr(MI); 1806 1807 if (IdxVal->Value < VecTy.getNumElements()) 1808 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1809 else 1810 B.buildUndef(Dst); 1811 1812 MI.eraseFromParent(); 1813 return true; 1814 } 1815 1816 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1817 MachineInstr &MI, MachineRegisterInfo &MRI, 1818 MachineIRBuilder &B) const { 1819 // TODO: Should move some of this into LegalizerHelper. 1820 1821 // TODO: Promote dynamic indexing of s16 to s32 1822 1823 // FIXME: Artifact combiner probably should have replaced the truncated 1824 // constant before this, so we shouldn't need 1825 // getConstantVRegValWithLookThrough. 1826 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1827 MI.getOperand(3).getReg(), MRI); 1828 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1829 return true; 1830 1831 Register Dst = MI.getOperand(0).getReg(); 1832 Register Vec = MI.getOperand(1).getReg(); 1833 Register Ins = MI.getOperand(2).getReg(); 1834 1835 LLT VecTy = MRI.getType(Vec); 1836 LLT EltTy = VecTy.getElementType(); 1837 assert(EltTy == MRI.getType(Ins)); 1838 1839 B.setInstr(MI); 1840 1841 if (IdxVal->Value < VecTy.getNumElements()) 1842 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1843 else 1844 B.buildUndef(Dst); 1845 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 const LLT V2S16 = LLT::vector(2, 16); 1854 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src0 = MI.getOperand(1).getReg(); 1857 LLT DstTy = MRI.getType(Dst); 1858 LLT SrcTy = MRI.getType(Src0); 1859 1860 if (SrcTy == V2S16 && DstTy == V2S16 && 1861 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1862 return true; 1863 1864 MachineIRBuilder HelperBuilder(MI); 1865 GISelObserverWrapper DummyObserver; 1866 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1867 HelperBuilder.setInstr(MI); 1868 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1869 } 1870 1871 bool AMDGPULegalizerInfo::legalizeSinCos( 1872 MachineInstr &MI, MachineRegisterInfo &MRI, 1873 MachineIRBuilder &B) const { 1874 B.setInstr(MI); 1875 1876 Register DstReg = MI.getOperand(0).getReg(); 1877 Register SrcReg = MI.getOperand(1).getReg(); 1878 LLT Ty = MRI.getType(DstReg); 1879 unsigned Flags = MI.getFlags(); 1880 1881 Register TrigVal; 1882 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1883 if (ST.hasTrigReducedRange()) { 1884 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1885 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1886 .addUse(MulVal.getReg(0)) 1887 .setMIFlags(Flags).getReg(0); 1888 } else 1889 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1890 1891 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1892 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1893 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1894 .addUse(TrigVal) 1895 .setMIFlags(Flags); 1896 MI.eraseFromParent(); 1897 return true; 1898 } 1899 1900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1901 Register DstReg, LLT PtrTy, 1902 MachineIRBuilder &B, const GlobalValue *GV, 1903 unsigned Offset, unsigned GAFlags) const { 1904 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1905 // to the following code sequence: 1906 // 1907 // For constant address space: 1908 // s_getpc_b64 s[0:1] 1909 // s_add_u32 s0, s0, $symbol 1910 // s_addc_u32 s1, s1, 0 1911 // 1912 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1913 // a fixup or relocation is emitted to replace $symbol with a literal 1914 // constant, which is a pc-relative offset from the encoding of the $symbol 1915 // operand to the global variable. 
1916 // 1917 // For global address space: 1918 // s_getpc_b64 s[0:1] 1919 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1920 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1921 // 1922 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1923 // fixups or relocations are emitted to replace $symbol@*@lo and 1924 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1925 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1926 // operand to the global variable. 1927 // 1928 // What we want here is an offset from the value returned by s_getpc 1929 // (which is the address of the s_add_u32 instruction) to the global 1930 // variable, but since the encoding of $symbol starts 4 bytes after the start 1931 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1932 // small. This requires us to add 4 to the global variable offset in order to 1933 // compute the correct address. 1934 1935 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1936 1937 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1938 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1939 1940 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1941 .addDef(PCReg); 1942 1943 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1944 if (GAFlags == SIInstrInfo::MO_NONE) 1945 MIB.addImm(0); 1946 else 1947 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1948 1949 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1950 1951 if (PtrTy.getSizeInBits() == 32) 1952 B.buildExtract(DstReg, PCReg, 0); 1953 return true; 1954 } 1955 1956 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1957 MachineInstr &MI, MachineRegisterInfo &MRI, 1958 MachineIRBuilder &B) const { 1959 Register DstReg = MI.getOperand(0).getReg(); 1960 LLT Ty = MRI.getType(DstReg); 1961 unsigned AS = Ty.getAddressSpace(); 1962 1963 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1964 MachineFunction &MF = B.getMF(); 1965 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1966 B.setInstr(MI); 1967 1968 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1969 if (!MFI->isEntryFunction()) { 1970 const Function &Fn = MF.getFunction(); 1971 DiagnosticInfoUnsupported BadLDSDecl( 1972 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1973 DS_Warning); 1974 Fn.getContext().diagnose(BadLDSDecl); 1975 1976 // We currently don't have a way to correctly allocate LDS objects that 1977 // aren't directly associated with a kernel. We do force inlining of 1978 // functions that use local objects. However, if these dead functions are 1979 // not eliminated, we don't want a compile time error. Just emit a warning 1980 // and a trap, since there should be no callable path here. 1981 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1982 B.buildUndef(DstReg); 1983 MI.eraseFromParent(); 1984 return true; 1985 } 1986 1987 // TODO: We could emit code to handle the initialization somewhere. 
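    // An LDS global without a defined initializer just needs an address:
    // allocateLDSGlobal assigns it a fixed offset within the kernel's LDS
    // allocation, so (unless the target prefers an absolute 32-bit relocation,
    // handled first) the address can be emitted as a plain constant.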
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
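  // G_FMAD can be kept as-is only when denormals for the type are flushed;
  // with denormals enabled, expand it to the generic fmul + fadd sequence
  // through the LegalizerHelper below.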
2066 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2067 return true; 2068 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2069 return true; 2070 2071 MachineIRBuilder HelperBuilder(MI); 2072 GISelObserverWrapper DummyObserver; 2073 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2074 HelperBuilder.setMBB(*MI.getParent()); 2075 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2076 } 2077 2078 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2079 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2080 Register DstReg = MI.getOperand(0).getReg(); 2081 Register PtrReg = MI.getOperand(1).getReg(); 2082 Register CmpVal = MI.getOperand(2).getReg(); 2083 Register NewVal = MI.getOperand(3).getReg(); 2084 2085 assert(SITargetLowering::isFlatGlobalAddrSpace( 2086 MRI.getType(PtrReg).getAddressSpace()) && 2087 "this should not have been custom lowered"); 2088 2089 LLT ValTy = MRI.getType(CmpVal); 2090 LLT VecTy = LLT::vector(2, ValTy); 2091 2092 B.setInstr(MI); 2093 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2094 2095 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2096 .addDef(DstReg) 2097 .addUse(PtrReg) 2098 .addUse(PackedVal) 2099 .setMemRefs(MI.memoperands()); 2100 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 bool AMDGPULegalizerInfo::legalizeFlog( 2106 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2107 Register Dst = MI.getOperand(0).getReg(); 2108 Register Src = MI.getOperand(1).getReg(); 2109 LLT Ty = B.getMRI()->getType(Dst); 2110 unsigned Flags = MI.getFlags(); 2111 B.setInstr(MI); 2112 2113 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2114 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2115 2116 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2117 MI.eraseFromParent(); 2118 return true; 2119 } 2120 2121 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2122 MachineIRBuilder &B) const { 2123 Register Dst = MI.getOperand(0).getReg(); 2124 Register Src = MI.getOperand(1).getReg(); 2125 unsigned Flags = MI.getFlags(); 2126 LLT Ty = B.getMRI()->getType(Dst); 2127 B.setInstr(MI); 2128 2129 auto K = B.buildFConstant(Ty, numbers::log2e); 2130 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2131 B.buildFExp2(Dst, Mul, Flags); 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2137 MachineIRBuilder &B) const { 2138 Register Dst = MI.getOperand(0).getReg(); 2139 Register Src0 = MI.getOperand(1).getReg(); 2140 Register Src1 = MI.getOperand(2).getReg(); 2141 unsigned Flags = MI.getFlags(); 2142 LLT Ty = B.getMRI()->getType(Dst); 2143 B.setInstr(MI); 2144 const LLT S16 = LLT::scalar(16); 2145 const LLT S32 = LLT::scalar(32); 2146 2147 if (Ty == S32) { 2148 auto Log = B.buildFLog2(S32, Src0, Flags); 2149 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2150 .addUse(Log.getReg(0)) 2151 .addUse(Src1) 2152 .setMIFlags(Flags); 2153 B.buildFExp2(Dst, Mul, Flags); 2154 } else if (Ty == S16) { 2155 // There's no f16 fmul_legacy, so we need to convert for it. 
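    // Same pow(x, y) = exp2(y * log2(x)) expansion as the f32 path above,
    // with the legacy multiply done in f32 and the result truncated back
    // to f16.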
2156 auto Log = B.buildFLog2(S16, Src0, Flags); 2157 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2158 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2159 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2160 .addUse(Ext0.getReg(0)) 2161 .addUse(Ext1.getReg(0)) 2162 .setMIFlags(Flags); 2163 2164 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2165 } else 2166 return false; 2167 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 // Find a source register, ignoring any possible source modifiers. 2173 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2174 Register ModSrc = OrigSrc; 2175 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2176 ModSrc = SrcFNeg->getOperand(1).getReg(); 2177 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2178 ModSrc = SrcFAbs->getOperand(1).getReg(); 2179 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2180 ModSrc = SrcFAbs->getOperand(1).getReg(); 2181 return ModSrc; 2182 } 2183 2184 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2185 MachineRegisterInfo &MRI, 2186 MachineIRBuilder &B) const { 2187 B.setInstr(MI); 2188 2189 const LLT S1 = LLT::scalar(1); 2190 const LLT S64 = LLT::scalar(64); 2191 Register Dst = MI.getOperand(0).getReg(); 2192 Register OrigSrc = MI.getOperand(1).getReg(); 2193 unsigned Flags = MI.getFlags(); 2194 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2195 "this should not have been custom lowered"); 2196 2197 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2198 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2199 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2200 // V_FRACT bug is: 2201 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2202 // 2203 // Convert floor(x) to (x - fract(x)) 2204 2205 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2206 .addUse(OrigSrc) 2207 .setMIFlags(Flags); 2208 2209 // Give source modifier matching some assistance before obscuring a foldable 2210 // pattern. 2211 2212 // TODO: We can avoid the neg on the fract? The input sign to fract 2213 // shouldn't matter? 2214 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2215 2216 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2217 2218 Register Min = MRI.createGenericVirtualRegister(S64); 2219 2220 // We don't need to concern ourselves with the snan handling difference, so 2221 // use the one which will directly select. 2222 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2223 if (MFI->getMode().IEEE) 2224 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2225 else 2226 B.buildFMinNum(Min, Fract, Const, Flags); 2227 2228 Register CorrectedFract = Min; 2229 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2230 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2231 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2232 } 2233 2234 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2235 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2236 2237 MI.eraseFromParent(); 2238 return true; 2239 } 2240 2241 // Turn an illegal packed v2s16 build vector into bit operations. 2242 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2243 bool AMDGPULegalizerInfo::legalizeBuildVector( 2244 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2245 Register Dst = MI.getOperand(0).getReg(); 2246 LLT DstTy = MRI.getType(Dst); 2247 const LLT S32 = LLT::scalar(32); 2248 const LLT V2S16 = LLT::vector(2, 16); 2249 (void)DstTy; 2250 (void)V2S16; 2251 assert(DstTy == V2S16); 2252 2253 Register Src0 = MI.getOperand(1).getReg(); 2254 Register Src1 = MI.getOperand(2).getReg(); 2255 assert(MRI.getType(Src0) == LLT::scalar(16)); 2256 2257 B.setInstr(MI); 2258 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2259 B.buildBitcast(Dst, Merge); 2260 2261 MI.eraseFromParent(); 2262 return true; 2263 } 2264 2265 // Return the use branch instruction, otherwise null if the usage is invalid. 2266 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2267 MachineRegisterInfo &MRI, 2268 MachineInstr *&Br) { 2269 Register CondDef = MI.getOperand(0).getReg(); 2270 if (!MRI.hasOneNonDBGUse(CondDef)) 2271 return nullptr; 2272 2273 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2274 if (UseMI.getParent() != MI.getParent() || 2275 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2276 return nullptr; 2277 2278 // Make sure the cond br is followed by a G_BR 2279 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2280 if (Next != MI.getParent()->end()) { 2281 if (Next->getOpcode() != AMDGPU::G_BR) 2282 return nullptr; 2283 Br = &*Next; 2284 } 2285 2286 return &UseMI; 2287 } 2288 2289 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2290 MachineRegisterInfo &MRI, 2291 Register LiveIn, 2292 Register PhyReg) const { 2293 assert(PhyReg.isPhysical() && "Physical register expected"); 2294 2295 // Insert the live-in copy, if required, by defining destination virtual 2296 // register. 2297 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
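  // Only emit the copy once: if the live-in virtual register already has a
  // definition, the copy from the physical register was inserted earlier.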
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
  MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
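    // Masked arguments (such as the packed workitem IDs) occupy a bitfield of
    // the incoming physical register: shift the field down to bit 0 and mask
    // off the neighboring fields.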
2365 const LLT S32 = LLT::scalar(32); 2366 const unsigned Mask = Arg->getMask(); 2367 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2368 2369 Register AndMaskSrc = LiveIn; 2370 2371 if (Shift != 0) { 2372 auto ShiftAmt = B.buildConstant(S32, Shift); 2373 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2374 } 2375 2376 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2377 } else { 2378 B.buildCopy(DstReg, LiveIn); 2379 } 2380 2381 return true; 2382 } 2383 2384 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2385 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2386 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2387 B.setInstr(MI); 2388 2389 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2390 if (!Arg) 2391 return false; 2392 2393 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2394 return false; 2395 2396 MI.eraseFromParent(); 2397 return true; 2398 } 2399 2400 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2401 MachineRegisterInfo &MRI, 2402 MachineIRBuilder &B) const { 2403 B.setInstr(MI); 2404 Register Dst = MI.getOperand(0).getReg(); 2405 LLT DstTy = MRI.getType(Dst); 2406 LLT S16 = LLT::scalar(16); 2407 LLT S32 = LLT::scalar(32); 2408 LLT S64 = LLT::scalar(64); 2409 2410 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2411 return true; 2412 2413 if (DstTy == S16) 2414 return legalizeFDIV16(MI, MRI, B); 2415 if (DstTy == S32) 2416 return legalizeFDIV32(MI, MRI, B); 2417 if (DstTy == S64) 2418 return legalizeFDIV64(MI, MRI, B); 2419 2420 return false; 2421 } 2422 2423 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2424 const LLT S32 = LLT::scalar(32); 2425 2426 auto Cvt0 = B.buildUITOFP(S32, Src); 2427 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2428 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2429 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2430 return B.buildFPTOUI(S32, Mul).getReg(0); 2431 } 2432 2433 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2434 Register DstReg, 2435 Register Num, 2436 Register Den, 2437 bool IsRem) const { 2438 const LLT S1 = LLT::scalar(1); 2439 const LLT S32 = LLT::scalar(32); 2440 2441 // RCP = URECIP(Den) = 2^32 / Den + e 2442 // e is rounding error. 2443 auto RCP = buildDivRCP(B, Den); 2444 2445 // RCP_LO = mul(RCP, Den) 2446 auto RCP_LO = B.buildMul(S32, RCP, Den); 2447 2448 // RCP_HI = mulhu (RCP, Den) */ 2449 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2450 2451 // NEG_RCP_LO = -RCP_LO 2452 auto Zero = B.buildConstant(S32, 0); 2453 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2454 2455 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2456 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2457 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2458 2459 // Calculate the rounding error from the URECIP instruction 2460 // E = mulhu(ABS_RCP_LO, RCP) 2461 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2462 2463 // RCP_A_E = RCP + E 2464 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2465 2466 // RCP_S_E = RCP - E 2467 auto RCP_S_E = B.buildSub(S32, RCP, E); 2468 2469 // Tmp0 = (RCP_HI == 0 ? 
  //                         RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);

  if (IsRem) {
    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2567 2568 if (IsRem) { 2569 auto RSign = LHSign; // Remainder sign is the same as LHS 2570 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2571 B.buildSub(DstReg, UDivRem, RSign); 2572 } else { 2573 auto DSign = B.buildXor(S32, LHSign, RHSign); 2574 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2575 B.buildSub(DstReg, UDivRem, DSign); 2576 } 2577 2578 MI.eraseFromParent(); 2579 return true; 2580 } 2581 2582 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2583 MachineRegisterInfo &MRI, 2584 MachineIRBuilder &B) const { 2585 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2586 return legalizeSDIV_SREM32(MI, MRI, B); 2587 return false; 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2591 MachineRegisterInfo &MRI, 2592 MachineIRBuilder &B) const { 2593 Register Res = MI.getOperand(0).getReg(); 2594 Register LHS = MI.getOperand(1).getReg(); 2595 Register RHS = MI.getOperand(2).getReg(); 2596 2597 uint16_t Flags = MI.getFlags(); 2598 2599 LLT ResTy = MRI.getType(Res); 2600 LLT S32 = LLT::scalar(32); 2601 LLT S64 = LLT::scalar(64); 2602 2603 const MachineFunction &MF = B.getMF(); 2604 bool Unsafe = 2605 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2606 2607 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2608 return false; 2609 2610 if (!Unsafe && ResTy == S32 && 2611 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2612 return false; 2613 2614 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2615 // 1 / x -> RCP(x) 2616 if (CLHS->isExactlyValue(1.0)) { 2617 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2618 .addUse(RHS) 2619 .setMIFlags(Flags); 2620 2621 MI.eraseFromParent(); 2622 return true; 2623 } 2624 2625 // -1 / x -> RCP( FNEG(x) ) 2626 if (CLHS->isExactlyValue(-1.0)) { 2627 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2628 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2629 .addUse(FNeg.getReg(0)) 2630 .setMIFlags(Flags); 2631 2632 MI.eraseFromParent(); 2633 return true; 2634 } 2635 } 2636 2637 // x / y -> x * (1.0 / y) 2638 if (Unsafe) { 2639 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2640 .addUse(RHS) 2641 .setMIFlags(Flags); 2642 B.buildFMul(Res, LHS, RCP, Flags); 2643 2644 MI.eraseFromParent(); 2645 return true; 2646 } 2647 2648 return false; 2649 } 2650 2651 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2652 MachineRegisterInfo &MRI, 2653 MachineIRBuilder &B) const { 2654 B.setInstr(MI); 2655 Register Res = MI.getOperand(0).getReg(); 2656 Register LHS = MI.getOperand(1).getReg(); 2657 Register RHS = MI.getOperand(2).getReg(); 2658 2659 uint16_t Flags = MI.getFlags(); 2660 2661 LLT S16 = LLT::scalar(16); 2662 LLT S32 = LLT::scalar(32); 2663 2664 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2665 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2666 2667 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2668 .addUse(RHSExt.getReg(0)) 2669 .setMIFlags(Flags); 2670 2671 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2672 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2673 2674 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2675 .addUse(RDst.getReg(0)) 2676 .addUse(RHS) 2677 .addUse(LHS) 2678 .setMIFlags(Flags); 2679 2680 MI.eraseFromParent(); 2681 return true; 2682 } 2683 2684 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2685 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
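// On subtargets that have the S_DENORM_MODE instruction this takes a single
// instruction; otherwise the FP32 denorm field of the MODE register is
// rewritten with S_SETREG.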
2686 static void toggleSPDenormMode(bool Enable, 2687 MachineIRBuilder &B, 2688 const GCNSubtarget &ST, 2689 AMDGPU::SIModeRegisterDefaults Mode) { 2690 // Set SP denorm mode to this value. 2691 unsigned SPDenormMode = 2692 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2693 2694 if (ST.hasDenormModeInst()) { 2695 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2696 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2697 2698 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2699 B.buildInstr(AMDGPU::S_DENORM_MODE) 2700 .addImm(NewDenormModeValue); 2701 2702 } else { 2703 // Select FP32 bit field in mode register. 2704 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2705 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2706 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2707 2708 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2709 .addImm(SPDenormMode) 2710 .addImm(SPDenormModeBitField); 2711 } 2712 } 2713 2714 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2715 MachineRegisterInfo &MRI, 2716 MachineIRBuilder &B) const { 2717 B.setInstr(MI); 2718 Register Res = MI.getOperand(0).getReg(); 2719 Register LHS = MI.getOperand(1).getReg(); 2720 Register RHS = MI.getOperand(2).getReg(); 2721 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2722 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2723 2724 uint16_t Flags = MI.getFlags(); 2725 2726 LLT S32 = LLT::scalar(32); 2727 LLT S1 = LLT::scalar(1); 2728 2729 auto One = B.buildFConstant(S32, 1.0f); 2730 2731 auto DenominatorScaled = 2732 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2733 .addUse(RHS) 2734 .addUse(LHS) 2735 .addImm(1) 2736 .setMIFlags(Flags); 2737 auto NumeratorScaled = 2738 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2739 .addUse(LHS) 2740 .addUse(RHS) 2741 .addImm(0) 2742 .setMIFlags(Flags); 2743 2744 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2745 .addUse(DenominatorScaled.getReg(0)) 2746 .setMIFlags(Flags); 2747 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2748 2749 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2750 // aren't modeled as reading it. 
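  // The core FMA sequence is emitted with FP32 denormals enabled: if the
  // function's default mode flushes them, switch denormals on around the
  // sequence and restore the previous mode afterwards.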
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
2826 2827 LLT S32 = LLT::scalar(32); 2828 2829 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2830 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2831 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2832 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2833 2834 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2835 Scale1Unmerge.getReg(1)); 2836 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2837 Scale0Unmerge.getReg(1)); 2838 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2839 } else { 2840 Scale = DivScale1.getReg(1); 2841 } 2842 2843 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2844 .addUse(Fma4.getReg(0)) 2845 .addUse(Fma3.getReg(0)) 2846 .addUse(Mul.getReg(0)) 2847 .addUse(Scale) 2848 .setMIFlags(Flags); 2849 2850 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2851 .addUse(Fmas.getReg(0)) 2852 .addUse(RHS) 2853 .addUse(LHS) 2854 .setMIFlags(Flags); 2855 2856 MI.eraseFromParent(); 2857 return true; 2858 } 2859 2860 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2861 MachineRegisterInfo &MRI, 2862 MachineIRBuilder &B) const { 2863 B.setInstr(MI); 2864 Register Res = MI.getOperand(0).getReg(); 2865 Register LHS = MI.getOperand(2).getReg(); 2866 Register RHS = MI.getOperand(3).getReg(); 2867 uint16_t Flags = MI.getFlags(); 2868 2869 LLT S32 = LLT::scalar(32); 2870 LLT S1 = LLT::scalar(1); 2871 2872 auto Abs = B.buildFAbs(S32, RHS, Flags); 2873 const APFloat C0Val(1.0f); 2874 2875 auto C0 = B.buildConstant(S32, 0x6f800000); 2876 auto C1 = B.buildConstant(S32, 0x2f800000); 2877 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2878 2879 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2880 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2881 2882 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2883 2884 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2885 .addUse(Mul0.getReg(0)) 2886 .setMIFlags(Flags); 2887 2888 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2889 2890 B.buildFMul(Res, Sel, Mul1, Flags); 2891 2892 MI.eraseFromParent(); 2893 return true; 2894 } 2895 2896 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2897 MachineRegisterInfo &MRI, 2898 MachineIRBuilder &B) const { 2899 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2900 if (!MFI->isEntryFunction()) { 2901 return legalizePreloadedArgIntrin(MI, MRI, B, 2902 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2903 } 2904 2905 B.setInstr(MI); 2906 2907 uint64_t Offset = 2908 ST.getTargetLowering()->getImplicitParameterOffset( 2909 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2910 Register DstReg = MI.getOperand(0).getReg(); 2911 LLT DstTy = MRI.getType(DstReg); 2912 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2913 2914 const ArgDescriptor *Arg; 2915 const TargetRegisterClass *RC; 2916 std::tie(Arg, RC) 2917 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2918 if (!Arg) 2919 return false; 2920 2921 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2922 if (!loadInputValue(KernargPtrReg, B, Arg)) 2923 return false; 2924 2925 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2926 MI.eraseFromParent(); 2927 return true; 2928 } 2929 2930 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2931 MachineRegisterInfo &MRI, 2932 MachineIRBuilder &B, 2933 unsigned AddrSpace) const { 2934 B.setInstr(MI); 2935 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2936 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2937 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2938 MI.eraseFromParent(); 2939 return true; 2940 } 2941 2942 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2943 // offset (the offset that is included in bounds checking and swizzling, to be 2944 // split between the instruction's voffset and immoffset fields) and soffset 2945 // (the offset that is excluded from bounds checking and swizzling, to go in 2946 // the instruction's soffset field). This function takes the first kind of 2947 // offset and figures out how to split it between voffset and immoffset. 2948 std::tuple<Register, unsigned, unsigned> 2949 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2950 Register OrigOffset) const { 2951 const unsigned MaxImm = 4095; 2952 Register BaseReg; 2953 unsigned TotalConstOffset; 2954 MachineInstr *OffsetDef; 2955 const LLT S32 = LLT::scalar(32); 2956 2957 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2958 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2959 2960 unsigned ImmOffset = TotalConstOffset; 2961 2962 // If the immediate value is too big for the immoffset field, put the value 2963 // and -4096 into the immoffset field so that the value that is copied/added 2964 // for the voffset field is a multiple of 4096, and it stands more chance 2965 // of being CSEd with the copy/add for another similar load/store. 2966 // However, do not do that rounding down to a multiple of 4096 if that is a 2967 // negative number, as it appears to be illegal to have a negative offset 2968 // in the vgpr, even if adding the immediate offset makes it positive. 2969 unsigned Overflow = ImmOffset & ~MaxImm; 2970 ImmOffset -= Overflow; 2971 if ((int32_t)Overflow < 0) { 2972 Overflow += ImmOffset; 2973 ImmOffset = 0; 2974 } 2975 2976 if (Overflow != 0) { 2977 if (!BaseReg) { 2978 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2979 } else { 2980 auto OverflowVal = B.buildConstant(S32, Overflow); 2981 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2982 } 2983 } 2984 2985 if (!BaseReg) 2986 BaseReg = B.buildConstant(S32, 0).getReg(0); 2987 2988 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2989 } 2990 2991 /// Handle register layout difference for f16 images for some subtargets. 2992 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2993 MachineRegisterInfo &MRI, 2994 Register Reg) const { 2995 if (!ST.hasUnpackedD16VMem()) 2996 return Reg; 2997 2998 const LLT S16 = LLT::scalar(16); 2999 const LLT S32 = LLT::scalar(32); 3000 LLT StoreVT = MRI.getType(Reg); 3001 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3002 3003 auto Unmerge = B.buildUnmerge(S16, Reg); 3004 3005 SmallVector<Register, 4> WideRegs; 3006 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3007 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3008 3009 int NumElts = StoreVT.getNumElements(); 3010 3011 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3012 } 3013 3014 Register AMDGPULegalizerInfo::fixStoreSourceType( 3015 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3016 MachineRegisterInfo *MRI = B.getMRI(); 3017 LLT Ty = MRI->getType(VData); 3018 3019 const LLT S16 = LLT::scalar(16); 3020 3021 // Fixup illegal register types for i8 stores. 
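  // s8 and s16 sources are widened to s32 with an anyext; the byte and short
  // buffer store variants only write the low bits of the register.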
3022 if (Ty == LLT::scalar(8) || Ty == S16) { 3023 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3024 return AnyExt; 3025 } 3026 3027 if (Ty.isVector()) { 3028 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3029 if (IsFormat) 3030 return handleD16VData(B, *MRI, VData); 3031 } 3032 } 3033 3034 return VData; 3035 } 3036 3037 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3038 MachineRegisterInfo &MRI, 3039 MachineIRBuilder &B, 3040 bool IsTyped, 3041 bool IsFormat) const { 3042 B.setInstr(MI); 3043 3044 Register VData = MI.getOperand(1).getReg(); 3045 LLT Ty = MRI.getType(VData); 3046 LLT EltTy = Ty.getScalarType(); 3047 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3048 const LLT S32 = LLT::scalar(32); 3049 3050 VData = fixStoreSourceType(B, VData, IsFormat); 3051 Register RSrc = MI.getOperand(2).getReg(); 3052 3053 MachineMemOperand *MMO = *MI.memoperands_begin(); 3054 const int MemSize = MMO->getSize(); 3055 3056 unsigned ImmOffset; 3057 unsigned TotalOffset; 3058 3059 // The typed intrinsics add an immediate after the registers. 3060 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3061 3062 // The struct intrinsic variants add one additional operand over raw. 3063 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3064 Register VIndex; 3065 int OpOffset = 0; 3066 if (HasVIndex) { 3067 VIndex = MI.getOperand(3).getReg(); 3068 OpOffset = 1; 3069 } 3070 3071 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3072 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3073 3074 unsigned Format = 0; 3075 if (IsTyped) { 3076 Format = MI.getOperand(5 + OpOffset).getImm(); 3077 ++OpOffset; 3078 } 3079 3080 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3081 3082 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3083 if (TotalOffset != 0) 3084 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3085 3086 unsigned Opc; 3087 if (IsTyped) { 3088 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3089 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3090 } else if (IsFormat) { 3091 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3092 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3093 } else { 3094 switch (MemSize) { 3095 case 1: 3096 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3097 break; 3098 case 2: 3099 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3100 break; 3101 default: 3102 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3103 break; 3104 } 3105 } 3106 3107 if (!VIndex) 3108 VIndex = B.buildConstant(S32, 0).getReg(0); 3109 3110 auto MIB = B.buildInstr(Opc) 3111 .addUse(VData) // vdata 3112 .addUse(RSrc) // rsrc 3113 .addUse(VIndex) // vindex 3114 .addUse(VOffset) // voffset 3115 .addUse(SOffset) // soffset 3116 .addImm(ImmOffset); // offset(imm) 3117 3118 if (IsTyped) 3119 MIB.addImm(Format); 3120 3121 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3122 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3123 .addMemOperand(MMO); 3124 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B, 3132 bool IsFormat, 3133 bool IsTyped) const { 3134 B.setInstr(MI); 3135 3136 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
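  // The operand decoding below mirrors legalizeBufferStore: struct variants
  // carry an extra vindex operand and typed variants an extra format
  // immediate.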
3137 MachineMemOperand *MMO = *MI.memoperands_begin(); 3138 const int MemSize = MMO->getSize(); 3139 const LLT S32 = LLT::scalar(32); 3140 3141 Register Dst = MI.getOperand(0).getReg(); 3142 Register RSrc = MI.getOperand(2).getReg(); 3143 3144 // The typed intrinsics add an immediate after the registers. 3145 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3146 3147 // The struct intrinsic variants add one additional operand over raw. 3148 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3149 Register VIndex; 3150 int OpOffset = 0; 3151 if (HasVIndex) { 3152 VIndex = MI.getOperand(3).getReg(); 3153 OpOffset = 1; 3154 } 3155 3156 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3157 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3158 3159 unsigned Format = 0; 3160 if (IsTyped) { 3161 Format = MI.getOperand(5 + OpOffset).getImm(); 3162 ++OpOffset; 3163 } 3164 3165 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3166 unsigned ImmOffset; 3167 unsigned TotalOffset; 3168 3169 LLT Ty = MRI.getType(Dst); 3170 LLT EltTy = Ty.getScalarType(); 3171 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3172 const bool Unpacked = ST.hasUnpackedD16VMem(); 3173 3174 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3175 if (TotalOffset != 0) 3176 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3177 3178 unsigned Opc; 3179 3180 if (IsTyped) { 3181 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3182 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3183 } else if (IsFormat) { 3184 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3185 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3186 } else { 3187 switch (MemSize) { 3188 case 1: 3189 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3190 break; 3191 case 2: 3192 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3193 break; 3194 default: 3195 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3196 break; 3197 } 3198 } 3199 3200 Register LoadDstReg; 3201 3202 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3203 LLT UnpackedTy = Ty.changeElementSize(32); 3204 3205 if (IsExtLoad) 3206 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3207 else if (Unpacked && IsD16 && Ty.isVector()) 3208 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3209 else 3210 LoadDstReg = Dst; 3211 3212 if (!VIndex) 3213 VIndex = B.buildConstant(S32, 0).getReg(0); 3214 3215 auto MIB = B.buildInstr(Opc) 3216 .addDef(LoadDstReg) // vdata 3217 .addUse(RSrc) // rsrc 3218 .addUse(VIndex) // vindex 3219 .addUse(VOffset) // voffset 3220 .addUse(SOffset) // soffset 3221 .addImm(ImmOffset); // offset(imm) 3222 3223 if (IsTyped) 3224 MIB.addImm(Format); 3225 3226 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3227 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3228 .addMemOperand(MMO); 3229 3230 if (LoadDstReg != Dst) { 3231 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3232 3233 // Widen result for extending loads was widened. 
3234 if (IsExtLoad) 3235 B.buildTrunc(Dst, LoadDstReg); 3236 else { 3237 // Repack to original 16-bit vector result 3238 // FIXME: G_TRUNC should work, but legalization currently fails 3239 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3240 SmallVector<Register, 4> Repack; 3241 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3242 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3243 B.buildMerge(Dst, Repack); 3244 } 3245 } 3246 3247 MI.eraseFromParent(); 3248 return true; 3249 } 3250 3251 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3252 MachineIRBuilder &B, 3253 bool IsInc) const { 3254 B.setInstr(MI); 3255 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3256 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3257 B.buildInstr(Opc) 3258 .addDef(MI.getOperand(0).getReg()) 3259 .addUse(MI.getOperand(2).getReg()) 3260 .addUse(MI.getOperand(3).getReg()) 3261 .cloneMemRefs(MI); 3262 MI.eraseFromParent(); 3263 return true; 3264 } 3265 3266 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3267 switch (IntrID) { 3268 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3269 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3270 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3271 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3272 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3273 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3274 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3275 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3276 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3277 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3278 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3279 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3280 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3281 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3282 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3283 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3284 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3285 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3286 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3287 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3288 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3289 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3290 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3291 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3292 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3293 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3294 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3295 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3296 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3297 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3298 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3299 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3300 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3301 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3302 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3303 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3304 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3305 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3306 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3307 default: 3308 llvm_unreachable("unhandled atomic opcode"); 3309 } 3310 } 3311 3312 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3313 MachineIRBuilder &B, 3314 Intrinsic::ID IID) const { 3315 B.setInstr(MI); 3316 3317 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3318 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3319 3320 Register Dst = MI.getOperand(0).getReg(); 3321 Register VData = 
MI.getOperand(2).getReg(); 3322 3323 Register CmpVal; 3324 int OpOffset = 0; 3325 3326 if (IsCmpSwap) { 3327 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3328 ++OpOffset; 3329 } 3330 3331 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3332 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3333 3334 // The struct intrinsic variants add one additional operand over raw. 3335 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3336 Register VIndex; 3337 if (HasVIndex) { 3338 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3339 ++OpOffset; 3340 } 3341 3342 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3343 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3344 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3345 3346 MachineMemOperand *MMO = *MI.memoperands_begin(); 3347 3348 unsigned ImmOffset; 3349 unsigned TotalOffset; 3350 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3351 if (TotalOffset != 0) 3352 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3353 3354 if (!VIndex) 3355 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3356 3357 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3358 .addDef(Dst) 3359 .addUse(VData); // vdata 3360 3361 if (IsCmpSwap) 3362 MIB.addReg(CmpVal); 3363 3364 MIB.addUse(RSrc) // rsrc 3365 .addUse(VIndex) // vindex 3366 .addUse(VOffset) // voffset 3367 .addUse(SOffset) // soffset 3368 .addImm(ImmOffset) // offset(imm) 3369 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3370 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3371 .addMemOperand(MMO); 3372 3373 MI.eraseFromParent(); 3374 return true; 3375 } 3376 3377 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3378 /// vector with s16 typed elements. 3379 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3380 SmallVectorImpl<Register> &PackedAddrs, 3381 int AddrIdx, int DimIdx, int NumVAddrs, 3382 int NumGradients) { 3383 const LLT S16 = LLT::scalar(16); 3384 const LLT V2S16 = LLT::vector(2, 16); 3385 3386 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3387 Register AddrReg = MI.getOperand(I).getReg(); 3388 3389 if (I < DimIdx) { 3390 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3391 PackedAddrs.push_back(AddrReg); 3392 } else { 3393 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3394 // derivatives dx/dh and dx/dv are packed with undef. 3395 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3396 ((NumGradients / 2) % 2 == 1 && 3397 (I == DimIdx + (NumGradients / 2) - 1 || 3398 I == DimIdx + NumGradients - 1))) { 3399 PackedAddrs.push_back( 3400 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3401 .getReg(0)); 3402 } else { 3403 PackedAddrs.push_back( 3404 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3405 .getReg(0)); 3406 ++I; 3407 } 3408 } 3409 } 3410 } 3411 3412 /// Convert from separate vaddr components to a single vector address register, 3413 /// and replace the remaining operands with $noreg. 
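/// For example (illustrative, not part of the original comment): three
/// separate s32 address operands become one <3 x s32> G_BUILD_VECTOR placed in
/// the first vaddr slot, and the two now-unused operands are set to $noreg.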
3414 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3415                                      int DimIdx, int NumVAddrs) {
3416   SmallVector<Register, 8> AddrRegs(NumVAddrs);
3417   for (int I = 0; I != NumVAddrs; ++I) {
3418     AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
3419     assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32));
3420   }
3421
3422   auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs);
3423   MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3424   for (int I = 1; I != NumVAddrs; ++I)
3425     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3426 }
3427
3428 /// Return the number of address arguments and the number of gradients.
3429 static std::pair<int, int>
3430 getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
3431                  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
3432   const AMDGPU::MIMGDimInfo *DimInfo
3433     = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
3434
3435   int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
3436   int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
3437   int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
3438   int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
3439   return {NumVAddr, NumGradients};
3440 }
3441
3442 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3443                        int NumDefs) {
3444   assert(!BaseOpcode->Atomic);
3445   return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
3446 }
3447
3448 /// Return first address operand index in an image intrinsic.
3449 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3450                                  int NumDefs) {
3451   if (BaseOpcode->Atomic)
3452     return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
3453   return getDMaskIdx(BaseOpcode, NumDefs) + 1;
3454 }
3455
3456 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3457 ///
3458 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3459 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3460 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3461 /// registers.
3462 ///
3463 /// We don't want to directly select image instructions just yet, but also want
3464 /// to expose all register repacking to the legalizer/combiners. We also don't
3465 /// want a selected instruction entering RegBankSelect. In order to avoid
3466 /// defining a multitude of intermediate image instructions, directly hack on
3467 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3468 /// now unnecessary arguments with $noreg.
3469 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3470     MachineInstr &MI, MachineIRBuilder &B,
3471     GISelChangeObserver &Observer,
3472     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3473   B.setInstr(MI);
3474
3475   const int NumDefs = MI.getNumExplicitDefs();
3476   bool IsTFE = NumDefs == 2;
3477   // We are only processing the operands of d16 image operations on subtargets
3478   // that use the unpacked register layout, or need to repack the TFE result.
3479
3480   // TODO: Do we need to guard against already legalized intrinsics?
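// Rough operand layout assumed by the index helpers above (illustrative, not
// normative): [defs...][intrinsic ID][store data or atomic source(s)][dmask
// for non-atomics][vaddr components...][remaining operands]. For example, a
// dmask of 0b1011 selects three components, so DMaskLanes below becomes 3.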
3481   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3482       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3483
3484   MachineRegisterInfo *MRI = B.getMRI();
3485   const LLT S32 = LLT::scalar(32);
3486   const LLT S16 = LLT::scalar(16);
3487   const LLT V2S16 = LLT::vector(2, 16);
3488
3489   // Index of first address argument
3490   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3491
3492   // Check for 16-bit addresses and pack them if found.
3493   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3494   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3495   const bool IsA16 = AddrTy == S16;
3496
3497   int NumVAddrs, NumGradients;
3498   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3499   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3500     getDMaskIdx(BaseOpcode, NumDefs);
3501   unsigned DMask = 0;
3502
3503   int DMaskLanes = 0;
3504   if (!BaseOpcode->Atomic) {
3505     DMask = MI.getOperand(DMaskIdx).getImm();
3506     if (BaseOpcode->Gather4) {
3507       DMaskLanes = 4;
3508     } else if (DMask != 0) {
3509       DMaskLanes = countPopulation(DMask);
3510     } else if (!IsTFE && !BaseOpcode->Store) {
3511       // If dmask is 0, this is a no-op load. This can be eliminated.
3512       B.buildUndef(MI.getOperand(0));
3513       MI.eraseFromParent();
3514       return true;
3515     }
3516   }
3517
3518   Observer.changingInstr(MI);
3519   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3520
3521   unsigned NewOpcode = NumDefs == 0 ?
3522     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3523
3524   // Track that we legalized this
3525   MI.setDesc(B.getTII().get(NewOpcode));
3526
3527   // With TFE enabled we still expect the error flag result even if dmask is 0.
3528   // Force dmask to be at least 1; otherwise the instruction will fail.
3529   if (IsTFE && DMask == 0) {
3530     DMask = 0x1;
3531     DMaskLanes = 1;
3532     MI.getOperand(DMaskIdx).setImm(DMask);
3533   }
3534
3535   // If the register allocator cannot place the address registers contiguously
3536   // without introducing moves, then using the non-sequential address encoding
3537   // is always preferable, since it saves VALU instructions and is usually a
3538   // wash in terms of code size or even better.
3539   //
3540   // However, we currently have no way of hinting to the register allocator
3541   // that MIMG addresses should be placed contiguously when it is possible to
3542   // do so, so force non-NSA for the common 2-address case as a heuristic.
3543   //
3544   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3545   // allocation when possible.
3546   const bool UseNSA = NumVAddrs >= 3 &&
3547     ST.hasFeature(AMDGPU::FeatureNSAEncoding);
3548
3549   // Rewrite the addressing register layout before doing anything else.
3550   if (IsA16) {
3551 #if 0
3552   // FIXME: this feature is missing from gfx10. When that is fixed, this check
3553   // should be introduced.
3554 if (!ST.hasFeature(AMDGPU::FeatureR128A16)) 3555 return false; 3556 #endif 3557 3558 if (NumVAddrs > 1) { 3559 SmallVector<Register, 4> PackedRegs; 3560 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3561 NumGradients); 3562 3563 if (!UseNSA && PackedRegs.size() > 1) { 3564 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3565 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3566 PackedRegs[0] = Concat.getReg(0); 3567 PackedRegs.resize(1); 3568 } 3569 3570 const int NumPacked = PackedRegs.size(); 3571 for (int I = 0; I != NumVAddrs; ++I) { 3572 assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister); 3573 3574 if (I < NumPacked) 3575 MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]); 3576 else 3577 MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister); 3578 } 3579 } 3580 } else if (!UseNSA && NumVAddrs > 1) { 3581 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3582 } 3583 3584 if (BaseOpcode->Store) { // No TFE for stores? 3585 // TODO: Handle dmask trim 3586 Register VData = MI.getOperand(1).getReg(); 3587 LLT Ty = MRI->getType(VData); 3588 if (!Ty.isVector() || Ty.getElementType() != S16) 3589 return true; 3590 3591 B.setInstr(MI); 3592 3593 Register RepackedReg = handleD16VData(B, *MRI, VData); 3594 if (RepackedReg != VData) { 3595 MI.getOperand(1).setReg(RepackedReg); 3596 } 3597 3598 return true; 3599 } 3600 3601 Register DstReg = MI.getOperand(0).getReg(); 3602 LLT Ty = MRI->getType(DstReg); 3603 const LLT EltTy = Ty.getScalarType(); 3604 const bool IsD16 = Ty.getScalarType() == S16; 3605 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3606 3607 // Confirm that the return type is large enough for the dmask specified 3608 if (NumElts < DMaskLanes) 3609 return false; 3610 3611 if (NumElts > 4 || DMaskLanes > 4) 3612 return false; 3613 3614 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3615 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3616 3617 // The raw dword aligned data component of the load. The only legal cases 3618 // where this matters should be when using the packed D16 format, for 3619 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3620 LLT RoundedTy; 3621 3622 // S32 vector to to cover all data, plus TFE result element. 3623 LLT TFETy; 3624 3625 // Register type to use for each loaded component. Will be S32 or V2S16. 3626 LLT RegTy; 3627 3628 if (IsD16 && ST.hasUnpackedD16VMem()) { 3629 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3630 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3631 RegTy = S32; 3632 } else { 3633 unsigned EltSize = EltTy.getSizeInBits(); 3634 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3635 unsigned RoundedSize = 32 * RoundedElts; 3636 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3637 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3638 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3639 } 3640 3641 // The return type does not need adjustment. 3642 // TODO: Should we change s16 case to s32 or <2 x s16>? 3643 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3644 return true; 3645 3646 Register Dst1Reg; 3647 3648 // Insert after the instruction. 3649 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3650 3651 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3652 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3653 const LLT LoadResultTy = IsTFE ? 
TFETy : RoundedTy; 3654 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3655 3656 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3657 3658 MI.getOperand(0).setReg(NewResultReg); 3659 3660 // In the IR, TFE is supposed to be used with a 2 element struct return 3661 // type. The intruction really returns these two values in one contiguous 3662 // register, with one additional dword beyond the loaded data. Rewrite the 3663 // return type to use a single register result. 3664 3665 if (IsTFE) { 3666 Dst1Reg = MI.getOperand(1).getReg(); 3667 if (MRI->getType(Dst1Reg) != S32) 3668 return false; 3669 3670 // TODO: Make sure the TFE operand bit is set. 3671 MI.RemoveOperand(1); 3672 3673 // Handle the easy case that requires no repack instructions. 3674 if (Ty == S32) { 3675 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3676 return true; 3677 } 3678 } 3679 3680 // Now figure out how to copy the new result register back into the old 3681 // result. 3682 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3683 3684 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3685 3686 if (ResultNumRegs == 1) { 3687 assert(!IsTFE); 3688 ResultRegs[0] = NewResultReg; 3689 } else { 3690 // We have to repack into a new vector of some kind. 3691 for (int I = 0; I != NumDataRegs; ++I) 3692 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3693 B.buildUnmerge(ResultRegs, NewResultReg); 3694 3695 // Drop the final TFE element to get the data part. The TFE result is 3696 // directly written to the right place already. 3697 if (IsTFE) 3698 ResultRegs.resize(NumDataRegs); 3699 } 3700 3701 // For an s16 scalar result, we form an s32 result with a truncate regardless 3702 // of packed vs. unpacked. 3703 if (IsD16 && !Ty.isVector()) { 3704 B.buildTrunc(DstReg, ResultRegs[0]); 3705 return true; 3706 } 3707 3708 // Avoid a build/concat_vector of 1 entry. 3709 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 3710 B.buildBitcast(DstReg, ResultRegs[0]); 3711 return true; 3712 } 3713 3714 assert(Ty.isVector()); 3715 3716 if (IsD16) { 3717 // For packed D16 results with TFE enabled, all the data components are 3718 // S32. Cast back to the expected type. 3719 // 3720 // TODO: We don't really need to use load s32 elements. We would only need one 3721 // cast for the TFE result if a multiple of v2s16 was used. 3722 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3723 for (Register &Reg : ResultRegs) 3724 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3725 } else if (ST.hasUnpackedD16VMem()) { 3726 for (Register &Reg : ResultRegs) 3727 Reg = B.buildTrunc(S16, Reg).getReg(0); 3728 } 3729 } 3730 3731 auto padWithUndef = [&](LLT Ty, int NumElts) { 3732 if (NumElts == 0) 3733 return; 3734 Register Undef = B.buildUndef(Ty).getReg(0); 3735 for (int I = 0; I != NumElts; ++I) 3736 ResultRegs.push_back(Undef); 3737 }; 3738 3739 // Pad out any elements eliminated due to the dmask. 3740 LLT ResTy = MRI->getType(ResultRegs[0]); 3741 if (!ResTy.isVector()) { 3742 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3743 B.buildBuildVector(DstReg, ResultRegs); 3744 return true; 3745 } 3746 3747 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3748 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3749 3750 // Deal with the one annoying legal case. 
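// Illustrative walk-through (not from the original source): a <3 x s16> result
// is loaded as <4 x s16>, i.e. two v2s16 pieces; one extra undef v2s16 is
// appended so the G_CONCAT_VECTORS below forms <6 x s16>, which can then be
// unmerged into two <3 x s16> halves: the real result plus a scratch register
// that is simply discarded.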
3751 const LLT V3S16 = LLT::vector(3, 16); 3752 if (Ty == V3S16) { 3753 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 3754 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 3755 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 3756 return true; 3757 } 3758 3759 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 3760 B.buildConcatVectors(DstReg, ResultRegs); 3761 return true; 3762 } 3763 3764 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3765 MachineInstr &MI, MachineIRBuilder &B, 3766 GISelChangeObserver &Observer) const { 3767 Register Dst = MI.getOperand(0).getReg(); 3768 LLT Ty = B.getMRI()->getType(Dst); 3769 unsigned Size = Ty.getSizeInBits(); 3770 MachineFunction &MF = B.getMF(); 3771 3772 Observer.changingInstr(MI); 3773 3774 // FIXME: We don't really need this intermediate instruction. The intrinsic 3775 // should be fixed to have a memory operand. Since it's readnone, we're not 3776 // allowed to add one. 3777 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3778 MI.RemoveOperand(1); // Remove intrinsic ID 3779 3780 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 3781 // TODO: Should this use datalayout alignment? 3782 const unsigned MemSize = (Size + 7) / 8; 3783 const unsigned MemAlign = 4; 3784 MachineMemOperand *MMO = MF.getMachineMemOperand( 3785 MachinePointerInfo(), 3786 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3787 MachineMemOperand::MOInvariant, MemSize, MemAlign); 3788 MI.addMemOperand(MF, MMO); 3789 3790 // There are no 96-bit result scalar loads, but widening to 128-bit should 3791 // always be legal. We may need to restore this to a 96-bit result if it turns 3792 // out this needs to be converted to a vector load during RegBankSelect. 3793 if (!isPowerOf2_32(Size)) { 3794 LegalizerHelper Helper(MF, *this, Observer, B); 3795 B.setInstr(MI); 3796 3797 if (Ty.isVector()) 3798 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3799 else 3800 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3801 } 3802 3803 Observer.changedInstr(MI); 3804 return true; 3805 } 3806 3807 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 3808 MachineRegisterInfo &MRI, 3809 MachineIRBuilder &B) const { 3810 B.setInstr(MI); 3811 3812 // Is non-HSA path or trap-handler disabled? 
If so, insert an s_endpgm instruction.
3813   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3814       !ST.isTrapHandlerEnabled()) {
3815     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3816   } else {
3817     // Pass queue pointer to trap handler as input, and insert trap instruction
3818     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3819     const ArgDescriptor *Arg =
3820         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3821     if (!Arg)
3822       return false;
3823     MachineRegisterInfo &MRI = *B.getMRI();
3824     Register SGPR01(AMDGPU::SGPR0_SGPR1);
3825     Register LiveIn = getLiveInRegister(
3826         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3827         /*InsertLiveInCopy=*/false);
3828     if (!loadInputValue(LiveIn, B, Arg))
3829       return false;
3830     B.buildCopy(SGPR01, LiveIn);
3831     B.buildInstr(AMDGPU::S_TRAP)
3832         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3833         .addReg(SGPR01, RegState::Implicit);
3834   }
3835
3836   MI.eraseFromParent();
3837   return true;
3838 }
3839
3840 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3841     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3842   B.setInstr(MI);
3843
3844   // If this is the non-HSA path or the trap handler is disabled, report a
3845   // warning instead of emitting a trap.
3846   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3847       !ST.isTrapHandlerEnabled()) {
3848     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3849                                      "debugtrap handler not supported",
3850                                      MI.getDebugLoc(), DS_Warning);
3851     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3852     Ctx.diagnose(NoTrap);
3853   } else {
3854     // Insert debug-trap instruction
3855     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3856   }
3857
3858   MI.eraseFromParent();
3859   return true;
3860 }
3861
3862 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3863                                             MachineIRBuilder &B,
3864                                             GISelChangeObserver &Observer) const {
3865   MachineRegisterInfo &MRI = *B.getMRI();
3866
3867   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
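// Illustrative sketch of that rewrite (register names are ours): a G_BRCOND on
// the boolean result of @llvm.amdgcn.if is replaced at the branch point by
//   %def = SI_IF %use, %bb.target
// with %def and %use constrained to the wave mask register class; the original
// intrinsic and the G_BRCOND are then erased.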
3868 auto IntrID = MI.getIntrinsicID(); 3869 switch (IntrID) { 3870 case Intrinsic::amdgcn_if: 3871 case Intrinsic::amdgcn_else: { 3872 MachineInstr *Br = nullptr; 3873 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3874 const SIRegisterInfo *TRI 3875 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3876 3877 B.setInstr(*BrCond); 3878 Register Def = MI.getOperand(1).getReg(); 3879 Register Use = MI.getOperand(3).getReg(); 3880 3881 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3882 if (Br) 3883 BrTarget = Br->getOperand(0).getMBB(); 3884 3885 if (IntrID == Intrinsic::amdgcn_if) { 3886 B.buildInstr(AMDGPU::SI_IF) 3887 .addDef(Def) 3888 .addUse(Use) 3889 .addMBB(BrTarget); 3890 } else { 3891 B.buildInstr(AMDGPU::SI_ELSE) 3892 .addDef(Def) 3893 .addUse(Use) 3894 .addMBB(BrTarget) 3895 .addImm(0); 3896 } 3897 3898 if (Br) 3899 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3900 3901 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3902 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3903 MI.eraseFromParent(); 3904 BrCond->eraseFromParent(); 3905 return true; 3906 } 3907 3908 return false; 3909 } 3910 case Intrinsic::amdgcn_loop: { 3911 MachineInstr *Br = nullptr; 3912 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3913 const SIRegisterInfo *TRI 3914 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3915 3916 B.setInstr(*BrCond); 3917 3918 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3919 if (Br) 3920 BrTarget = Br->getOperand(0).getMBB(); 3921 3922 Register Reg = MI.getOperand(2).getReg(); 3923 B.buildInstr(AMDGPU::SI_LOOP) 3924 .addUse(Reg) 3925 .addMBB(BrTarget); 3926 3927 if (Br) 3928 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3929 3930 MI.eraseFromParent(); 3931 BrCond->eraseFromParent(); 3932 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3933 return true; 3934 } 3935 3936 return false; 3937 } 3938 case Intrinsic::amdgcn_kernarg_segment_ptr: 3939 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 3940 B.setInstr(MI); 3941 // This only makes sense to call in a kernel, so just lower to null. 
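// (Illustrative note, not from the original source: MI.getOperand(0) is the
// intrinsic's pointer result, so this materializes a zero, i.e. a null kernarg
// segment pointer, for non-kernel callers.)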
3942 B.buildConstant(MI.getOperand(0).getReg(), 0); 3943 MI.eraseFromParent(); 3944 return true; 3945 } 3946 3947 return legalizePreloadedArgIntrin( 3948 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3949 case Intrinsic::amdgcn_implicitarg_ptr: 3950 return legalizeImplicitArgPtr(MI, MRI, B); 3951 case Intrinsic::amdgcn_workitem_id_x: 3952 return legalizePreloadedArgIntrin(MI, MRI, B, 3953 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3954 case Intrinsic::amdgcn_workitem_id_y: 3955 return legalizePreloadedArgIntrin(MI, MRI, B, 3956 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3957 case Intrinsic::amdgcn_workitem_id_z: 3958 return legalizePreloadedArgIntrin(MI, MRI, B, 3959 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3960 case Intrinsic::amdgcn_workgroup_id_x: 3961 return legalizePreloadedArgIntrin(MI, MRI, B, 3962 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3963 case Intrinsic::amdgcn_workgroup_id_y: 3964 return legalizePreloadedArgIntrin(MI, MRI, B, 3965 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3966 case Intrinsic::amdgcn_workgroup_id_z: 3967 return legalizePreloadedArgIntrin(MI, MRI, B, 3968 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3969 case Intrinsic::amdgcn_dispatch_ptr: 3970 return legalizePreloadedArgIntrin(MI, MRI, B, 3971 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3972 case Intrinsic::amdgcn_queue_ptr: 3973 return legalizePreloadedArgIntrin(MI, MRI, B, 3974 AMDGPUFunctionArgInfo::QUEUE_PTR); 3975 case Intrinsic::amdgcn_implicit_buffer_ptr: 3976 return legalizePreloadedArgIntrin( 3977 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3978 case Intrinsic::amdgcn_dispatch_id: 3979 return legalizePreloadedArgIntrin(MI, MRI, B, 3980 AMDGPUFunctionArgInfo::DISPATCH_ID); 3981 case Intrinsic::amdgcn_fdiv_fast: 3982 return legalizeFDIVFastIntrin(MI, MRI, B); 3983 case Intrinsic::amdgcn_is_shared: 3984 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3985 case Intrinsic::amdgcn_is_private: 3986 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3987 case Intrinsic::amdgcn_wavefrontsize: { 3988 B.setInstr(MI); 3989 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3990 MI.eraseFromParent(); 3991 return true; 3992 } 3993 case Intrinsic::amdgcn_s_buffer_load: 3994 return legalizeSBufferLoad(MI, B, Observer); 3995 case Intrinsic::amdgcn_raw_buffer_store: 3996 case Intrinsic::amdgcn_struct_buffer_store: 3997 return legalizeBufferStore(MI, MRI, B, false, false); 3998 case Intrinsic::amdgcn_raw_buffer_store_format: 3999 case Intrinsic::amdgcn_struct_buffer_store_format: 4000 return legalizeBufferStore(MI, MRI, B, false, true); 4001 case Intrinsic::amdgcn_raw_tbuffer_store: 4002 case Intrinsic::amdgcn_struct_tbuffer_store: 4003 return legalizeBufferStore(MI, MRI, B, true, true); 4004 case Intrinsic::amdgcn_raw_buffer_load: 4005 case Intrinsic::amdgcn_struct_buffer_load: 4006 return legalizeBufferLoad(MI, MRI, B, false, false); 4007 case Intrinsic::amdgcn_raw_buffer_load_format: 4008 case Intrinsic::amdgcn_struct_buffer_load_format: 4009 return legalizeBufferLoad(MI, MRI, B, true, false); 4010 case Intrinsic::amdgcn_raw_tbuffer_load: 4011 case Intrinsic::amdgcn_struct_tbuffer_load: 4012 return legalizeBufferLoad(MI, MRI, B, true, true); 4013 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4014 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4015 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4016 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4017 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4019 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4020 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4021 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4022 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4023 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4024 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4025 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4027 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4028 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4029 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4031 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4032 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4033 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4034 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4035 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4037 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4038 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4039 return legalizeBufferAtomic(MI, B, IntrID); 4040 case Intrinsic::amdgcn_atomic_inc: 4041 return legalizeAtomicIncDec(MI, B, true); 4042 case Intrinsic::amdgcn_atomic_dec: 4043 return legalizeAtomicIncDec(MI, B, false); 4044 case Intrinsic::trap: 4045 return legalizeTrapIntrinsic(MI, MRI, B); 4046 case Intrinsic::debugtrap: 4047 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4048 default: { 4049 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4050 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4051 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4052 return true; 4053 } 4054 } 4055 4056 return true; 4057 } 4058