1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 30 #include "llvm/CodeGen/TargetOpcodes.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "amdgpu-legalinfo" 38 39 using namespace llvm; 40 using namespace LegalizeActions; 41 using namespace LegalizeMutations; 42 using namespace LegalityPredicates; 43 using namespace MIPatternMatch; 44 45 // Round the number of elements to the next power of two elements 46 static LLT getPow2VectorType(LLT Ty) { 47 unsigned NElts = Ty.getNumElements(); 48 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 49 return Ty.changeNumElements(Pow2NElts); 50 } 51 52 // Round the number of bits to the next power of two bits 53 static LLT getPow2ScalarType(LLT Ty) { 54 unsigned Bits = Ty.getSizeInBits(); 55 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 56 return LLT::scalar(Pow2Bits); 57 } 58 59 static LegalityPredicate isMultiple32(unsigned TypeIdx, 60 unsigned MaxSize = 1024) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 const LLT EltTy = Ty.getScalarType(); 64 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 65 }; 66 } 67 68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 69 return [=](const LegalityQuery &Query) { 70 return Query.Types[TypeIdx].getSizeInBits() == Size; 71 }; 72 } 73 74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 75 return [=](const LegalityQuery &Query) { 76 const LLT Ty = Query.Types[TypeIdx]; 77 return Ty.isVector() && 78 Ty.getNumElements() % 2 != 0 && 79 Ty.getElementType().getSizeInBits() < 32 && 80 Ty.getSizeInBits() % 32 != 0; 81 }; 82 } 83 84 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 85 return [=](const LegalityQuery &Query) { 86 const LLT Ty = Query.Types[TypeIdx]; 87 const LLT EltTy = Ty.getScalarType(); 88 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 89 }; 90 } 91 92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 93 return [=](const LegalityQuery &Query) { 94 const LLT Ty = Query.Types[TypeIdx]; 95 const LLT EltTy = Ty.getElementType(); 96 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 97 }; 98 } 99 100 
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT QueryTy = Query.Types[TypeIdx]; 176 if (!QueryTy.isVector()) 177 return false; 178 const LLT EltTy = QueryTy.getElementType(); 179 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 180 }; 181 } 182 183 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 184 return [=](const LegalityQuery &Query) { 185 const LLT Ty = Query.Types[TypeIdx]; 186 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 187 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 188 }; 189 } 190 191 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 192 return [=](const LegalityQuery &Query) { 193 return Query.Types[TypeIdx0].getSizeInBits() < 194 Query.Types[TypeIdx1].getSizeInBits(); 195 }; 196 } 197 198 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 199 return [=](const LegalityQuery &Query) { 200 return Query.Types[TypeIdx0].getSizeInBits() > 201 Query.Types[TypeIdx1].getSizeInBits(); 202 }; 203 } 204 205 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 206 const GCNTargetMachine &TM) 207 : ST(ST_) { 208 using namespace TargetOpcode; 209 210 auto GetAddrSpacePtr = [&TM](unsigned AS) { 211 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 212 }; 213 214 const LLT S1 = LLT::scalar(1); 215 const LLT S16 = LLT::scalar(16); 216 const LLT S32 = LLT::scalar(32); 217 const LLT S64 = LLT::scalar(64); 218 const LLT S128 = LLT::scalar(128); 219 const LLT S256 = LLT::scalar(256); 220 const LLT S1024 = LLT::scalar(1024); 221 222 const LLT V2S16 = LLT::vector(2, 16); 223 const LLT V4S16 = LLT::vector(4, 16); 224 225 const LLT V2S32 = LLT::vector(2, 32); 226 const LLT V3S32 = LLT::vector(3, 32); 227 const LLT V4S32 = LLT::vector(4, 32); 228 const LLT V5S32 = LLT::vector(5, 32); 229 const LLT V6S32 = LLT::vector(6, 32); 230 const LLT V7S32 = LLT::vector(7, 32); 231 const LLT V8S32 = LLT::vector(8, 32); 232 const LLT V9S32 = LLT::vector(9, 32); 233 const LLT V10S32 = LLT::vector(10, 32); 234 const LLT V11S32 = LLT::vector(11, 32); 235 const LLT V12S32 = LLT::vector(12, 32); 236 const LLT V13S32 = LLT::vector(13, 32); 237 const LLT V14S32 = LLT::vector(14, 32); 238 const LLT V15S32 = LLT::vector(15, 32); 239 const LLT V16S32 = LLT::vector(16, 32); 240 const LLT V32S32 = LLT::vector(32, 32); 241 242 const LLT V2S64 = LLT::vector(2, 64); 243 const LLT V3S64 = LLT::vector(3, 64); 244 const LLT V4S64 = LLT::vector(4, 64); 245 const LLT V5S64 = LLT::vector(5, 64); 246 const LLT V6S64 = LLT::vector(6, 64); 247 const LLT V7S64 = LLT::vector(7, 64); 248 const LLT V8S64 = LLT::vector(8, 64); 249 const LLT V16S64 = LLT::vector(16, 64); 250 251 std::initializer_list<LLT> AllS32Vectors = 252 {V2S32, V3S32, 
V4S32, V5S32, V6S32, V7S32, V8S32, 253 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 254 std::initializer_list<LLT> AllS64Vectors = 255 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 256 257 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 258 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 259 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 260 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 261 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 262 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 263 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 264 265 const LLT CodePtr = FlatPtr; 266 267 const std::initializer_list<LLT> AddrSpaces64 = { 268 GlobalPtr, ConstantPtr, FlatPtr 269 }; 270 271 const std::initializer_list<LLT> AddrSpaces32 = { 272 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 273 }; 274 275 const std::initializer_list<LLT> FPTypesBase = { 276 S32, S64 277 }; 278 279 const std::initializer_list<LLT> FPTypes16 = { 280 S32, S64, S16 281 }; 282 283 const std::initializer_list<LLT> FPTypesPK16 = { 284 S32, S64, S16, V2S16 285 }; 286 287 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 288 289 setAction({G_BRCOND, S1}, Legal); // VCC branches 290 setAction({G_BRCOND, S32}, Legal); // SCC branches 291 292 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 293 // elements for v3s16 294 getActionDefinitionsBuilder(G_PHI) 295 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 296 .legalFor(AllS32Vectors) 297 .legalFor(AllS64Vectors) 298 .legalFor(AddrSpaces64) 299 .legalFor(AddrSpaces32) 300 .clampScalar(0, S32, S256) 301 .widenScalarToNextPow2(0, 32) 302 .clampMaxNumElements(0, S32, 16) 303 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 304 .legalIf(isPointer(0)); 305 306 if (ST.hasVOP3PInsts()) { 307 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 308 .legalFor({S32, S16, V2S16}) 309 .clampScalar(0, S16, S32) 310 .clampMaxNumElements(0, S16, 2) 311 .scalarize(0) 312 .widenScalarToNextPow2(0, 32); 313 } else if (ST.has16BitInsts()) { 314 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 315 .legalFor({S32, S16}) 316 .clampScalar(0, S16, S32) 317 .scalarize(0) 318 .widenScalarToNextPow2(0, 32); 319 } else { 320 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 321 .legalFor({S32}) 322 .clampScalar(0, S32, S32) 323 .scalarize(0); 324 } 325 326 // FIXME: Not really legal. Placeholder for custom lowering. 327 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 328 .customFor({S32, S64}) 329 .clampScalar(0, S32, S64) 330 .widenScalarToNextPow2(0, 32) 331 .scalarize(0); 332 333 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 334 .legalFor({S32}) 335 .clampScalar(0, S32, S32) 336 .scalarize(0); 337 338 // Report legal for any types we can handle anywhere. For the cases only legal 339 // on the SALU, RegBankSelect will be able to re-legalize. 
340 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 341 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 342 .clampScalar(0, S32, S64) 343 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 344 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 345 .widenScalarToNextPow2(0) 346 .scalarize(0); 347 348 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 349 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 350 .legalFor({{S32, S1}, {S32, S32}}) 351 .minScalar(0, S32) 352 // TODO: .scalarize(0) 353 .lower(); 354 355 getActionDefinitionsBuilder(G_BITCAST) 356 // Don't worry about the size constraint. 357 .legalIf(all(isRegisterType(0), isRegisterType(1))) 358 .lower(); 359 360 361 getActionDefinitionsBuilder(G_CONSTANT) 362 .legalFor({S1, S32, S64, S16, GlobalPtr, 363 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 364 .clampScalar(0, S32, S64) 365 .widenScalarToNextPow2(0) 366 .legalIf(isPointer(0)); 367 368 getActionDefinitionsBuilder(G_FCONSTANT) 369 .legalFor({S32, S64, S16}) 370 .clampScalar(0, S16, S64); 371 372 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 373 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 374 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 375 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 376 .clampScalarOrElt(0, S32, S1024) 377 .legalIf(isMultiple32(0)) 378 .widenScalarToNextPow2(0, 32) 379 .clampMaxNumElements(0, S32, 16); 380 381 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 382 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 383 .unsupportedFor({PrivatePtr}) 384 .custom(); 385 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 386 387 auto &FPOpActions = getActionDefinitionsBuilder( 388 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 389 .legalFor({S32, S64}); 390 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 391 .customFor({S32, S64}); 392 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 393 .customFor({S32, S64}); 394 395 if (ST.has16BitInsts()) { 396 if (ST.hasVOP3PInsts()) 397 FPOpActions.legalFor({S16, V2S16}); 398 else 399 FPOpActions.legalFor({S16}); 400 401 TrigActions.customFor({S16}); 402 FDIVActions.customFor({S16}); 403 } 404 405 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 406 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 407 408 if (ST.hasVOP3PInsts()) { 409 MinNumMaxNum.customFor(FPTypesPK16) 410 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 411 .clampMaxNumElements(0, S16, 2) 412 .clampScalar(0, S16, S64) 413 .scalarize(0); 414 } else if (ST.has16BitInsts()) { 415 MinNumMaxNum.customFor(FPTypes16) 416 .clampScalar(0, S16, S64) 417 .scalarize(0); 418 } else { 419 MinNumMaxNum.customFor(FPTypesBase) 420 .clampScalar(0, S32, S64) 421 .scalarize(0); 422 } 423 424 if (ST.hasVOP3PInsts()) 425 FPOpActions.clampMaxNumElements(0, S16, 2); 426 427 FPOpActions 428 .scalarize(0) 429 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 430 431 TrigActions 432 .scalarize(0) 433 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 434 435 FDIVActions 436 .scalarize(0) 437 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 438 439 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 440 .legalFor(FPTypesPK16) 441 .clampMaxNumElements(0, S16, 2) 442 .scalarize(0) 443 .clampScalar(0, S16, S64); 444 445 if (ST.has16BitInsts()) { 446 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 447 .legalFor({S32, S64, S16}) 448 .scalarize(0) 449 .clampScalar(0, S16, S64); 450 } else { 451 getActionDefinitionsBuilder(G_FSQRT) 452 .legalFor({S32, S64}) 453 .scalarize(0) 454 .clampScalar(0, S32, S64); 455 456 if (ST.hasFractBug()) { 457 getActionDefinitionsBuilder(G_FFLOOR) 458 .customFor({S64}) 459 .legalFor({S32, S64}) 460 .scalarize(0) 461 .clampScalar(0, S32, S64); 462 } else { 463 getActionDefinitionsBuilder(G_FFLOOR) 464 .legalFor({S32, S64}) 465 .scalarize(0) 466 .clampScalar(0, S32, S64); 467 } 468 } 469 470 getActionDefinitionsBuilder(G_FPTRUNC) 471 .legalFor({{S32, S64}, {S16, S32}}) 472 .scalarize(0) 473 .lower(); 474 475 getActionDefinitionsBuilder(G_FPEXT) 476 .legalFor({{S64, S32}, {S32, S16}}) 477 .lowerFor({{S64, S16}}) // FIXME: Implement 478 .scalarize(0); 479 480 getActionDefinitionsBuilder(G_FSUB) 481 // Use actual fsub instruction 482 .legalFor({S32}) 483 // Must use fadd + fneg 484 .lowerFor({S64, S16, V2S16}) 485 .scalarize(0) 486 .clampScalar(0, S32, S64); 487 488 // Whether this is legal depends on the floating point mode for the function. 489 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 490 if (ST.hasMadF16()) 491 FMad.customFor({S32, S16}); 492 else 493 FMad.customFor({S32}); 494 FMad.scalarize(0) 495 .lower(); 496 497 // TODO: Do we need to clamp maximum bitwidth? 498 getActionDefinitionsBuilder(G_TRUNC) 499 .legalIf(isScalar(0)) 500 .legalFor({{V2S16, V2S32}}) 501 .clampMaxNumElements(0, S16, 2) 502 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 503 // situations (like an invalid implicit use), we don't want to infinite loop 504 // in the legalizer. 505 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 506 .alwaysLegal(); 507 508 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 509 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 510 {S32, S1}, {S64, S1}, {S16, S1}}) 511 .scalarize(0) 512 .clampScalar(0, S32, S64) 513 .widenScalarToNextPow2(1, 32); 514 515 // TODO: Split s1->s64 during regbankselect for VALU. 
516 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 517 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 518 .lowerFor({{S32, S64}}) 519 .lowerIf(typeIs(1, S1)) 520 .customFor({{S64, S64}}); 521 if (ST.has16BitInsts()) 522 IToFP.legalFor({{S16, S16}}); 523 IToFP.clampScalar(1, S32, S64) 524 .scalarize(0) 525 .widenScalarToNextPow2(1); 526 527 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 528 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 529 .customFor({{S64, S64}}); 530 if (ST.has16BitInsts()) 531 FPToI.legalFor({{S16, S16}}); 532 else 533 FPToI.minScalar(1, S32); 534 535 FPToI.minScalar(0, S32) 536 .scalarize(0) 537 .lower(); 538 539 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 540 .scalarize(0) 541 .lower(); 542 543 if (ST.has16BitInsts()) { 544 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 545 .legalFor({S16, S32, S64}) 546 .clampScalar(0, S16, S64) 547 .scalarize(0); 548 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 549 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 550 .legalFor({S32, S64}) 551 .clampScalar(0, S32, S64) 552 .scalarize(0); 553 } else { 554 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 555 .legalFor({S32}) 556 .customFor({S64}) 557 .clampScalar(0, S32, S64) 558 .scalarize(0); 559 } 560 561 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 562 .scalarize(0) 563 .alwaysLegal(); 564 565 auto &CmpBuilder = 566 getActionDefinitionsBuilder(G_ICMP) 567 // The compare output type differs based on the register bank of the output, 568 // so make both s1 and s32 legal. 569 // 570 // Scalar compares producing output in scc will be promoted to s32, as that 571 // is the allocatable register type that will be needed for the copy from 572 // scc. This will be promoted during RegBankSelect, and we assume something 573 // before that won't try to use s32 result types. 574 // 575 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 576 // bank. 577 .legalForCartesianProduct( 578 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 579 .legalForCartesianProduct( 580 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 581 if (ST.has16BitInsts()) { 582 CmpBuilder.legalFor({{S1, S16}}); 583 } 584 585 CmpBuilder 586 .widenScalarToNextPow2(1) 587 .clampScalar(1, S32, S64) 588 .scalarize(0) 589 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 590 591 getActionDefinitionsBuilder(G_FCMP) 592 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 593 .widenScalarToNextPow2(1) 594 .clampScalar(1, S32, S64) 595 .scalarize(0); 596 597 // FIXME: fpow has a selection pattern that should move to custom lowering. 598 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 599 if (ST.has16BitInsts()) 600 Exp2Ops.legalFor({S32, S16}); 601 else 602 Exp2Ops.legalFor({S32}); 603 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 604 Exp2Ops.scalarize(0); 605 606 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 607 if (ST.has16BitInsts()) 608 ExpOps.customFor({{S32}, {S16}}); 609 else 610 ExpOps.customFor({S32}); 611 ExpOps.clampScalar(0, MinScalarFPTy, S32) 612 .scalarize(0); 613 614 // The 64-bit versions produce 32-bit results, but only on the SALU. 
615 getActionDefinitionsBuilder(G_CTPOP) 616 .legalFor({{S32, S32}, {S32, S64}}) 617 .clampScalar(0, S32, S32) 618 .clampScalar(1, S32, S64) 619 .scalarize(0) 620 .widenScalarToNextPow2(0, 32) 621 .widenScalarToNextPow2(1, 32); 622 623 // The hardware instructions return a different result on 0 than the generic 624 // instructions expect. The hardware produces -1, but these produce the 625 // bitwidth. 626 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 627 .scalarize(0) 628 .clampScalar(0, S32, S32) 629 .clampScalar(1, S32, S64) 630 .widenScalarToNextPow2(0, 32) 631 .widenScalarToNextPow2(1, 32) 632 .lower(); 633 634 // The 64-bit versions produce 32-bit results, but only on the SALU. 635 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 636 .legalFor({{S32, S32}, {S32, S64}}) 637 .clampScalar(0, S32, S32) 638 .clampScalar(1, S32, S64) 639 .scalarize(0) 640 .widenScalarToNextPow2(0, 32) 641 .widenScalarToNextPow2(1, 32); 642 643 getActionDefinitionsBuilder(G_BITREVERSE) 644 .legalFor({S32}) 645 .clampScalar(0, S32, S32) 646 .scalarize(0); 647 648 if (ST.has16BitInsts()) { 649 getActionDefinitionsBuilder(G_BSWAP) 650 .legalFor({S16, S32, V2S16}) 651 .clampMaxNumElements(0, S16, 2) 652 // FIXME: Fixing non-power-of-2 before clamp is workaround for 653 // narrowScalar limitation. 654 .widenScalarToNextPow2(0) 655 .clampScalar(0, S16, S32) 656 .scalarize(0); 657 658 if (ST.hasVOP3PInsts()) { 659 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 660 .legalFor({S32, S16, V2S16}) 661 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 662 .clampMaxNumElements(0, S16, 2) 663 .minScalar(0, S16) 664 .widenScalarToNextPow2(0) 665 .scalarize(0) 666 .lower(); 667 } else { 668 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 669 .legalFor({S32, S16}) 670 .widenScalarToNextPow2(0) 671 .minScalar(0, S16) 672 .scalarize(0) 673 .lower(); 674 } 675 } else { 676 // TODO: Should have same legality without v_perm_b32 677 getActionDefinitionsBuilder(G_BSWAP) 678 .legalFor({S32}) 679 .lowerIf(narrowerThan(0, 32)) 680 // FIXME: Fixing non-power-of-2 before clamp is workaround for 681 // narrowScalar limitation. 
682 .widenScalarToNextPow2(0) 683 .maxScalar(0, S32) 684 .scalarize(0) 685 .lower(); 686 687 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 688 .legalFor({S32}) 689 .minScalar(0, S32) 690 .widenScalarToNextPow2(0) 691 .scalarize(0) 692 .lower(); 693 } 694 695 getActionDefinitionsBuilder(G_INTTOPTR) 696 // List the common cases 697 .legalForCartesianProduct(AddrSpaces64, {S64}) 698 .legalForCartesianProduct(AddrSpaces32, {S32}) 699 .scalarize(0) 700 // Accept any address space as long as the size matches 701 .legalIf(sameSize(0, 1)) 702 .widenScalarIf(smallerThan(1, 0), 703 [](const LegalityQuery &Query) { 704 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 705 }) 706 .narrowScalarIf(greaterThan(1, 0), 707 [](const LegalityQuery &Query) { 708 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 709 }); 710 711 getActionDefinitionsBuilder(G_PTRTOINT) 712 // List the common cases 713 .legalForCartesianProduct(AddrSpaces64, {S64}) 714 .legalForCartesianProduct(AddrSpaces32, {S32}) 715 .scalarize(0) 716 // Accept any address space as long as the size matches 717 .legalIf(sameSize(0, 1)) 718 .widenScalarIf(smallerThan(0, 1), 719 [](const LegalityQuery &Query) { 720 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 721 }) 722 .narrowScalarIf( 723 greaterThan(0, 1), 724 [](const LegalityQuery &Query) { 725 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 726 }); 727 728 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 729 .scalarize(0) 730 .custom(); 731 732 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 733 // handle some operations by just promoting the register during 734 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 735 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 736 switch (AS) { 737 // FIXME: Private element size. 738 case AMDGPUAS::PRIVATE_ADDRESS: 739 return 32; 740 // FIXME: Check subtarget 741 case AMDGPUAS::LOCAL_ADDRESS: 742 return ST.useDS128() ? 128 : 64; 743 744 // Treat constant and global as identical. SMRD loads are sometimes usable 745 // for global loads (ideally constant address space should be eliminated) 746 // depending on the context. Legality cannot be context dependent, but 747 // RegBankSelect can split the load as necessary depending on the pointer 748 // register bank/uniformity and if the memory is invariant or not written in 749 // a kernel. 750 case AMDGPUAS::CONSTANT_ADDRESS: 751 case AMDGPUAS::GLOBAL_ADDRESS: 752 return IsLoad ? 512 : 128; 753 default: 754 return 128; 755 } 756 }; 757 758 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 759 bool IsLoad) -> bool { 760 const LLT DstTy = Query.Types[0]; 761 762 // Split vector extloads. 763 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 764 unsigned Align = Query.MMODescrs[0].AlignInBits; 765 766 if (MemSize < DstTy.getSizeInBits()) 767 MemSize = std::max(MemSize, Align); 768 769 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 770 return true; 771 772 const LLT PtrTy = Query.Types[1]; 773 unsigned AS = PtrTy.getAddressSpace(); 774 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 775 return true; 776 777 // Catch weird sized loads that don't evenly divide into the access sizes 778 // TODO: May be able to widen depending on alignment etc. 
779 unsigned NumRegs = (MemSize + 31) / 32; 780 if (NumRegs == 3) { 781 if (!ST.hasDwordx3LoadStores()) 782 return true; 783 } else { 784 // If the alignment allows, these should have been widened. 785 if (!isPowerOf2_32(NumRegs)) 786 return true; 787 } 788 789 if (Align < MemSize) { 790 const SITargetLowering *TLI = ST.getTargetLowering(); 791 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 792 } 793 794 return false; 795 }; 796 797 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 798 unsigned Size = Query.Types[0].getSizeInBits(); 799 if (isPowerOf2_32(Size)) 800 return false; 801 802 if (Size == 96 && ST.hasDwordx3LoadStores()) 803 return false; 804 805 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 806 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 807 return false; 808 809 unsigned Align = Query.MMODescrs[0].AlignInBits; 810 unsigned RoundedSize = NextPowerOf2(Size); 811 return (Align >= RoundedSize); 812 }; 813 814 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 815 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 816 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 817 818 // TODO: Refine based on subtargets which support unaligned access or 128-bit 819 // LDS 820 // TODO: Unsupported flat for SI. 821 822 for (unsigned Op : {G_LOAD, G_STORE}) { 823 const bool IsStore = Op == G_STORE; 824 825 auto &Actions = getActionDefinitionsBuilder(Op); 826 // Whitelist the common cases. 827 // TODO: Loads to s16 on gfx9 828 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 829 {V2S32, GlobalPtr, 64, GlobalAlign32}, 830 {V4S32, GlobalPtr, 128, GlobalAlign32}, 831 {S128, GlobalPtr, 128, GlobalAlign32}, 832 {S64, GlobalPtr, 64, GlobalAlign32}, 833 {V2S64, GlobalPtr, 128, GlobalAlign32}, 834 {V2S16, GlobalPtr, 32, GlobalAlign32}, 835 {S32, GlobalPtr, 8, GlobalAlign8}, 836 {S32, GlobalPtr, 16, GlobalAlign16}, 837 838 {S32, LocalPtr, 32, 32}, 839 {S64, LocalPtr, 64, 32}, 840 {V2S32, LocalPtr, 64, 32}, 841 {S32, LocalPtr, 8, 8}, 842 {S32, LocalPtr, 16, 16}, 843 {V2S16, LocalPtr, 32, 32}, 844 845 {S32, PrivatePtr, 32, 32}, 846 {S32, PrivatePtr, 8, 8}, 847 {S32, PrivatePtr, 16, 16}, 848 {V2S16, PrivatePtr, 32, 32}, 849 850 {S32, FlatPtr, 32, GlobalAlign32}, 851 {S32, FlatPtr, 16, GlobalAlign16}, 852 {S32, FlatPtr, 8, GlobalAlign8}, 853 {V2S16, FlatPtr, 32, GlobalAlign32}, 854 855 {S32, ConstantPtr, 32, GlobalAlign32}, 856 {V2S32, ConstantPtr, 64, GlobalAlign32}, 857 {V4S32, ConstantPtr, 128, GlobalAlign32}, 858 {S64, ConstantPtr, 64, GlobalAlign32}, 859 {S128, ConstantPtr, 128, GlobalAlign32}, 860 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 861 Actions 862 .customIf(typeIs(1, Constant32Ptr)) 863 // Widen suitably aligned loads by loading extra elements. 
864 .moreElementsIf([=](const LegalityQuery &Query) { 865 const LLT Ty = Query.Types[0]; 866 return Op == G_LOAD && Ty.isVector() && 867 shouldWidenLoadResult(Query); 868 }, moreElementsToNextPow2(0)) 869 .widenScalarIf([=](const LegalityQuery &Query) { 870 const LLT Ty = Query.Types[0]; 871 return Op == G_LOAD && !Ty.isVector() && 872 shouldWidenLoadResult(Query); 873 }, widenScalarOrEltToNextPow2(0)) 874 .narrowScalarIf( 875 [=](const LegalityQuery &Query) -> bool { 876 return !Query.Types[0].isVector() && 877 needToSplitMemOp(Query, Op == G_LOAD); 878 }, 879 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 880 const LLT DstTy = Query.Types[0]; 881 const LLT PtrTy = Query.Types[1]; 882 883 const unsigned DstSize = DstTy.getSizeInBits(); 884 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 885 886 // Split extloads. 887 if (DstSize > MemSize) 888 return std::make_pair(0, LLT::scalar(MemSize)); 889 890 if (!isPowerOf2_32(DstSize)) { 891 // We're probably decomposing an odd sized store. Try to split 892 // to the widest type. TODO: Account for alignment. As-is it 893 // should be OK, since the new parts will be further legalized. 894 unsigned FloorSize = PowerOf2Floor(DstSize); 895 return std::make_pair(0, LLT::scalar(FloorSize)); 896 } 897 898 if (DstSize > 32 && (DstSize % 32 != 0)) { 899 // FIXME: Need a way to specify non-extload of larger size if 900 // suitably aligned. 901 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 902 } 903 904 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 905 Op == G_LOAD); 906 if (MemSize > MaxSize) 907 return std::make_pair(0, LLT::scalar(MaxSize)); 908 909 unsigned Align = Query.MMODescrs[0].AlignInBits; 910 return std::make_pair(0, LLT::scalar(Align)); 911 }) 912 .fewerElementsIf( 913 [=](const LegalityQuery &Query) -> bool { 914 return Query.Types[0].isVector() && 915 needToSplitMemOp(Query, Op == G_LOAD); 916 }, 917 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 918 const LLT DstTy = Query.Types[0]; 919 const LLT PtrTy = Query.Types[1]; 920 921 LLT EltTy = DstTy.getElementType(); 922 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 923 Op == G_LOAD); 924 925 // FIXME: Handle widened to power of 2 results better. This ends 926 // up scalarizing. 927 // FIXME: 3 element stores scalarized on SI 928 929 // Split if it's too large for the address space. 930 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 931 unsigned NumElts = DstTy.getNumElements(); 932 unsigned EltSize = EltTy.getSizeInBits(); 933 934 if (MaxSize % EltSize == 0) { 935 return std::make_pair( 936 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 937 } 938 939 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 940 941 // FIXME: Refine when odd breakdowns handled 942 // The scalars will need to be re-legalized. 943 if (NumPieces == 1 || NumPieces >= NumElts || 944 NumElts % NumPieces != 0) 945 return std::make_pair(0, EltTy); 946 947 return std::make_pair(0, 948 LLT::vector(NumElts / NumPieces, EltTy)); 949 } 950 951 // FIXME: We could probably handle weird extending loads better. 952 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 953 if (DstTy.getSizeInBits() > MemSize) 954 return std::make_pair(0, EltTy); 955 956 unsigned EltSize = EltTy.getSizeInBits(); 957 unsigned DstSize = DstTy.getSizeInBits(); 958 if (!isPowerOf2_32(DstSize)) { 959 // We're probably decomposing an odd sized store. Try to split 960 // to the widest type. TODO: Account for alignment. 
As-is it 961 // should be OK, since the new parts will be further legalized. 962 unsigned FloorSize = PowerOf2Floor(DstSize); 963 return std::make_pair( 964 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 965 } 966 967 // Need to split because of alignment. 968 unsigned Align = Query.MMODescrs[0].AlignInBits; 969 if (EltSize > Align && 970 (EltSize / Align < DstTy.getNumElements())) { 971 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 972 } 973 974 // May need relegalization for the scalars. 975 return std::make_pair(0, EltTy); 976 }) 977 .minScalar(0, S32); 978 979 if (IsStore) 980 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 981 982 // TODO: Need a bitcast lower option? 983 Actions 984 .legalIf([=](const LegalityQuery &Query) { 985 const LLT Ty0 = Query.Types[0]; 986 unsigned Size = Ty0.getSizeInBits(); 987 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 988 unsigned Align = Query.MMODescrs[0].AlignInBits; 989 990 // FIXME: Widening store from alignment not valid. 991 if (MemSize < Size) 992 MemSize = std::max(MemSize, Align); 993 994 // No extending vector loads. 995 if (Size > MemSize && Ty0.isVector()) 996 return false; 997 998 switch (MemSize) { 999 case 8: 1000 case 16: 1001 return Size == 32; 1002 case 32: 1003 case 64: 1004 case 128: 1005 return true; 1006 case 96: 1007 return ST.hasDwordx3LoadStores(); 1008 case 256: 1009 case 512: 1010 return true; 1011 default: 1012 return false; 1013 } 1014 }) 1015 .widenScalarToNextPow2(0) 1016 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1017 } 1018 1019 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1020 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1021 {S32, GlobalPtr, 16, 2 * 8}, 1022 {S32, LocalPtr, 8, 8}, 1023 {S32, LocalPtr, 16, 16}, 1024 {S32, PrivatePtr, 8, 8}, 1025 {S32, PrivatePtr, 16, 16}, 1026 {S32, ConstantPtr, 8, 8}, 1027 {S32, ConstantPtr, 16, 2 * 8}}); 1028 if (ST.hasFlatAddressSpace()) { 1029 ExtLoads.legalForTypesWithMemDesc( 1030 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1031 } 1032 1033 ExtLoads.clampScalar(0, S32, S32) 1034 .widenScalarToNextPow2(0) 1035 .unsupportedIfMemSizeNotPow2() 1036 .lower(); 1037 1038 auto &Atomics = getActionDefinitionsBuilder( 1039 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1040 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1041 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1042 G_ATOMICRMW_UMIN}) 1043 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1044 {S64, GlobalPtr}, {S64, LocalPtr}}); 1045 if (ST.hasFlatAddressSpace()) { 1046 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1047 } 1048 1049 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1050 .legalFor({{S32, LocalPtr}}); 1051 1052 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1053 // demarshalling 1054 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1055 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1056 {S32, FlatPtr}, {S64, FlatPtr}}) 1057 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1058 {S32, RegionPtr}, {S64, RegionPtr}}); 1059 // TODO: Pointer types, any 32-bit or 64-bit vector 1060 1061 // Condition should be s32 for scalar, s1 for vector. 
1062 getActionDefinitionsBuilder(G_SELECT) 1063 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1064 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1065 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1066 .clampScalar(0, S16, S64) 1067 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1068 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1069 .scalarize(1) 1070 .clampMaxNumElements(0, S32, 2) 1071 .clampMaxNumElements(0, LocalPtr, 2) 1072 .clampMaxNumElements(0, PrivatePtr, 2) 1073 .scalarize(0) 1074 .widenScalarToNextPow2(0) 1075 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1076 1077 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1078 // be more flexible with the shift amount type. 1079 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1080 .legalFor({{S32, S32}, {S64, S32}}); 1081 if (ST.has16BitInsts()) { 1082 if (ST.hasVOP3PInsts()) { 1083 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1084 .clampMaxNumElements(0, S16, 2); 1085 } else 1086 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1087 1088 // TODO: Support 16-bit shift amounts 1089 Shifts.clampScalar(1, S32, S32); 1090 Shifts.clampScalar(0, S16, S64); 1091 Shifts.widenScalarToNextPow2(0, 16); 1092 } else { 1093 // Make sure we legalize the shift amount type first, as the general 1094 // expansion for the shifted type will produce much worse code if it hasn't 1095 // been truncated already. 1096 Shifts.clampScalar(1, S32, S32); 1097 Shifts.clampScalar(0, S32, S64); 1098 Shifts.widenScalarToNextPow2(0, 32); 1099 } 1100 Shifts.scalarize(0); 1101 1102 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1103 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1104 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1105 unsigned IdxTypeIdx = 2; 1106 1107 getActionDefinitionsBuilder(Op) 1108 .customIf([=](const LegalityQuery &Query) { 1109 const LLT EltTy = Query.Types[EltTypeIdx]; 1110 const LLT VecTy = Query.Types[VecTypeIdx]; 1111 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1112 return (EltTy.getSizeInBits() == 16 || 1113 EltTy.getSizeInBits() % 32 == 0) && 1114 VecTy.getSizeInBits() % 32 == 0 && 1115 VecTy.getSizeInBits() <= 1024 && 1116 IdxTy.getSizeInBits() == 32; 1117 }) 1118 .clampScalar(EltTypeIdx, S32, S64) 1119 .clampScalar(VecTypeIdx, S32, S64) 1120 .clampScalar(IdxTypeIdx, S32, S32); 1121 } 1122 1123 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1124 .unsupportedIf([=](const LegalityQuery &Query) { 1125 const LLT &EltTy = Query.Types[1].getElementType(); 1126 return Query.Types[0] != EltTy; 1127 }); 1128 1129 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1130 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1131 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1132 1133 // FIXME: Doesn't handle extract of illegal sizes. 1134 getActionDefinitionsBuilder(Op) 1135 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1136 // FIXME: Multiples of 16 should not be legal. 
1137 .legalIf([=](const LegalityQuery &Query) { 1138 const LLT BigTy = Query.Types[BigTyIdx]; 1139 const LLT LitTy = Query.Types[LitTyIdx]; 1140 return (BigTy.getSizeInBits() % 32 == 0) && 1141 (LitTy.getSizeInBits() % 16 == 0); 1142 }) 1143 .widenScalarIf( 1144 [=](const LegalityQuery &Query) { 1145 const LLT BigTy = Query.Types[BigTyIdx]; 1146 return (BigTy.getScalarSizeInBits() < 16); 1147 }, 1148 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1149 .widenScalarIf( 1150 [=](const LegalityQuery &Query) { 1151 const LLT LitTy = Query.Types[LitTyIdx]; 1152 return (LitTy.getScalarSizeInBits() < 16); 1153 }, 1154 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1155 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1156 .widenScalarToNextPow2(BigTyIdx, 32); 1157 1158 } 1159 1160 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1161 .legalForCartesianProduct(AllS32Vectors, {S32}) 1162 .legalForCartesianProduct(AllS64Vectors, {S64}) 1163 .clampNumElements(0, V16S32, V32S32) 1164 .clampNumElements(0, V2S64, V16S64) 1165 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1166 1167 if (ST.hasScalarPackInsts()) { 1168 BuildVector 1169 // FIXME: Should probably widen s1 vectors straight to s32 1170 .minScalarOrElt(0, S16) 1171 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1172 .minScalar(1, S32); 1173 1174 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1175 .legalFor({V2S16, S32}) 1176 .lower(); 1177 BuildVector.minScalarOrElt(0, S32); 1178 } else { 1179 BuildVector.customFor({V2S16, S16}); 1180 BuildVector.minScalarOrElt(0, S32); 1181 1182 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1183 .customFor({V2S16, S32}) 1184 .lower(); 1185 } 1186 1187 BuildVector.legalIf(isRegisterType(0)); 1188 1189 // FIXME: Clamp maximum size 1190 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1191 .legalIf(isRegisterType(0)); 1192 1193 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1194 // pre-legalize. 1195 if (ST.hasVOP3PInsts()) { 1196 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1197 .customFor({V2S16, V2S16}) 1198 .lower(); 1199 } else 1200 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1201 1202 // Merge/Unmerge 1203 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1204 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1205 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1206 1207 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1208 const LLT &Ty = Query.Types[TypeIdx]; 1209 if (Ty.isVector()) { 1210 const LLT &EltTy = Ty.getElementType(); 1211 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1212 return true; 1213 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1214 return true; 1215 } 1216 return false; 1217 }; 1218 1219 auto &Builder = getActionDefinitionsBuilder(Op) 1220 // Try to widen to s16 first for small types. 1221 // TODO: Only do this on targets with legal s16 shifts 1222 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1223 1224 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1225 .lowerFor({{S16, V2S16}}) 1226 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1227 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1228 elementTypeIs(1, S16)), 1229 changeTo(1, V2S16)) 1230 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1231 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1232 // valid. 
1233 .clampScalar(LitTyIdx, S32, S256) 1234 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1235 // Break up vectors with weird elements into scalars 1236 .fewerElementsIf( 1237 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1238 scalarize(0)) 1239 .fewerElementsIf( 1240 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1241 scalarize(1)) 1242 .clampScalar(BigTyIdx, S32, S1024); 1243 1244 if (Op == G_MERGE_VALUES) { 1245 Builder.widenScalarIf( 1246 // TODO: Use 16-bit shifts if legal for 8-bit values? 1247 [=](const LegalityQuery &Query) { 1248 const LLT Ty = Query.Types[LitTyIdx]; 1249 return Ty.getSizeInBits() < 32; 1250 }, 1251 changeTo(LitTyIdx, S32)); 1252 } 1253 1254 Builder.widenScalarIf( 1255 [=](const LegalityQuery &Query) { 1256 const LLT Ty = Query.Types[BigTyIdx]; 1257 return !isPowerOf2_32(Ty.getSizeInBits()) && 1258 Ty.getSizeInBits() % 16 != 0; 1259 }, 1260 [=](const LegalityQuery &Query) { 1261 // Pick the next power of 2, or a multiple of 64 over 128. 1262 // Whichever is smaller. 1263 const LLT &Ty = Query.Types[BigTyIdx]; 1264 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1265 if (NewSizeInBits >= 256) { 1266 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1267 if (RoundedTo < NewSizeInBits) 1268 NewSizeInBits = RoundedTo; 1269 } 1270 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1271 }) 1272 .legalIf([=](const LegalityQuery &Query) { 1273 const LLT &BigTy = Query.Types[BigTyIdx]; 1274 const LLT &LitTy = Query.Types[LitTyIdx]; 1275 1276 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1277 return false; 1278 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1279 return false; 1280 1281 return BigTy.getSizeInBits() % 16 == 0 && 1282 LitTy.getSizeInBits() % 16 == 0 && 1283 BigTy.getSizeInBits() <= 1024; 1284 }) 1285 // Any vectors left are the wrong size. Scalarize them. 1286 .scalarize(0) 1287 .scalarize(1); 1288 } 1289 1290 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1291 // RegBankSelect. 1292 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1293 .legalFor({{S32}, {S64}}); 1294 1295 if (ST.hasVOP3PInsts()) { 1296 SextInReg.lowerFor({{V2S16}}) 1297 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1298 // get more vector shift opportunities, since we'll get those when 1299 // expanded. 1300 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1301 } else if (ST.has16BitInsts()) { 1302 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1303 } else { 1304 // Prefer to promote to s32 before lowering if we don't have 16-bit 1305 // shifts. This avoid a lot of intermediate truncate and extend operations. 
1306 SextInReg.lowerFor({{S32}, {S64}}); 1307 } 1308 1309 SextInReg 1310 .scalarize(0) 1311 .clampScalar(0, S32, S64) 1312 .lower(); 1313 1314 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1315 .legalFor({S64}); 1316 1317 getActionDefinitionsBuilder({ 1318 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1319 G_FCOPYSIGN, 1320 1321 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1322 G_READ_REGISTER, 1323 G_WRITE_REGISTER, 1324 1325 G_SADDO, G_SSUBO, 1326 1327 // TODO: Implement 1328 G_FMINIMUM, G_FMAXIMUM 1329 }).lower(); 1330 1331 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1332 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1333 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1334 .unsupported(); 1335 1336 computeTables(); 1337 verify(*ST.getInstrInfo()); 1338 } 1339 1340 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1341 MachineRegisterInfo &MRI, 1342 MachineIRBuilder &B, 1343 GISelChangeObserver &Observer) const { 1344 switch (MI.getOpcode()) { 1345 case TargetOpcode::G_ADDRSPACE_CAST: 1346 return legalizeAddrSpaceCast(MI, MRI, B); 1347 case TargetOpcode::G_FRINT: 1348 return legalizeFrint(MI, MRI, B); 1349 case TargetOpcode::G_FCEIL: 1350 return legalizeFceil(MI, MRI, B); 1351 case TargetOpcode::G_INTRINSIC_TRUNC: 1352 return legalizeIntrinsicTrunc(MI, MRI, B); 1353 case TargetOpcode::G_SITOFP: 1354 return legalizeITOFP(MI, MRI, B, true); 1355 case TargetOpcode::G_UITOFP: 1356 return legalizeITOFP(MI, MRI, B, false); 1357 case TargetOpcode::G_FPTOSI: 1358 return legalizeFPTOI(MI, MRI, B, true); 1359 case TargetOpcode::G_FPTOUI: 1360 return legalizeFPTOI(MI, MRI, B, false); 1361 case TargetOpcode::G_FMINNUM: 1362 case TargetOpcode::G_FMAXNUM: 1363 case TargetOpcode::G_FMINNUM_IEEE: 1364 case TargetOpcode::G_FMAXNUM_IEEE: 1365 return legalizeMinNumMaxNum(MI, MRI, B); 1366 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1367 return legalizeExtractVectorElt(MI, MRI, B); 1368 case TargetOpcode::G_INSERT_VECTOR_ELT: 1369 return legalizeInsertVectorElt(MI, MRI, B); 1370 case TargetOpcode::G_SHUFFLE_VECTOR: 1371 return legalizeShuffleVector(MI, MRI, B); 1372 case TargetOpcode::G_FSIN: 1373 case TargetOpcode::G_FCOS: 1374 return legalizeSinCos(MI, MRI, B); 1375 case TargetOpcode::G_GLOBAL_VALUE: 1376 return legalizeGlobalValue(MI, MRI, B); 1377 case TargetOpcode::G_LOAD: 1378 return legalizeLoad(MI, MRI, B, Observer); 1379 case TargetOpcode::G_FMAD: 1380 return legalizeFMad(MI, MRI, B); 1381 case TargetOpcode::G_FDIV: 1382 return legalizeFDIV(MI, MRI, B); 1383 case TargetOpcode::G_UDIV: 1384 case TargetOpcode::G_UREM: 1385 return legalizeUDIV_UREM(MI, MRI, B); 1386 case TargetOpcode::G_SDIV: 1387 case TargetOpcode::G_SREM: 1388 return legalizeSDIV_SREM(MI, MRI, B); 1389 case TargetOpcode::G_ATOMIC_CMPXCHG: 1390 return legalizeAtomicCmpXChg(MI, MRI, B); 1391 case TargetOpcode::G_FLOG: 1392 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1393 case TargetOpcode::G_FLOG10: 1394 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1395 case TargetOpcode::G_FEXP: 1396 return legalizeFExp(MI, B); 1397 case TargetOpcode::G_FPOW: 1398 return legalizeFPow(MI, B); 1399 case TargetOpcode::G_FFLOOR: 1400 return legalizeFFloor(MI, MRI, B); 1401 case TargetOpcode::G_BUILD_VECTOR: 1402 return legalizeBuildVector(MI, MRI, B); 1403 default: 1404 return false; 1405 } 1406 1407 llvm_unreachable("expected switch to return"); 1408 } 1409 1410 Register AMDGPULegalizerInfo::getSegmentAperture( 1411 unsigned AS, 1412 MachineRegisterInfo &MRI, 1413 MachineIRBuilder &B) const { 
1414 MachineFunction &MF = B.getMF(); 1415 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1416 const LLT S32 = LLT::scalar(32); 1417 1418 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1419 1420 if (ST.hasApertureRegs()) { 1421 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1422 // getreg. 1423 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1424 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1425 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1426 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1427 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1428 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1429 unsigned Encoding = 1430 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1431 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1432 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1433 1434 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1435 1436 B.buildInstr(AMDGPU::S_GETREG_B32) 1437 .addDef(GetReg) 1438 .addImm(Encoding); 1439 MRI.setType(GetReg, S32); 1440 1441 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1442 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1443 } 1444 1445 Register QueuePtr = MRI.createGenericVirtualRegister( 1446 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1447 1448 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1449 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1450 return Register(); 1451 1452 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1453 // private_segment_aperture_base_hi. 1454 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1455 1456 // TODO: can we be smarter about machine pointer info? 1457 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1458 MachineMemOperand *MMO = MF.getMachineMemOperand( 1459 PtrInfo, 1460 MachineMemOperand::MOLoad | 1461 MachineMemOperand::MODereferenceable | 1462 MachineMemOperand::MOInvariant, 1463 4, 1464 MinAlign(64, StructOffset)); 1465 1466 Register LoadAddr; 1467 1468 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1469 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1470 } 1471 1472 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1473 MachineInstr &MI, MachineRegisterInfo &MRI, 1474 MachineIRBuilder &B) const { 1475 MachineFunction &MF = B.getMF(); 1476 1477 B.setInstr(MI); 1478 1479 const LLT S32 = LLT::scalar(32); 1480 Register Dst = MI.getOperand(0).getReg(); 1481 Register Src = MI.getOperand(1).getReg(); 1482 1483 LLT DstTy = MRI.getType(Dst); 1484 LLT SrcTy = MRI.getType(Src); 1485 unsigned DestAS = DstTy.getAddressSpace(); 1486 unsigned SrcAS = SrcTy.getAddressSpace(); 1487 1488 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1489 // vector element. 1490 assert(!DstTy.isVector()); 1491 1492 const AMDGPUTargetMachine &TM 1493 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1494 1495 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1496 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1497 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1498 return true; 1499 } 1500 1501 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1502 // Truncate. 
1503 B.buildExtract(Dst, Src, 0); 1504 MI.eraseFromParent(); 1505 return true; 1506 } 1507 1508 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1509 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1510 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1511 1512 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1513 // another. Merge operands are required to be the same type, but creating an 1514 // extra ptrtoint would be kind of pointless. 1515 auto HighAddr = B.buildConstant( 1516 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1517 B.buildMerge(Dst, {Src, HighAddr}); 1518 MI.eraseFromParent(); 1519 return true; 1520 } 1521 1522 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1523 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1524 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1525 unsigned NullVal = TM.getNullPointerValue(DestAS); 1526 1527 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1528 auto FlatNull = B.buildConstant(SrcTy, 0); 1529 1530 // Extract low 32-bits of the pointer. 1531 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1532 1533 auto CmpRes = 1534 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1535 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1536 1537 MI.eraseFromParent(); 1538 return true; 1539 } 1540 1541 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1542 return false; 1543 1544 if (!ST.hasFlatAddressSpace()) 1545 return false; 1546 1547 auto SegmentNull = 1548 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1549 auto FlatNull = 1550 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1551 1552 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1553 if (!ApertureReg.isValid()) 1554 return false; 1555 1556 auto CmpRes = 1557 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1558 1559 // Coerce the type of the low half of the result so we can use merge_values. 1560 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1561 1562 // TODO: Should we allow mismatched types but matching sizes in merges to 1563 // avoid the ptrtoint? 1564 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1565 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1566 1567 MI.eraseFromParent(); 1568 return true; 1569 } 1570 1571 bool AMDGPULegalizerInfo::legalizeFrint( 1572 MachineInstr &MI, MachineRegisterInfo &MRI, 1573 MachineIRBuilder &B) const { 1574 B.setInstr(MI); 1575 1576 Register Src = MI.getOperand(1).getReg(); 1577 LLT Ty = MRI.getType(Src); 1578 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1579 1580 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1581 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1582 1583 auto C1 = B.buildFConstant(Ty, C1Val); 1584 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1585 1586 // TODO: Should this propagate fast-math-flags? 
1587 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1588 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1589 1590 auto C2 = B.buildFConstant(Ty, C2Val); 1591 auto Fabs = B.buildFAbs(Ty, Src); 1592 1593 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1594 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1595 return true; 1596 } 1597 1598 bool AMDGPULegalizerInfo::legalizeFceil( 1599 MachineInstr &MI, MachineRegisterInfo &MRI, 1600 MachineIRBuilder &B) const { 1601 B.setInstr(MI); 1602 1603 const LLT S1 = LLT::scalar(1); 1604 const LLT S64 = LLT::scalar(64); 1605 1606 Register Src = MI.getOperand(1).getReg(); 1607 assert(MRI.getType(Src) == S64); 1608 1609 // result = trunc(src) 1610 // if (src > 0.0 && src != result) 1611 // result += 1.0 1612 1613 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1614 1615 const auto Zero = B.buildFConstant(S64, 0.0); 1616 const auto One = B.buildFConstant(S64, 1.0); 1617 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1618 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1619 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1620 auto Add = B.buildSelect(S64, And, One, Zero); 1621 1622 // TODO: Should this propagate fast-math-flags? 1623 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1624 return true; 1625 } 1626 1627 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1628 MachineIRBuilder &B) { 1629 const unsigned FractBits = 52; 1630 const unsigned ExpBits = 11; 1631 LLT S32 = LLT::scalar(32); 1632 1633 auto Const0 = B.buildConstant(S32, FractBits - 32); 1634 auto Const1 = B.buildConstant(S32, ExpBits); 1635 1636 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1637 .addUse(Const0.getReg(0)) 1638 .addUse(Const1.getReg(0)); 1639 1640 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1641 } 1642 1643 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1644 MachineInstr &MI, MachineRegisterInfo &MRI, 1645 MachineIRBuilder &B) const { 1646 B.setInstr(MI); 1647 1648 const LLT S1 = LLT::scalar(1); 1649 const LLT S32 = LLT::scalar(32); 1650 const LLT S64 = LLT::scalar(64); 1651 1652 Register Src = MI.getOperand(1).getReg(); 1653 assert(MRI.getType(Src) == S64); 1654 1655 // TODO: Should this use extract since the low half is unused? 1656 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1657 Register Hi = Unmerge.getReg(1); 1658 1659 // Extract the upper half, since this is where we will find the sign and 1660 // exponent. 1661 auto Exp = extractF64Exponent(Hi, B); 1662 1663 const unsigned FractBits = 52; 1664 1665 // Extract the sign bit. 1666 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1667 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1668 1669 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1670 1671 const auto Zero32 = B.buildConstant(S32, 0); 1672 1673 // Extend back to 64-bits. 
1674 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1675 1676 auto Shr = B.buildAShr(S64, FractMask, Exp); 1677 auto Not = B.buildNot(S64, Shr); 1678 auto Tmp0 = B.buildAnd(S64, Src, Not); 1679 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1680 1681 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1682 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1683 1684 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1685 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1686 return true; 1687 } 1688 1689 bool AMDGPULegalizerInfo::legalizeITOFP( 1690 MachineInstr &MI, MachineRegisterInfo &MRI, 1691 MachineIRBuilder &B, bool Signed) const { 1692 B.setInstr(MI); 1693 1694 Register Dst = MI.getOperand(0).getReg(); 1695 Register Src = MI.getOperand(1).getReg(); 1696 1697 const LLT S64 = LLT::scalar(64); 1698 const LLT S32 = LLT::scalar(32); 1699 1700 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1701 1702 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1703 1704 auto CvtHi = Signed ? 1705 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1706 B.buildUITOFP(S64, Unmerge.getReg(1)); 1707 1708 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1709 1710 auto ThirtyTwo = B.buildConstant(S32, 32); 1711 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1712 .addUse(CvtHi.getReg(0)) 1713 .addUse(ThirtyTwo.getReg(0)); 1714 1715 // TODO: Should this propagate fast-math-flags? 1716 B.buildFAdd(Dst, LdExp, CvtLo); 1717 MI.eraseFromParent(); 1718 return true; 1719 } 1720 1721 // TODO: Copied from DAG implementation. Verify logic and document how this 1722 // actually works. 1723 bool AMDGPULegalizerInfo::legalizeFPTOI( 1724 MachineInstr &MI, MachineRegisterInfo &MRI, 1725 MachineIRBuilder &B, bool Signed) const { 1726 B.setInstr(MI); 1727 1728 Register Dst = MI.getOperand(0).getReg(); 1729 Register Src = MI.getOperand(1).getReg(); 1730 1731 const LLT S64 = LLT::scalar(64); 1732 const LLT S32 = LLT::scalar(32); 1733 1734 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1735 1736 unsigned Flags = MI.getFlags(); 1737 1738 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1739 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1740 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1741 1742 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1743 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1744 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1745 1746 auto Hi = Signed ? 
1747 B.buildFPTOSI(S32, FloorMul) : 1748 B.buildFPTOUI(S32, FloorMul); 1749 auto Lo = B.buildFPTOUI(S32, Fma); 1750 1751 B.buildMerge(Dst, { Lo, Hi }); 1752 MI.eraseFromParent(); 1753 1754 return true; 1755 } 1756 1757 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1758 MachineInstr &MI, MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1762 1763 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1764 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1765 1766 // With ieee_mode disabled, the instructions have the correct behavior 1767 // already for G_FMINNUM/G_FMAXNUM 1768 if (!MFI->getMode().IEEE) 1769 return !IsIEEEOp; 1770 1771 if (IsIEEEOp) 1772 return true; 1773 1774 MachineIRBuilder HelperBuilder(MI); 1775 GISelObserverWrapper DummyObserver; 1776 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1777 HelperBuilder.setInstr(MI); 1778 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1779 } 1780 1781 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1782 MachineInstr &MI, MachineRegisterInfo &MRI, 1783 MachineIRBuilder &B) const { 1784 // TODO: Should move some of this into LegalizerHelper. 1785 1786 // TODO: Promote dynamic indexing of s16 to s32 1787 1788 // FIXME: Artifact combiner probably should have replaced the truncated 1789 // constant before this, so we shouldn't need 1790 // getConstantVRegValWithLookThrough. 1791 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1792 MI.getOperand(2).getReg(), MRI); 1793 if (!IdxVal) // Dynamic case will be selected to register indexing. 1794 return true; 1795 1796 Register Dst = MI.getOperand(0).getReg(); 1797 Register Vec = MI.getOperand(1).getReg(); 1798 1799 LLT VecTy = MRI.getType(Vec); 1800 LLT EltTy = VecTy.getElementType(); 1801 assert(EltTy == MRI.getType(Dst)); 1802 1803 B.setInstr(MI); 1804 1805 if (IdxVal->Value < VecTy.getNumElements()) 1806 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1807 else 1808 B.buildUndef(Dst); 1809 1810 MI.eraseFromParent(); 1811 return true; 1812 } 1813 1814 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1815 MachineInstr &MI, MachineRegisterInfo &MRI, 1816 MachineIRBuilder &B) const { 1817 // TODO: Should move some of this into LegalizerHelper. 1818 1819 // TODO: Promote dynamic indexing of s16 to s32 1820 1821 // FIXME: Artifact combiner probably should have replaced the truncated 1822 // constant before this, so we shouldn't need 1823 // getConstantVRegValWithLookThrough. 1824 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1825 MI.getOperand(3).getReg(), MRI); 1826 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1827 return true; 1828 1829 Register Dst = MI.getOperand(0).getReg(); 1830 Register Vec = MI.getOperand(1).getReg(); 1831 Register Ins = MI.getOperand(2).getReg(); 1832 1833 LLT VecTy = MRI.getType(Vec); 1834 LLT EltTy = VecTy.getElementType(); 1835 assert(EltTy == MRI.getType(Ins)); 1836 1837 B.setInstr(MI); 1838 1839 if (IdxVal->Value < VecTy.getNumElements()) 1840 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1841 else 1842 B.buildUndef(Dst); 1843 1844 MI.eraseFromParent(); 1845 return true; 1846 } 1847 1848 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1849 MachineInstr &MI, MachineRegisterInfo &MRI, 1850 MachineIRBuilder &B) const { 1851 const LLT V2S16 = LLT::vector(2, 16); 1852 1853 Register Dst = MI.getOperand(0).getReg(); 1854 Register Src0 = MI.getOperand(1).getReg(); 1855 LLT DstTy = MRI.getType(Dst); 1856 LLT SrcTy = MRI.getType(Src0); 1857 1858 if (SrcTy == V2S16 && DstTy == V2S16 && 1859 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1860 return true; 1861 1862 MachineIRBuilder HelperBuilder(MI); 1863 GISelObserverWrapper DummyObserver; 1864 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1865 HelperBuilder.setInstr(MI); 1866 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1867 } 1868 1869 bool AMDGPULegalizerInfo::legalizeSinCos( 1870 MachineInstr &MI, MachineRegisterInfo &MRI, 1871 MachineIRBuilder &B) const { 1872 B.setInstr(MI); 1873 1874 Register DstReg = MI.getOperand(0).getReg(); 1875 Register SrcReg = MI.getOperand(1).getReg(); 1876 LLT Ty = MRI.getType(DstReg); 1877 unsigned Flags = MI.getFlags(); 1878 1879 Register TrigVal; 1880 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1881 if (ST.hasTrigReducedRange()) { 1882 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1883 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1884 .addUse(MulVal.getReg(0)) 1885 .setMIFlags(Flags).getReg(0); 1886 } else 1887 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1888 1889 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1890 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1891 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1892 .addUse(TrigVal) 1893 .setMIFlags(Flags); 1894 MI.eraseFromParent(); 1895 return true; 1896 } 1897 1898 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1899 Register DstReg, LLT PtrTy, 1900 MachineIRBuilder &B, const GlobalValue *GV, 1901 unsigned Offset, unsigned GAFlags) const { 1902 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1903 // to the following code sequence: 1904 // 1905 // For constant address space: 1906 // s_getpc_b64 s[0:1] 1907 // s_add_u32 s0, s0, $symbol 1908 // s_addc_u32 s1, s1, 0 1909 // 1910 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1911 // a fixup or relocation is emitted to replace $symbol with a literal 1912 // constant, which is a pc-relative offset from the encoding of the $symbol 1913 // operand to the global variable. 
1914 // 1915 // For global address space: 1916 // s_getpc_b64 s[0:1] 1917 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1918 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1919 // 1920 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1921 // fixups or relocations are emitted to replace $symbol@*@lo and 1922 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1923 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1924 // operand to the global variable. 1925 // 1926 // What we want here is an offset from the value returned by s_getpc 1927 // (which is the address of the s_add_u32 instruction) to the global 1928 // variable, but since the encoding of $symbol starts 4 bytes after the start 1929 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1930 // small. This requires us to add 4 to the global variable offset in order to 1931 // compute the correct address. 1932 1933 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1934 1935 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1936 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1937 1938 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1939 .addDef(PCReg); 1940 1941 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1942 if (GAFlags == SIInstrInfo::MO_NONE) 1943 MIB.addImm(0); 1944 else 1945 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1946 1947 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1948 1949 if (PtrTy.getSizeInBits() == 32) 1950 B.buildExtract(DstReg, PCReg, 0); 1951 return true; 1952 } 1953 1954 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1955 MachineInstr &MI, MachineRegisterInfo &MRI, 1956 MachineIRBuilder &B) const { 1957 Register DstReg = MI.getOperand(0).getReg(); 1958 LLT Ty = MRI.getType(DstReg); 1959 unsigned AS = Ty.getAddressSpace(); 1960 1961 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1962 MachineFunction &MF = B.getMF(); 1963 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1964 B.setInstr(MI); 1965 1966 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1967 if (!MFI->isEntryFunction()) { 1968 const Function &Fn = MF.getFunction(); 1969 DiagnosticInfoUnsupported BadLDSDecl( 1970 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1971 DS_Warning); 1972 Fn.getContext().diagnose(BadLDSDecl); 1973 1974 // We currently don't have a way to correctly allocate LDS objects that 1975 // aren't directly associated with a kernel. We do force inlining of 1976 // functions that use local objects. However, if these dead functions are 1977 // not eliminated, we don't want a compile time error. Just emit a warning 1978 // and a trap, since there should be no callable path here. 1979 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1980 B.buildUndef(DstReg); 1981 MI.eraseFromParent(); 1982 return true; 1983 } 1984 1985 // TODO: We could emit code to handle the initialization somewhere. 
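    // Uninitialized LDS globals are handled below by materializing their
    // offset within the kernel's LDS allocation as a plain constant (or by
    // leaving an absolute 32-bit relocation when the target prefers that).
    // Globals with a real initializer fall through to the diagnostic, since
    // LDS cannot be initialized at load time.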
1986 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1987 const SITargetLowering *TLI = ST.getTargetLowering(); 1988 if (!TLI->shouldUseLDSConstAddress(GV)) { 1989 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1990 return true; // Leave in place; 1991 } 1992 1993 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1994 MI.eraseFromParent(); 1995 return true; 1996 } 1997 1998 const Function &Fn = MF.getFunction(); 1999 DiagnosticInfoUnsupported BadInit( 2000 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2001 Fn.getContext().diagnose(BadInit); 2002 return true; 2003 } 2004 2005 const SITargetLowering *TLI = ST.getTargetLowering(); 2006 2007 if (TLI->shouldEmitFixup(GV)) { 2008 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 if (TLI->shouldEmitPCReloc(GV)) { 2014 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2015 MI.eraseFromParent(); 2016 return true; 2017 } 2018 2019 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2020 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2021 2022 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2023 MachinePointerInfo::getGOT(MF), 2024 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2025 MachineMemOperand::MOInvariant, 2026 8 /*Size*/, 8 /*Align*/); 2027 2028 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2029 2030 if (Ty.getSizeInBits() == 32) { 2031 // Truncate if this is a 32-bit constant adrdess. 2032 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2033 B.buildExtract(DstReg, Load, 0); 2034 } else 2035 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2036 2037 MI.eraseFromParent(); 2038 return true; 2039 } 2040 2041 bool AMDGPULegalizerInfo::legalizeLoad( 2042 MachineInstr &MI, MachineRegisterInfo &MRI, 2043 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2044 B.setInstr(MI); 2045 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2046 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2047 Observer.changingInstr(MI); 2048 MI.getOperand(1).setReg(Cast.getReg(0)); 2049 Observer.changedInstr(MI); 2050 return true; 2051 } 2052 2053 bool AMDGPULegalizerInfo::legalizeFMad( 2054 MachineInstr &MI, MachineRegisterInfo &MRI, 2055 MachineIRBuilder &B) const { 2056 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2057 assert(Ty.isScalar()); 2058 2059 MachineFunction &MF = B.getMF(); 2060 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2061 2062 // TODO: Always legal with future ftz flag. 2063 // FIXME: Do we need just output? 
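  // G_FMAD is only left legal when denormals are flushed for the type; the
  // underlying mad/mac instructions are generally understood not to handle
  // denormal values. With denormals enabled it is expanded to a separate
  // multiply and add below.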
2064 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2065 return true; 2066 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2067 return true; 2068 2069 MachineIRBuilder HelperBuilder(MI); 2070 GISelObserverWrapper DummyObserver; 2071 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2072 HelperBuilder.setMBB(*MI.getParent()); 2073 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2074 } 2075 2076 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2077 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2078 Register DstReg = MI.getOperand(0).getReg(); 2079 Register PtrReg = MI.getOperand(1).getReg(); 2080 Register CmpVal = MI.getOperand(2).getReg(); 2081 Register NewVal = MI.getOperand(3).getReg(); 2082 2083 assert(SITargetLowering::isFlatGlobalAddrSpace( 2084 MRI.getType(PtrReg).getAddressSpace()) && 2085 "this should not have been custom lowered"); 2086 2087 LLT ValTy = MRI.getType(CmpVal); 2088 LLT VecTy = LLT::vector(2, ValTy); 2089 2090 B.setInstr(MI); 2091 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2092 2093 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2094 .addDef(DstReg) 2095 .addUse(PtrReg) 2096 .addUse(PackedVal) 2097 .setMemRefs(MI.memoperands()); 2098 2099 MI.eraseFromParent(); 2100 return true; 2101 } 2102 2103 bool AMDGPULegalizerInfo::legalizeFlog( 2104 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2105 Register Dst = MI.getOperand(0).getReg(); 2106 Register Src = MI.getOperand(1).getReg(); 2107 LLT Ty = B.getMRI()->getType(Dst); 2108 unsigned Flags = MI.getFlags(); 2109 B.setInstr(MI); 2110 2111 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2112 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2113 2114 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2115 MI.eraseFromParent(); 2116 return true; 2117 } 2118 2119 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2120 MachineIRBuilder &B) const { 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register Src = MI.getOperand(1).getReg(); 2123 unsigned Flags = MI.getFlags(); 2124 LLT Ty = B.getMRI()->getType(Dst); 2125 B.setInstr(MI); 2126 2127 auto K = B.buildFConstant(Ty, numbers::log2e); 2128 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2129 B.buildFExp2(Dst, Mul, Flags); 2130 MI.eraseFromParent(); 2131 return true; 2132 } 2133 2134 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2135 MachineIRBuilder &B) const { 2136 Register Dst = MI.getOperand(0).getReg(); 2137 Register Src0 = MI.getOperand(1).getReg(); 2138 Register Src1 = MI.getOperand(2).getReg(); 2139 unsigned Flags = MI.getFlags(); 2140 LLT Ty = B.getMRI()->getType(Dst); 2141 B.setInstr(MI); 2142 const LLT S16 = LLT::scalar(16); 2143 const LLT S32 = LLT::scalar(32); 2144 2145 if (Ty == S32) { 2146 auto Log = B.buildFLog2(S32, Src0, Flags); 2147 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2148 .addUse(Log.getReg(0)) 2149 .addUse(Src1) 2150 .setMIFlags(Flags); 2151 B.buildFExp2(Dst, Mul, Flags); 2152 } else if (Ty == S16) { 2153 // There's no f16 fmul_legacy, so we need to convert for it. 
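    // Same pow(x, y) = exp2(y * log2(x)) expansion as the f32 path above, but
    // the multiply operands are widened to f32 so fmul_legacy can be used,
    // then truncated back to f16 before the exp2.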
2154 auto Log = B.buildFLog2(S16, Src0, Flags); 2155 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2156 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2157 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2158 .addUse(Ext0.getReg(0)) 2159 .addUse(Ext1.getReg(0)) 2160 .setMIFlags(Flags); 2161 2162 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2163 } else 2164 return false; 2165 2166 MI.eraseFromParent(); 2167 return true; 2168 } 2169 2170 // Find a source register, ignoring any possible source modifiers. 2171 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2172 Register ModSrc = OrigSrc; 2173 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2174 ModSrc = SrcFNeg->getOperand(1).getReg(); 2175 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2176 ModSrc = SrcFAbs->getOperand(1).getReg(); 2177 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2178 ModSrc = SrcFAbs->getOperand(1).getReg(); 2179 return ModSrc; 2180 } 2181 2182 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2183 MachineRegisterInfo &MRI, 2184 MachineIRBuilder &B) const { 2185 B.setInstr(MI); 2186 2187 const LLT S1 = LLT::scalar(1); 2188 const LLT S64 = LLT::scalar(64); 2189 Register Dst = MI.getOperand(0).getReg(); 2190 Register OrigSrc = MI.getOperand(1).getReg(); 2191 unsigned Flags = MI.getFlags(); 2192 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2193 "this should not have been custom lowered"); 2194 2195 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2196 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2197 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2198 // V_FRACT bug is: 2199 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2200 // 2201 // Convert floor(x) to (x - fract(x)) 2202 2203 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2204 .addUse(OrigSrc) 2205 .setMIFlags(Flags); 2206 2207 // Give source modifier matching some assistance before obscuring a foldable 2208 // pattern. 2209 2210 // TODO: We can avoid the neg on the fract? The input sign to fract 2211 // shouldn't matter? 2212 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2213 2214 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2215 2216 Register Min = MRI.createGenericVirtualRegister(S64); 2217 2218 // We don't need to concern ourselves with the snan handling difference, so 2219 // use the one which will directly select. 2220 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2221 if (MFI->getMode().IEEE) 2222 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2223 else 2224 B.buildFMinNum(Min, Fract, Const, Flags); 2225 2226 Register CorrectedFract = Min; 2227 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2228 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2229 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2230 } 2231 2232 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2233 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2234 2235 MI.eraseFromParent(); 2236 return true; 2237 } 2238 2239 // Turn an illegal packed v2s16 build vector into bit operations. 2240 // TODO: This should probably be a bitcast action in LegalizerHelper. 
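// The two s16 sources are merged into a single s32 and bitcast to v2s16,
// matching how a packed pair of 16-bit values is laid out in a 32-bit
// register.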
2241 bool AMDGPULegalizerInfo::legalizeBuildVector( 2242 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2243 Register Dst = MI.getOperand(0).getReg(); 2244 LLT DstTy = MRI.getType(Dst); 2245 const LLT S32 = LLT::scalar(32); 2246 const LLT V2S16 = LLT::vector(2, 16); 2247 (void)DstTy; 2248 (void)V2S16; 2249 assert(DstTy == V2S16); 2250 2251 Register Src0 = MI.getOperand(1).getReg(); 2252 Register Src1 = MI.getOperand(2).getReg(); 2253 assert(MRI.getType(Src0) == LLT::scalar(16)); 2254 2255 B.setInstr(MI); 2256 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2257 B.buildBitcast(Dst, Merge); 2258 2259 MI.eraseFromParent(); 2260 return true; 2261 } 2262 2263 // Return the use branch instruction, otherwise null if the usage is invalid. 2264 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2265 MachineRegisterInfo &MRI, 2266 MachineInstr *&Br) { 2267 Register CondDef = MI.getOperand(0).getReg(); 2268 if (!MRI.hasOneNonDBGUse(CondDef)) 2269 return nullptr; 2270 2271 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2272 if (UseMI.getParent() != MI.getParent() || 2273 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2274 return nullptr; 2275 2276 // Make sure the cond br is followed by a G_BR 2277 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2278 if (Next != MI.getParent()->end()) { 2279 if (Next->getOpcode() != AMDGPU::G_BR) 2280 return nullptr; 2281 Br = &*Next; 2282 } 2283 2284 return &UseMI; 2285 } 2286 2287 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2288 MachineRegisterInfo &MRI, 2289 Register LiveIn, 2290 Register PhyReg) const { 2291 assert(PhyReg.isPhysical() && "Physical register expected"); 2292 2293 // Insert the live-in copy, if required, by defining destination virtual 2294 // register. 2295 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
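  // If the live-in virtual register already has a defining copy it is reused.
  // Otherwise a copy from the physical register is inserted at the top of the
  // entry block, and the builder's insertion point is restored so the caller's
  // position is unaffected.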
2296 if (!MRI.getVRegDef(LiveIn)) { 2297 // FIXME: Should have scoped insert pt 2298 MachineBasicBlock &OrigInsBB = B.getMBB(); 2299 auto OrigInsPt = B.getInsertPt(); 2300 2301 MachineBasicBlock &EntryMBB = B.getMF().front(); 2302 EntryMBB.addLiveIn(PhyReg); 2303 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2304 B.buildCopy(LiveIn, PhyReg); 2305 2306 B.setInsertPt(OrigInsBB, OrigInsPt); 2307 } 2308 2309 return LiveIn; 2310 } 2311 2312 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2313 MachineRegisterInfo &MRI, 2314 Register PhyReg, LLT Ty, 2315 bool InsertLiveInCopy) const { 2316 assert(PhyReg.isPhysical() && "Physical register expected"); 2317 2318 // Get or create virtual live-in regester 2319 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2320 if (!LiveIn) { 2321 LiveIn = MRI.createGenericVirtualRegister(Ty); 2322 MRI.addLiveIn(PhyReg, LiveIn); 2323 } 2324 2325 // When the actual true copy required is from virtual register to physical 2326 // register (to be inserted later), live-in copy insertion from physical 2327 // to register virtual register is not required 2328 if (!InsertLiveInCopy) 2329 return LiveIn; 2330 2331 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2332 } 2333 2334 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2335 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2336 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2337 const ArgDescriptor *Arg; 2338 const TargetRegisterClass *RC; 2339 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2340 if (!Arg) { 2341 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2342 return nullptr; 2343 } 2344 return Arg; 2345 } 2346 2347 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2348 const ArgDescriptor *Arg) const { 2349 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2350 return false; // TODO: Handle these 2351 2352 Register SrcReg = Arg->getRegister(); 2353 assert(SrcReg.isPhysical() && "Physical register expected"); 2354 assert(DstReg.isVirtual() && "Virtual register expected"); 2355 2356 MachineRegisterInfo &MRI = *B.getMRI(); 2357 2358 LLT Ty = MRI.getType(DstReg); 2359 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2360 2361 if (Arg->isMasked()) { 2362 // TODO: Should we try to emit this once in the entry block? 
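    // A masked argument shares its physical register with other packed values
    // (work-item IDs, for example). It is decoded below by shifting the
    // live-in right by the mask's trailing zero count and ANDing with the
    // shifted-down mask.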
2363 const LLT S32 = LLT::scalar(32); 2364 const unsigned Mask = Arg->getMask(); 2365 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2366 2367 Register AndMaskSrc = LiveIn; 2368 2369 if (Shift != 0) { 2370 auto ShiftAmt = B.buildConstant(S32, Shift); 2371 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2372 } 2373 2374 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2375 } else { 2376 B.buildCopy(DstReg, LiveIn); 2377 } 2378 2379 return true; 2380 } 2381 2382 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2383 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2384 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2385 B.setInstr(MI); 2386 2387 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2388 if (!Arg) 2389 return false; 2390 2391 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2392 return false; 2393 2394 MI.eraseFromParent(); 2395 return true; 2396 } 2397 2398 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2399 MachineRegisterInfo &MRI, 2400 MachineIRBuilder &B) const { 2401 B.setInstr(MI); 2402 Register Dst = MI.getOperand(0).getReg(); 2403 LLT DstTy = MRI.getType(Dst); 2404 LLT S16 = LLT::scalar(16); 2405 LLT S32 = LLT::scalar(32); 2406 LLT S64 = LLT::scalar(64); 2407 2408 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2409 return true; 2410 2411 if (DstTy == S16) 2412 return legalizeFDIV16(MI, MRI, B); 2413 if (DstTy == S32) 2414 return legalizeFDIV32(MI, MRI, B); 2415 if (DstTy == S64) 2416 return legalizeFDIV64(MI, MRI, B); 2417 2418 return false; 2419 } 2420 2421 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2422 const LLT S32 = LLT::scalar(32); 2423 2424 auto Cvt0 = B.buildUITOFP(S32, Src); 2425 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2426 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2427 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2428 return B.buildFPTOUI(S32, Mul).getReg(0); 2429 } 2430 2431 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2432 Register DstReg, 2433 Register Num, 2434 Register Den, 2435 bool IsRem) const { 2436 const LLT S1 = LLT::scalar(1); 2437 const LLT S32 = LLT::scalar(32); 2438 2439 // RCP = URECIP(Den) = 2^32 / Den + e 2440 // e is rounding error. 2441 auto RCP = buildDivRCP(B, Den); 2442 2443 // RCP_LO = mul(RCP, Den) 2444 auto RCP_LO = B.buildMul(S32, RCP, Den); 2445 2446 // RCP_HI = mulhu (RCP, Den) */ 2447 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2448 2449 // NEG_RCP_LO = -RCP_LO 2450 auto Zero = B.buildConstant(S32, 0); 2451 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2452 2453 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2454 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2455 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2456 2457 // Calculate the rounding error from the URECIP instruction 2458 // E = mulhu(ABS_RCP_LO, RCP) 2459 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2460 2461 // RCP_A_E = RCP + E 2462 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2463 2464 // RCP_S_E = RCP - E 2465 auto RCP_S_E = B.buildSub(S32, RCP, E); 2466 2467 // Tmp0 = (RCP_HI == 0 ? 
  //                         RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2565 2566 if (IsRem) { 2567 auto RSign = LHSign; // Remainder sign is the same as LHS 2568 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2569 B.buildSub(DstReg, UDivRem, RSign); 2570 } else { 2571 auto DSign = B.buildXor(S32, LHSign, RHSign); 2572 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2573 B.buildSub(DstReg, UDivRem, DSign); 2574 } 2575 2576 MI.eraseFromParent(); 2577 return true; 2578 } 2579 2580 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2581 MachineRegisterInfo &MRI, 2582 MachineIRBuilder &B) const { 2583 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2584 return legalizeSDIV_SREM32(MI, MRI, B); 2585 return false; 2586 } 2587 2588 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2589 MachineRegisterInfo &MRI, 2590 MachineIRBuilder &B) const { 2591 Register Res = MI.getOperand(0).getReg(); 2592 Register LHS = MI.getOperand(1).getReg(); 2593 Register RHS = MI.getOperand(2).getReg(); 2594 2595 uint16_t Flags = MI.getFlags(); 2596 2597 LLT ResTy = MRI.getType(Res); 2598 LLT S32 = LLT::scalar(32); 2599 LLT S64 = LLT::scalar(64); 2600 2601 const MachineFunction &MF = B.getMF(); 2602 bool Unsafe = 2603 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2604 2605 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2606 return false; 2607 2608 if (!Unsafe && ResTy == S32 && 2609 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2610 return false; 2611 2612 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2613 // 1 / x -> RCP(x) 2614 if (CLHS->isExactlyValue(1.0)) { 2615 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2616 .addUse(RHS) 2617 .setMIFlags(Flags); 2618 2619 MI.eraseFromParent(); 2620 return true; 2621 } 2622 2623 // -1 / x -> RCP( FNEG(x) ) 2624 if (CLHS->isExactlyValue(-1.0)) { 2625 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2626 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2627 .addUse(FNeg.getReg(0)) 2628 .setMIFlags(Flags); 2629 2630 MI.eraseFromParent(); 2631 return true; 2632 } 2633 } 2634 2635 // x / y -> x * (1.0 / y) 2636 if (Unsafe) { 2637 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2638 .addUse(RHS) 2639 .setMIFlags(Flags); 2640 B.buildFMul(Res, LHS, RCP, Flags); 2641 2642 MI.eraseFromParent(); 2643 return true; 2644 } 2645 2646 return false; 2647 } 2648 2649 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2650 MachineRegisterInfo &MRI, 2651 MachineIRBuilder &B) const { 2652 B.setInstr(MI); 2653 Register Res = MI.getOperand(0).getReg(); 2654 Register LHS = MI.getOperand(1).getReg(); 2655 Register RHS = MI.getOperand(2).getReg(); 2656 2657 uint16_t Flags = MI.getFlags(); 2658 2659 LLT S16 = LLT::scalar(16); 2660 LLT S32 = LLT::scalar(32); 2661 2662 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2663 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2664 2665 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2666 .addUse(RHSExt.getReg(0)) 2667 .setMIFlags(Flags); 2668 2669 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2670 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2671 2672 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2673 .addUse(RDst.getReg(0)) 2674 .addUse(RHS) 2675 .addUse(LHS) 2676 .setMIFlags(Flags); 2677 2678 MI.eraseFromParent(); 2679 return true; 2680 } 2681 2682 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2683 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
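// On subtargets with S_DENORM_MODE the whole 4-bit denorm field is rewritten
// in one instruction, re-specifying the default FP64/FP16 bits. Older
// subtargets instead use S_SETREG to update only the 2-bit FP32 denorm field
// of the MODE register (offset 4, width 2, per the hwreg encoding below).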
2684 static void toggleSPDenormMode(bool Enable, 2685 MachineIRBuilder &B, 2686 const GCNSubtarget &ST, 2687 AMDGPU::SIModeRegisterDefaults Mode) { 2688 // Set SP denorm mode to this value. 2689 unsigned SPDenormMode = 2690 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2691 2692 if (ST.hasDenormModeInst()) { 2693 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2694 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2695 2696 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2697 B.buildInstr(AMDGPU::S_DENORM_MODE) 2698 .addImm(NewDenormModeValue); 2699 2700 } else { 2701 // Select FP32 bit field in mode register. 2702 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2703 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2704 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2705 2706 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2707 .addImm(SPDenormMode) 2708 .addImm(SPDenormModeBitField); 2709 } 2710 } 2711 2712 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2713 MachineRegisterInfo &MRI, 2714 MachineIRBuilder &B) const { 2715 B.setInstr(MI); 2716 Register Res = MI.getOperand(0).getReg(); 2717 Register LHS = MI.getOperand(1).getReg(); 2718 Register RHS = MI.getOperand(2).getReg(); 2719 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2720 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2721 2722 uint16_t Flags = MI.getFlags(); 2723 2724 LLT S32 = LLT::scalar(32); 2725 LLT S1 = LLT::scalar(1); 2726 2727 auto One = B.buildFConstant(S32, 1.0f); 2728 2729 auto DenominatorScaled = 2730 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2731 .addUse(RHS) 2732 .addUse(LHS) 2733 .addImm(1) 2734 .setMIFlags(Flags); 2735 auto NumeratorScaled = 2736 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2737 .addUse(LHS) 2738 .addUse(RHS) 2739 .addImm(0) 2740 .setMIFlags(Flags); 2741 2742 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2743 .addUse(DenominatorScaled.getReg(0)) 2744 .setMIFlags(Flags); 2745 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2746 2747 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2748 // aren't modeled as reading it. 
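  // The sequence below mirrors the DAG lowering of f32 fdiv: div_scale
  // pre-scales the operands, rcp gives an initial reciprocal estimate, the FMA
  // chain refines the quotient (with FP32 denormals temporarily enabled if the
  // mode would normally flush them), div_fmas applies the scale decided by
  // div_scale, and div_fixup patches up zeros, infinities and NaNs.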
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
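    // The flag is reconstructed manually: compare the high dwords of the
    // original operands against the high dwords of the two div_scale results
    // and XOR the comparisons, which recovers the condition div_scale would
    // normally have produced.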
2824 2825 LLT S32 = LLT::scalar(32); 2826 2827 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2828 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2829 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2830 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2831 2832 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2833 Scale1Unmerge.getReg(1)); 2834 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2835 Scale0Unmerge.getReg(1)); 2836 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2837 } else { 2838 Scale = DivScale1.getReg(1); 2839 } 2840 2841 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2842 .addUse(Fma4.getReg(0)) 2843 .addUse(Fma3.getReg(0)) 2844 .addUse(Mul.getReg(0)) 2845 .addUse(Scale) 2846 .setMIFlags(Flags); 2847 2848 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2849 .addUse(Fmas.getReg(0)) 2850 .addUse(RHS) 2851 .addUse(LHS) 2852 .setMIFlags(Flags); 2853 2854 MI.eraseFromParent(); 2855 return true; 2856 } 2857 2858 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2859 MachineRegisterInfo &MRI, 2860 MachineIRBuilder &B) const { 2861 B.setInstr(MI); 2862 Register Res = MI.getOperand(0).getReg(); 2863 Register LHS = MI.getOperand(2).getReg(); 2864 Register RHS = MI.getOperand(3).getReg(); 2865 uint16_t Flags = MI.getFlags(); 2866 2867 LLT S32 = LLT::scalar(32); 2868 LLT S1 = LLT::scalar(1); 2869 2870 auto Abs = B.buildFAbs(S32, RHS, Flags); 2871 const APFloat C0Val(1.0f); 2872 2873 auto C0 = B.buildConstant(S32, 0x6f800000); 2874 auto C1 = B.buildConstant(S32, 0x2f800000); 2875 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2876 2877 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2878 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2879 2880 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2881 2882 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2883 .addUse(Mul0.getReg(0)) 2884 .setMIFlags(Flags); 2885 2886 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2887 2888 B.buildFMul(Res, Sel, Mul1, Flags); 2889 2890 MI.eraseFromParent(); 2891 return true; 2892 } 2893 2894 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2895 MachineRegisterInfo &MRI, 2896 MachineIRBuilder &B) const { 2897 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2898 if (!MFI->isEntryFunction()) { 2899 return legalizePreloadedArgIntrin(MI, MRI, B, 2900 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2901 } 2902 2903 B.setInstr(MI); 2904 2905 uint64_t Offset = 2906 ST.getTargetLowering()->getImplicitParameterOffset( 2907 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2908 Register DstReg = MI.getOperand(0).getReg(); 2909 LLT DstTy = MRI.getType(DstReg); 2910 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2911 2912 const ArgDescriptor *Arg; 2913 const TargetRegisterClass *RC; 2914 std::tie(Arg, RC) 2915 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2916 if (!Arg) 2917 return false; 2918 2919 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2920 if (!loadInputValue(KernargPtrReg, B, Arg)) 2921 return false; 2922 2923 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2924 MI.eraseFromParent(); 2925 return true; 2926 } 2927 2928 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2929 MachineRegisterInfo &MRI, 2930 MachineIRBuilder &B, 2931 unsigned AddrSpace) const { 2932 B.setInstr(MI); 2933 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2934 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2935 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2936 MI.eraseFromParent(); 2937 return true; 2938 } 2939 2940 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2941 // offset (the offset that is included in bounds checking and swizzling, to be 2942 // split between the instruction's voffset and immoffset fields) and soffset 2943 // (the offset that is excluded from bounds checking and swizzling, to go in 2944 // the instruction's soffset field). This function takes the first kind of 2945 // offset and figures out how to split it between voffset and immoffset. 2946 std::tuple<Register, unsigned, unsigned> 2947 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2948 Register OrigOffset) const { 2949 const unsigned MaxImm = 4095; 2950 Register BaseReg; 2951 unsigned TotalConstOffset; 2952 MachineInstr *OffsetDef; 2953 const LLT S32 = LLT::scalar(32); 2954 2955 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2956 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2957 2958 unsigned ImmOffset = TotalConstOffset; 2959 2960 // If the immediate value is too big for the immoffset field, put the value 2961 // and -4096 into the immoffset field so that the value that is copied/added 2962 // for the voffset field is a multiple of 4096, and it stands more chance 2963 // of being CSEd with the copy/add for another similar load/store. 2964 // However, do not do that rounding down to a multiple of 4096 if that is a 2965 // negative number, as it appears to be illegal to have a negative offset 2966 // in the vgpr, even if adding the immediate offset makes it positive. 2967 unsigned Overflow = ImmOffset & ~MaxImm; 2968 ImmOffset -= Overflow; 2969 if ((int32_t)Overflow < 0) { 2970 Overflow += ImmOffset; 2971 ImmOffset = 0; 2972 } 2973 2974 if (Overflow != 0) { 2975 if (!BaseReg) { 2976 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2977 } else { 2978 auto OverflowVal = B.buildConstant(S32, Overflow); 2979 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2980 } 2981 } 2982 2983 if (!BaseReg) 2984 BaseReg = B.buildConstant(S32, 0).getReg(0); 2985 2986 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2987 } 2988 2989 /// Handle register layout difference for f16 images for some subtargets. 2990 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2991 MachineRegisterInfo &MRI, 2992 Register Reg) const { 2993 if (!ST.hasUnpackedD16VMem()) 2994 return Reg; 2995 2996 const LLT S16 = LLT::scalar(16); 2997 const LLT S32 = LLT::scalar(32); 2998 LLT StoreVT = MRI.getType(Reg); 2999 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3000 3001 auto Unmerge = B.buildUnmerge(S16, Reg); 3002 3003 SmallVector<Register, 4> WideRegs; 3004 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3005 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3006 3007 int NumElts = StoreVT.getNumElements(); 3008 3009 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3010 } 3011 3012 Register AMDGPULegalizerInfo::fixStoreSourceType( 3013 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3014 MachineRegisterInfo *MRI = B.getMRI(); 3015 LLT Ty = MRI->getType(VData); 3016 3017 const LLT S16 = LLT::scalar(16); 3018 3019 // Fixup illegal register types for i8 stores. 
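  // s8 and s16 store sources are any-extended to s32 below, since the buffer
  // store pseudos expect at least a 32-bit data register. Small f16 vectors
  // may additionally need the unpacked D16 register layout on some
  // subtargets, which handleD16VData provides.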
3020 if (Ty == LLT::scalar(8) || Ty == S16) { 3021 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3022 return AnyExt; 3023 } 3024 3025 if (Ty.isVector()) { 3026 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3027 if (IsFormat) 3028 return handleD16VData(B, *MRI, VData); 3029 } 3030 } 3031 3032 return VData; 3033 } 3034 3035 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3036 MachineRegisterInfo &MRI, 3037 MachineIRBuilder &B, 3038 bool IsTyped, 3039 bool IsFormat) const { 3040 B.setInstr(MI); 3041 3042 Register VData = MI.getOperand(1).getReg(); 3043 LLT Ty = MRI.getType(VData); 3044 LLT EltTy = Ty.getScalarType(); 3045 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3046 const LLT S32 = LLT::scalar(32); 3047 3048 VData = fixStoreSourceType(B, VData, IsFormat); 3049 Register RSrc = MI.getOperand(2).getReg(); 3050 3051 MachineMemOperand *MMO = *MI.memoperands_begin(); 3052 const int MemSize = MMO->getSize(); 3053 3054 unsigned ImmOffset; 3055 unsigned TotalOffset; 3056 3057 // The typed intrinsics add an immediate after the registers. 3058 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3059 3060 // The struct intrinsic variants add one additional operand over raw. 3061 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3062 Register VIndex; 3063 int OpOffset = 0; 3064 if (HasVIndex) { 3065 VIndex = MI.getOperand(3).getReg(); 3066 OpOffset = 1; 3067 } 3068 3069 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3070 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3071 3072 unsigned Format = 0; 3073 if (IsTyped) { 3074 Format = MI.getOperand(5 + OpOffset).getImm(); 3075 ++OpOffset; 3076 } 3077 3078 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3079 3080 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3081 if (TotalOffset != 0) 3082 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3083 3084 unsigned Opc; 3085 if (IsTyped) { 3086 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3087 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3088 } else if (IsFormat) { 3089 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3090 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3091 } else { 3092 switch (MemSize) { 3093 case 1: 3094 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3095 break; 3096 case 2: 3097 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3098 break; 3099 default: 3100 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3101 break; 3102 } 3103 } 3104 3105 if (!VIndex) 3106 VIndex = B.buildConstant(S32, 0).getReg(0); 3107 3108 auto MIB = B.buildInstr(Opc) 3109 .addUse(VData) // vdata 3110 .addUse(RSrc) // rsrc 3111 .addUse(VIndex) // vindex 3112 .addUse(VOffset) // voffset 3113 .addUse(SOffset) // soffset 3114 .addImm(ImmOffset); // offset(imm) 3115 3116 if (IsTyped) 3117 MIB.addImm(Format); 3118 3119 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3120 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3121 .addMemOperand(MMO); 3122 3123 MI.eraseFromParent(); 3124 return true; 3125 } 3126 3127 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3128 MachineRegisterInfo &MRI, 3129 MachineIRBuilder &B, 3130 bool IsFormat, 3131 bool IsTyped) const { 3132 B.setInstr(MI); 3133 3134 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
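  // The load is rewritten to a target buffer-load pseudo below. Operand
  // decoding has to account for the optional vindex (struct variants) and
  // format immediate (typed variants); sub-dword and unpacked d16 results are
  // loaded into a wider temporary register and narrowed or repacked after the
  // new instruction.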
3135 MachineMemOperand *MMO = *MI.memoperands_begin(); 3136 const int MemSize = MMO->getSize(); 3137 const LLT S32 = LLT::scalar(32); 3138 3139 Register Dst = MI.getOperand(0).getReg(); 3140 Register RSrc = MI.getOperand(2).getReg(); 3141 3142 // The typed intrinsics add an immediate after the registers. 3143 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3144 3145 // The struct intrinsic variants add one additional operand over raw. 3146 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3147 Register VIndex; 3148 int OpOffset = 0; 3149 if (HasVIndex) { 3150 VIndex = MI.getOperand(3).getReg(); 3151 OpOffset = 1; 3152 } 3153 3154 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3155 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3156 3157 unsigned Format = 0; 3158 if (IsTyped) { 3159 Format = MI.getOperand(5 + OpOffset).getImm(); 3160 ++OpOffset; 3161 } 3162 3163 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3164 unsigned ImmOffset; 3165 unsigned TotalOffset; 3166 3167 LLT Ty = MRI.getType(Dst); 3168 LLT EltTy = Ty.getScalarType(); 3169 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3170 const bool Unpacked = ST.hasUnpackedD16VMem(); 3171 3172 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3173 if (TotalOffset != 0) 3174 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3175 3176 unsigned Opc; 3177 3178 if (IsTyped) { 3179 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3180 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3181 } else if (IsFormat) { 3182 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3183 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3184 } else { 3185 switch (MemSize) { 3186 case 1: 3187 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3188 break; 3189 case 2: 3190 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3191 break; 3192 default: 3193 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3194 break; 3195 } 3196 } 3197 3198 Register LoadDstReg; 3199 3200 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3201 LLT UnpackedTy = Ty.changeElementSize(32); 3202 3203 if (IsExtLoad) 3204 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3205 else if (Unpacked && IsD16 && Ty.isVector()) 3206 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3207 else 3208 LoadDstReg = Dst; 3209 3210 if (!VIndex) 3211 VIndex = B.buildConstant(S32, 0).getReg(0); 3212 3213 auto MIB = B.buildInstr(Opc) 3214 .addDef(LoadDstReg) // vdata 3215 .addUse(RSrc) // rsrc 3216 .addUse(VIndex) // vindex 3217 .addUse(VOffset) // voffset 3218 .addUse(SOffset) // soffset 3219 .addImm(ImmOffset); // offset(imm) 3220 3221 if (IsTyped) 3222 MIB.addImm(Format); 3223 3224 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3225 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3226 .addMemOperand(MMO); 3227 3228 if (LoadDstReg != Dst) { 3229 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3230 3231 // Widen result for extending loads was widened. 
3232 if (IsExtLoad) 3233 B.buildTrunc(Dst, LoadDstReg); 3234 else { 3235 // Repack to original 16-bit vector result 3236 // FIXME: G_TRUNC should work, but legalization currently fails 3237 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3238 SmallVector<Register, 4> Repack; 3239 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3240 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3241 B.buildMerge(Dst, Repack); 3242 } 3243 } 3244 3245 MI.eraseFromParent(); 3246 return true; 3247 } 3248 3249 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3250 MachineIRBuilder &B, 3251 bool IsInc) const { 3252 B.setInstr(MI); 3253 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3254 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3255 B.buildInstr(Opc) 3256 .addDef(MI.getOperand(0).getReg()) 3257 .addUse(MI.getOperand(2).getReg()) 3258 .addUse(MI.getOperand(3).getReg()) 3259 .cloneMemRefs(MI); 3260 MI.eraseFromParent(); 3261 return true; 3262 } 3263 3264 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3265 switch (IntrID) { 3266 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3267 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3268 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3269 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3270 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3271 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3272 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3273 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3274 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3275 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3276 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3277 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3278 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3279 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3280 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3281 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3282 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3283 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3284 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3285 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3286 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3287 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3288 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3289 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3290 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3291 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3292 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3293 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3294 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3295 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3296 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3297 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3298 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3299 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3300 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3301 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3302 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3303 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3304 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3305 default: 3306 llvm_unreachable("unhandled atomic opcode"); 3307 } 3308 } 3309 3310 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3311 MachineIRBuilder &B, 3312 Intrinsic::ID IID) const { 3313 B.setInstr(MI); 3314 3315 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3316 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3317 3318 Register Dst = MI.getOperand(0).getReg(); 3319 Register VData = 
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Produce a vector of s16 elements from s32 pieces.
static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
                             ArrayRef<Register> UnmergeParts) {
  const LLT S16 = LLT::scalar(16);

  SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
  for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
    RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

/// Convert a set of s32 registers to a result vector with s16 elements.
static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
                               ArrayRef<Register> UnmergeParts) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT V2S16 = LLT::vector(2, 16);
  LLT TargetTy = MRI.getType(DstReg);
  int NumElts = UnmergeParts.size();

  if (NumElts == 1) {
    assert(TargetTy == V2S16);
    B.buildBitcast(DstReg, UnmergeParts[0]);
    return;
  }

  SmallVector<Register, 4> RemergeParts(NumElts);
  for (int I = 0; I != NumElts; ++I)
    RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);

  if (TargetTy.getSizeInBits() == 32u * NumElts) {
    B.buildConcatVectors(DstReg, RemergeParts);
    return;
  }

  const LLT V3S16 = LLT::vector(3, 16);
  const LLT V6S16 = LLT::vector(6, 16);

  // Widen to v6s16 and unpack v3 parts.
  assert(TargetTy == V3S16);

  RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
  auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
  B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
}

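// Repack the result of an unpacked d16 load, where each 16-bit element
// occupies its own 32-bit register, into the packed vector type expected by
// the IR.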
// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
                                  Register WideDstReg) {
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  auto Unmerge = B.buildUnmerge(S32, WideDstReg);

  int NumOps = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> RemergeParts(NumOps);
  for (int I = 0; I != NumOps; ++I)
    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  bool IsTFE = MI.getNumExplicitDefs() == 2;

  // We only need to process the operands of d16 image operations on
  // subtargets that use the unpacked register layout, or when the TFE result
  // needs to be repacked.

  // TODO: Need to handle a16 images too.
  // TODO: Do we need to guard against already legalized intrinsics?
  if (!IsTFE && !ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
    return true;

  B.setInstr(MI);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) { // No TFE for stores?
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (IsTFE) {
    // In the IR, TFE is supposed to be used with a two-element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
    Register Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.

    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
    LLT RoundedTy;
    LLT TFETy;

    if (IsD16 && ST.hasUnpackedD16VMem()) {
      RoundedTy = LLT::scalarOrVector(NumElts, 32);
      TFETy = LLT::vector(NumElts + 1, 32);
    } else {
      unsigned EltSize = Ty.getScalarSizeInBits();
      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
      unsigned RoundedSize = 32 * RoundedElts;
      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    }

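    // For example, a <3 x s16> result with packed d16 support gives
    // RoundedTy = <4 x s16> and TFETy = <3 x s32> (two dwords of data plus
    // the TFE status dword); with unpacked d16 it gives RoundedTy = <3 x s32>
    // and TFETy = <4 x s32>.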
    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
    Observer.changingInstr(MI);

    MI.getOperand(0).setReg(TFEReg);
    MI.RemoveOperand(1);

    Observer.changedInstr(MI);

    // Insert after the instruction.
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

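// Lower amdgcn.s.buffer.load to the G_AMDGPU_S_BUFFER_LOAD pseudo. The
// intrinsic is readnone and so carries no memory operand; one is synthesized
// here, and non-power-of-2 result sizes are widened.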
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When the intrinsic definition is fixed, this should have an MMO
  // already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128 bits should
  // always be legal. We may need to restore this to a 96-bit result if it
  // turns out this needs to be converted to a vector load during
  // RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target or the trap handler is disabled, insert an
  // s_endpgm instead.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert the
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning instead.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

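// Entry point for custom intrinsic legalization. Intrinsics not handled here
// are treated as already legal.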
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use of the control-flow intrinsics with the
  // exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
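  // The argument intrinsics below are lowered via legalizePreloadedArgIntrin
  // to the corresponding preloaded argument inputs; kernarg.segment.ptr is
  // only meaningful in a kernel, so elsewhere it folds to null.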
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
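  // All raw/struct buffer atomics share a single lowering; the particular
  // pseudo opcode is selected by getBufferAtomicPseudo.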
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}