1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the MachineLegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 30 #include "llvm/CodeGen/TargetOpcodes.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "amdgpu-legalinfo" 38 39 using namespace llvm; 40 using namespace LegalizeActions; 41 using namespace LegalizeMutations; 42 using namespace LegalityPredicates; 43 using namespace MIPatternMatch; 44 45 // Round the number of elements to the next power of two. 46 static LLT getPow2VectorType(LLT Ty) { 47 unsigned NElts = Ty.getNumElements(); 48 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 49 return Ty.changeNumElements(Pow2NElts); 50 } 51 52 // Round the number of bits to the next power of two. 53 static LLT getPow2ScalarType(LLT Ty) { 54 unsigned Bits = Ty.getSizeInBits(); 55 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 56 return LLT::scalar(Pow2Bits); 57 } 58 59 static LegalityPredicate isMultiple32(unsigned TypeIdx, 60 unsigned MaxSize = 1024) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 const LLT EltTy = Ty.getScalarType(); 64 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 65 }; 66 } 67 68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 69 return [=](const LegalityQuery &Query) { 70 return Query.Types[TypeIdx].getSizeInBits() == Size; 71 }; 72 } 73 74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 75 return [=](const LegalityQuery &Query) { 76 const LLT Ty = Query.Types[TypeIdx]; 77 return Ty.isVector() && 78 Ty.getNumElements() % 2 != 0 && 79 Ty.getElementType().getSizeInBits() < 32 && 80 Ty.getSizeInBits() % 32 != 0; 81 }; 82 } 83 84 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 85 return [=](const LegalityQuery &Query) { 86 const LLT Ty = Query.Types[TypeIdx]; 87 const LLT EltTy = Ty.getScalarType(); 88 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 89 }; 90 } 91 92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 93 return [=](const LegalityQuery &Query) { 94 const LLT Ty = Query.Types[TypeIdx]; 95 const LLT EltTy = Ty.getElementType(); 96 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 97 }; 98 } 99 100
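// Reduce a wide vector into pieces of at most 64 bits. As used below behind a
// vectorWiderThan(0, 64) predicate, e.g. v3s32 (96 bits) and v4s32 (128 bits)
// both become v2s32.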
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 101 return [=](const LegalityQuery &Query) { 102 const LLT Ty = Query.Types[TypeIdx]; 103 const LLT EltTy = Ty.getElementType(); 104 unsigned Size = Ty.getSizeInBits(); 105 unsigned Pieces = (Size + 63) / 64; 106 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 107 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 108 }; 109 } 110 111 // Increase the number of vector elements to reach the next multiple of 32-bit 112 // type. 113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 114 return [=](const LegalityQuery &Query) { 115 const LLT Ty = Query.Types[TypeIdx]; 116 117 const LLT EltTy = Ty.getElementType(); 118 const int Size = Ty.getSizeInBits(); 119 const int EltSize = EltTy.getSizeInBits(); 120 const int NextMul32 = (Size + 31) / 32; 121 122 assert(EltSize < 32); 123 124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 125 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 126 }; 127 } 128 129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 133 }; 134 } 135 136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 137 return [=](const LegalityQuery &Query) { 138 const LLT QueryTy = Query.Types[TypeIdx]; 139 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 140 }; 141 } 142 143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 144 return [=](const LegalityQuery &Query) { 145 const LLT QueryTy = Query.Types[TypeIdx]; 146 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 147 }; 148 } 149 150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 151 // v2s16. 
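// (e.g. s64, s96, v2s16 and v4s32 all qualify, while s40 and v3s8 do not).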
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 if (Ty.isVector()) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 163 }; 164 } 165 166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 167 return [=](const LegalityQuery &Query) { 168 const LLT QueryTy = Query.Types[TypeIdx]; 169 return QueryTy.isVector() && QueryTy.getElementType() == Type; 170 }; 171 } 172 173 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 174 return [=](const LegalityQuery &Query) { 175 const LLT QueryTy = Query.Types[TypeIdx]; 176 if (!QueryTy.isVector()) 177 return false; 178 const LLT EltTy = QueryTy.getElementType(); 179 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 180 }; 181 } 182 183 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 184 return [=](const LegalityQuery &Query) { 185 const LLT Ty = Query.Types[TypeIdx]; 186 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 187 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 188 }; 189 } 190 191 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 192 return [=](const LegalityQuery &Query) { 193 return Query.Types[TypeIdx0].getSizeInBits() < 194 Query.Types[TypeIdx1].getSizeInBits(); 195 }; 196 } 197 198 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 199 return [=](const LegalityQuery &Query) { 200 return Query.Types[TypeIdx0].getSizeInBits() > 201 Query.Types[TypeIdx1].getSizeInBits(); 202 }; 203 } 204 205 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 206 const GCNTargetMachine &TM) 207 : ST(ST_) { 208 using namespace TargetOpcode; 209 210 auto GetAddrSpacePtr = [&TM](unsigned AS) { 211 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 212 }; 213 214 const LLT S1 = LLT::scalar(1); 215 const LLT S16 = LLT::scalar(16); 216 const LLT S32 = LLT::scalar(32); 217 const LLT S64 = LLT::scalar(64); 218 const LLT S128 = LLT::scalar(128); 219 const LLT S256 = LLT::scalar(256); 220 const LLT S512 = LLT::scalar(512); 221 const LLT S1024 = LLT::scalar(1024); 222 223 const LLT V2S16 = LLT::vector(2, 16); 224 const LLT V4S16 = LLT::vector(4, 16); 225 226 const LLT V2S32 = LLT::vector(2, 32); 227 const LLT V3S32 = LLT::vector(3, 32); 228 const LLT V4S32 = LLT::vector(4, 32); 229 const LLT V5S32 = LLT::vector(5, 32); 230 const LLT V6S32 = LLT::vector(6, 32); 231 const LLT V7S32 = LLT::vector(7, 32); 232 const LLT V8S32 = LLT::vector(8, 32); 233 const LLT V9S32 = LLT::vector(9, 32); 234 const LLT V10S32 = LLT::vector(10, 32); 235 const LLT V11S32 = LLT::vector(11, 32); 236 const LLT V12S32 = LLT::vector(12, 32); 237 const LLT V13S32 = LLT::vector(13, 32); 238 const LLT V14S32 = LLT::vector(14, 32); 239 const LLT V15S32 = LLT::vector(15, 32); 240 const LLT V16S32 = LLT::vector(16, 32); 241 const LLT V32S32 = LLT::vector(32, 32); 242 243 const LLT V2S64 = LLT::vector(2, 64); 244 const LLT V3S64 = LLT::vector(3, 64); 245 const LLT V4S64 = LLT::vector(4, 64); 246 const LLT V5S64 = LLT::vector(5, 64); 247 const LLT V6S64 = LLT::vector(6, 64); 248 const LLT V7S64 = LLT::vector(7, 64); 249 const LLT V8S64 = LLT::vector(8, 64); 250 const LLT V16S64 = LLT::vector(16, 64); 251 252 std::initializer_list<LLT> 
AllS32Vectors = 253 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 254 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 255 std::initializer_list<LLT> AllS64Vectors = 256 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 257 258 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 259 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 260 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 261 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 262 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 263 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 264 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 265 266 const LLT CodePtr = FlatPtr; 267 268 const std::initializer_list<LLT> AddrSpaces64 = { 269 GlobalPtr, ConstantPtr, FlatPtr 270 }; 271 272 const std::initializer_list<LLT> AddrSpaces32 = { 273 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 274 }; 275 276 const std::initializer_list<LLT> FPTypesBase = { 277 S32, S64 278 }; 279 280 const std::initializer_list<LLT> FPTypes16 = { 281 S32, S64, S16 282 }; 283 284 const std::initializer_list<LLT> FPTypesPK16 = { 285 S32, S64, S16, V2S16 286 }; 287 288 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 289 290 setAction({G_BRCOND, S1}, Legal); // VCC branches 291 setAction({G_BRCOND, S32}, Legal); // SCC branches 292 293 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 294 // elements for v3s16 295 getActionDefinitionsBuilder(G_PHI) 296 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 297 .legalFor(AllS32Vectors) 298 .legalFor(AllS64Vectors) 299 .legalFor(AddrSpaces64) 300 .legalFor(AddrSpaces32) 301 .clampScalar(0, S32, S256) 302 .widenScalarToNextPow2(0, 32) 303 .clampMaxNumElements(0, S32, 16) 304 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 305 .legalIf(isPointer(0)); 306 307 if (ST.hasVOP3PInsts()) { 308 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 309 .legalFor({S32, S16, V2S16}) 310 .clampScalar(0, S16, S32) 311 .clampMaxNumElements(0, S16, 2) 312 .scalarize(0) 313 .widenScalarToNextPow2(0, 32); 314 } else if (ST.has16BitInsts()) { 315 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 316 .legalFor({S32, S16}) 317 .clampScalar(0, S16, S32) 318 .scalarize(0) 319 .widenScalarToNextPow2(0, 32); 320 } else { 321 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 322 .legalFor({S32}) 323 .clampScalar(0, S32, S32) 324 .scalarize(0); 325 } 326 327 // FIXME: Not really legal. Placeholder for custom lowering. 328 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 329 .customFor({S32, S64}) 330 .clampScalar(0, S32, S64) 331 .widenScalarToNextPow2(0, 32) 332 .scalarize(0); 333 334 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 335 .legalFor({S32}) 336 .clampScalar(0, S32, S32) 337 .scalarize(0); 338 339 // Report legal for any types we can handle anywhere. For the cases only legal 340 // on the SALU, RegBankSelect will be able to re-legalize. 
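// For example, s64 G_AND stays legal because the SALU has 64-bit bitwise
// instructions (s_and_b64 and friends); a VALU-assigned result is instead
// split into two 32-bit halves by RegBankSelect.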
341 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 342 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 343 .clampScalar(0, S32, S64) 344 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 345 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 346 .widenScalarToNextPow2(0) 347 .scalarize(0); 348 349 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 350 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 351 .legalFor({{S32, S1}, {S32, S32}}) 352 .minScalar(0, S32) 353 // TODO: .scalarize(0) 354 .lower(); 355 356 getActionDefinitionsBuilder(G_BITCAST) 357 // Don't worry about the size constraint. 358 .legalIf(all(isRegisterType(0), isRegisterType(1))) 359 .lower(); 360 361 362 getActionDefinitionsBuilder(G_CONSTANT) 363 .legalFor({S1, S32, S64, S16, GlobalPtr, 364 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 365 .clampScalar(0, S32, S64) 366 .widenScalarToNextPow2(0) 367 .legalIf(isPointer(0)); 368 369 getActionDefinitionsBuilder(G_FCONSTANT) 370 .legalFor({S32, S64, S16}) 371 .clampScalar(0, S16, S64); 372 373 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 374 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 375 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 376 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 377 .clampScalarOrElt(0, S32, S1024) 378 .legalIf(isMultiple32(0)) 379 .widenScalarToNextPow2(0, 32) 380 .clampMaxNumElements(0, S32, 16); 381 382 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 383 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 384 .unsupportedFor({PrivatePtr}) 385 .custom(); 386 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 387 388 auto &FPOpActions = getActionDefinitionsBuilder( 389 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 390 .legalFor({S32, S64}); 391 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 392 .customFor({S32, S64}); 393 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 394 .customFor({S32, S64}); 395 396 if (ST.has16BitInsts()) { 397 if (ST.hasVOP3PInsts()) 398 FPOpActions.legalFor({S16, V2S16}); 399 else 400 FPOpActions.legalFor({S16}); 401 402 TrigActions.customFor({S16}); 403 FDIVActions.customFor({S16}); 404 } 405 406 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 407 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 408 409 if (ST.hasVOP3PInsts()) { 410 MinNumMaxNum.customFor(FPTypesPK16) 411 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 412 .clampMaxNumElements(0, S16, 2) 413 .clampScalar(0, S16, S64) 414 .scalarize(0); 415 } else if (ST.has16BitInsts()) { 416 MinNumMaxNum.customFor(FPTypes16) 417 .clampScalar(0, S16, S64) 418 .scalarize(0); 419 } else { 420 MinNumMaxNum.customFor(FPTypesBase) 421 .clampScalar(0, S32, S64) 422 .scalarize(0); 423 } 424 425 if (ST.hasVOP3PInsts()) 426 FPOpActions.clampMaxNumElements(0, S16, 2); 427 428 FPOpActions 429 .scalarize(0) 430 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 431 432 TrigActions 433 .scalarize(0) 434 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 435 436 FDIVActions 437 .scalarize(0) 438 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 439 440 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 441 .legalFor(FPTypesPK16) 442 .clampMaxNumElements(0, S16, 2) 443 .scalarize(0) 444 .clampScalar(0, S16, S64); 445 446 if (ST.has16BitInsts()) { 447 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 448 .legalFor({S32, S64, S16}) 449 .scalarize(0) 450 .clampScalar(0, S16, S64); 451 } else { 452 getActionDefinitionsBuilder(G_FSQRT) 453 .legalFor({S32, S64}) 454 .scalarize(0) 455 .clampScalar(0, S32, S64); 456 457 if (ST.hasFractBug()) { 458 getActionDefinitionsBuilder(G_FFLOOR) 459 .customFor({S64}) 460 .legalFor({S32, S64}) 461 .scalarize(0) 462 .clampScalar(0, S32, S64); 463 } else { 464 getActionDefinitionsBuilder(G_FFLOOR) 465 .legalFor({S32, S64}) 466 .scalarize(0) 467 .clampScalar(0, S32, S64); 468 } 469 } 470 471 getActionDefinitionsBuilder(G_FPTRUNC) 472 .legalFor({{S32, S64}, {S16, S32}}) 473 .scalarize(0) 474 .lower(); 475 476 getActionDefinitionsBuilder(G_FPEXT) 477 .legalFor({{S64, S32}, {S32, S16}}) 478 .lowerFor({{S64, S16}}) // FIXME: Implement 479 .scalarize(0); 480 481 getActionDefinitionsBuilder(G_FSUB) 482 // Use actual fsub instruction 483 .legalFor({S32}) 484 // Must use fadd + fneg 485 .lowerFor({S64, S16, V2S16}) 486 .scalarize(0) 487 .clampScalar(0, S32, S64); 488 489 // Whether this is legal depends on the floating point mode for the function. 490 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 491 if (ST.hasMadF16()) 492 FMad.customFor({S32, S16}); 493 else 494 FMad.customFor({S32}); 495 FMad.scalarize(0) 496 .lower(); 497 498 // TODO: Do we need to clamp maximum bitwidth? 499 getActionDefinitionsBuilder(G_TRUNC) 500 .legalIf(isScalar(0)) 501 .legalFor({{V2S16, V2S32}}) 502 .clampMaxNumElements(0, S16, 2) 503 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 504 // situations (like an invalid implicit use), we don't want to infinite loop 505 // in the legalizer. 506 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 507 .alwaysLegal(); 508 509 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 510 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 511 {S32, S1}, {S64, S1}, {S16, S1}}) 512 .scalarize(0) 513 .clampScalar(0, S32, S64) 514 .widenScalarToNextPow2(1, 32); 515 516 // TODO: Split s1->s64 during regbankselect for VALU. 
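// For s64 sources the custom expansion (legalizeITOFP below) is roughly:
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src
//   %dst:_(s64) = G_FADD (ldexp (G_[SU]ITOFP %hi), 32), (G_UITOFP %lo)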
517 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 518 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 519 .lowerFor({{S32, S64}}) 520 .lowerIf(typeIs(1, S1)) 521 .customFor({{S64, S64}}); 522 if (ST.has16BitInsts()) 523 IToFP.legalFor({{S16, S16}}); 524 IToFP.clampScalar(1, S32, S64) 525 .scalarize(0) 526 .widenScalarToNextPow2(1); 527 528 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 529 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 530 .customFor({{S64, S64}}); 531 if (ST.has16BitInsts()) 532 FPToI.legalFor({{S16, S16}}); 533 else 534 FPToI.minScalar(1, S32); 535 536 FPToI.minScalar(0, S32) 537 .scalarize(0) 538 .lower(); 539 540 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 541 .scalarize(0) 542 .lower(); 543 544 if (ST.has16BitInsts()) { 545 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 546 .legalFor({S16, S32, S64}) 547 .clampScalar(0, S16, S64) 548 .scalarize(0); 549 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 550 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 551 .legalFor({S32, S64}) 552 .clampScalar(0, S32, S64) 553 .scalarize(0); 554 } else { 555 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 556 .legalFor({S32}) 557 .customFor({S64}) 558 .clampScalar(0, S32, S64) 559 .scalarize(0); 560 } 561 562 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 563 .scalarize(0) 564 .alwaysLegal(); 565 566 auto &CmpBuilder = 567 getActionDefinitionsBuilder(G_ICMP) 568 // The compare output type differs based on the register bank of the output, 569 // so make both s1 and s32 legal. 570 // 571 // Scalar compares producing output in scc will be promoted to s32, as that 572 // is the allocatable register type that will be needed for the copy from 573 // scc. This will be promoted during RegBankSelect, and we assume something 574 // before that won't try to use s32 result types. 575 // 576 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 577 // bank. 578 .legalForCartesianProduct( 579 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 580 .legalForCartesianProduct( 581 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 582 if (ST.has16BitInsts()) { 583 CmpBuilder.legalFor({{S1, S16}}); 584 } 585 586 CmpBuilder 587 .widenScalarToNextPow2(1) 588 .clampScalar(1, S32, S64) 589 .scalarize(0) 590 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 591 592 getActionDefinitionsBuilder(G_FCMP) 593 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 594 .widenScalarToNextPow2(1) 595 .clampScalar(1, S32, S64) 596 .scalarize(0); 597 598 // FIXME: fpow has a selection pattern that should move to custom lowering. 599 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 600 if (ST.has16BitInsts()) 601 Exp2Ops.legalFor({S32, S16}); 602 else 603 Exp2Ops.legalFor({S32}); 604 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 605 Exp2Ops.scalarize(0); 606 607 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 608 if (ST.has16BitInsts()) 609 ExpOps.customFor({{S32}, {S16}}); 610 else 611 ExpOps.customFor({S32}); 612 ExpOps.clampScalar(0, MinScalarFPTy, S32) 613 .scalarize(0); 614 615 // The 64-bit versions produce 32-bit results, but only on the SALU. 
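// (e.g. s_bcnt1_i32_b64 for the 64-bit source case).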
616 getActionDefinitionsBuilder(G_CTPOP) 617 .legalFor({{S32, S32}, {S32, S64}}) 618 .clampScalar(0, S32, S32) 619 .clampScalar(1, S32, S64) 620 .scalarize(0) 621 .widenScalarToNextPow2(0, 32) 622 .widenScalarToNextPow2(1, 32); 623 624 // The hardware instructions return a different result on 0 than the generic 625 // instructions expect. The hardware produces -1, but these produce the 626 // bitwidth. 627 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 628 .scalarize(0) 629 .clampScalar(0, S32, S32) 630 .clampScalar(1, S32, S64) 631 .widenScalarToNextPow2(0, 32) 632 .widenScalarToNextPow2(1, 32) 633 .lower(); 634 635 // The 64-bit versions produce 32-bit results, but only on the SALU. 636 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 637 .legalFor({{S32, S32}, {S32, S64}}) 638 .clampScalar(0, S32, S32) 639 .clampScalar(1, S32, S64) 640 .scalarize(0) 641 .widenScalarToNextPow2(0, 32) 642 .widenScalarToNextPow2(1, 32); 643 644 getActionDefinitionsBuilder(G_BITREVERSE) 645 .legalFor({S32}) 646 .clampScalar(0, S32, S32) 647 .scalarize(0); 648 649 if (ST.has16BitInsts()) { 650 getActionDefinitionsBuilder(G_BSWAP) 651 .legalFor({S16, S32, V2S16}) 652 .clampMaxNumElements(0, S16, 2) 653 // FIXME: Fixing non-power-of-2 before clamp is workaround for 654 // narrowScalar limitation. 655 .widenScalarToNextPow2(0) 656 .clampScalar(0, S16, S32) 657 .scalarize(0); 658 659 if (ST.hasVOP3PInsts()) { 660 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 661 .legalFor({S32, S16, V2S16}) 662 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 663 .clampMaxNumElements(0, S16, 2) 664 .minScalar(0, S16) 665 .widenScalarToNextPow2(0) 666 .scalarize(0) 667 .lower(); 668 } else { 669 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 670 .legalFor({S32, S16}) 671 .widenScalarToNextPow2(0) 672 .minScalar(0, S16) 673 .scalarize(0) 674 .lower(); 675 } 676 } else { 677 // TODO: Should have same legality without v_perm_b32 678 getActionDefinitionsBuilder(G_BSWAP) 679 .legalFor({S32}) 680 .lowerIf(narrowerThan(0, 32)) 681 // FIXME: Fixing non-power-of-2 before clamp is workaround for 682 // narrowScalar limitation. 
683 .widenScalarToNextPow2(0) 684 .maxScalar(0, S32) 685 .scalarize(0) 686 .lower(); 687 688 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 689 .legalFor({S32}) 690 .minScalar(0, S32) 691 .widenScalarToNextPow2(0) 692 .scalarize(0) 693 .lower(); 694 } 695 696 getActionDefinitionsBuilder(G_INTTOPTR) 697 // List the common cases 698 .legalForCartesianProduct(AddrSpaces64, {S64}) 699 .legalForCartesianProduct(AddrSpaces32, {S32}) 700 .scalarize(0) 701 // Accept any address space as long as the size matches 702 .legalIf(sameSize(0, 1)) 703 .widenScalarIf(smallerThan(1, 0), 704 [](const LegalityQuery &Query) { 705 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 706 }) 707 .narrowScalarIf(greaterThan(1, 0), 708 [](const LegalityQuery &Query) { 709 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 710 }); 711 712 getActionDefinitionsBuilder(G_PTRTOINT) 713 // List the common cases 714 .legalForCartesianProduct(AddrSpaces64, {S64}) 715 .legalForCartesianProduct(AddrSpaces32, {S32}) 716 .scalarize(0) 717 // Accept any address space as long as the size matches 718 .legalIf(sameSize(0, 1)) 719 .widenScalarIf(smallerThan(0, 1), 720 [](const LegalityQuery &Query) { 721 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 722 }) 723 .narrowScalarIf( 724 greaterThan(0, 1), 725 [](const LegalityQuery &Query) { 726 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 727 }); 728 729 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 730 .scalarize(0) 731 .custom(); 732 733 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 734 // handle some operations by just promoting the register during 735 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 736 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 737 switch (AS) { 738 // FIXME: Private element size. 739 case AMDGPUAS::PRIVATE_ADDRESS: 740 return 32; 741 // FIXME: Check subtarget 742 case AMDGPUAS::LOCAL_ADDRESS: 743 return ST.useDS128() ? 128 : 64; 744 745 // Treat constant and global as identical. SMRD loads are sometimes usable 746 // for global loads (ideally constant address space should be eliminated) 747 // depending on the context. Legality cannot be context dependent, but 748 // RegBankSelect can split the load as necessary depending on the pointer 749 // register bank/uniformity and if the memory is invariant or not written in 750 // a kernel. 751 case AMDGPUAS::CONSTANT_ADDRESS: 752 case AMDGPUAS::GLOBAL_ADDRESS: 753 return IsLoad ? 512 : 128; 754 default: 755 return 128; 756 } 757 }; 758 759 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 760 bool IsLoad) -> bool { 761 const LLT DstTy = Query.Types[0]; 762 763 // Split vector extloads. 764 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 765 unsigned Align = Query.MMODescrs[0].AlignInBits; 766 767 if (MemSize < DstTy.getSizeInBits()) 768 MemSize = std::max(MemSize, Align); 769 770 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 771 return true; 772 773 const LLT PtrTy = Query.Types[1]; 774 unsigned AS = PtrTy.getAddressSpace(); 775 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 776 return true; 777 778 // Catch weird sized loads that don't evenly divide into the access sizes 779 // TODO: May be able to widen depending on alignment etc. 
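// e.g. a 96-bit (3-dword) access is only kept whole when the subtarget has
// dwordx3 load/store support; other non-power-of-2 dword counts force a split.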
780 unsigned NumRegs = (MemSize + 31) / 32; 781 if (NumRegs == 3) { 782 if (!ST.hasDwordx3LoadStores()) 783 return true; 784 } else { 785 // If the alignment allows, these should have been widened. 786 if (!isPowerOf2_32(NumRegs)) 787 return true; 788 } 789 790 if (Align < MemSize) { 791 const SITargetLowering *TLI = ST.getTargetLowering(); 792 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 793 } 794 795 return false; 796 }; 797 798 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 799 unsigned Size = Query.Types[0].getSizeInBits(); 800 if (isPowerOf2_32(Size)) 801 return false; 802 803 if (Size == 96 && ST.hasDwordx3LoadStores()) 804 return false; 805 806 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 807 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 808 return false; 809 810 unsigned Align = Query.MMODescrs[0].AlignInBits; 811 unsigned RoundedSize = NextPowerOf2(Size); 812 return (Align >= RoundedSize); 813 }; 814 815 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 816 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 817 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 818 819 // TODO: Refine based on subtargets which support unaligned access or 128-bit 820 // LDS 821 // TODO: Unsupported flat for SI. 822 823 for (unsigned Op : {G_LOAD, G_STORE}) { 824 const bool IsStore = Op == G_STORE; 825 826 auto &Actions = getActionDefinitionsBuilder(Op); 827 // Whitelist the common cases. 828 // TODO: Loads to s16 on gfx9 829 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 830 {V2S32, GlobalPtr, 64, GlobalAlign32}, 831 {V4S32, GlobalPtr, 128, GlobalAlign32}, 832 {S128, GlobalPtr, 128, GlobalAlign32}, 833 {S64, GlobalPtr, 64, GlobalAlign32}, 834 {V2S64, GlobalPtr, 128, GlobalAlign32}, 835 {V2S16, GlobalPtr, 32, GlobalAlign32}, 836 {S32, GlobalPtr, 8, GlobalAlign8}, 837 {S32, GlobalPtr, 16, GlobalAlign16}, 838 839 {S32, LocalPtr, 32, 32}, 840 {S64, LocalPtr, 64, 32}, 841 {V2S32, LocalPtr, 64, 32}, 842 {S32, LocalPtr, 8, 8}, 843 {S32, LocalPtr, 16, 16}, 844 {V2S16, LocalPtr, 32, 32}, 845 846 {S32, PrivatePtr, 32, 32}, 847 {S32, PrivatePtr, 8, 8}, 848 {S32, PrivatePtr, 16, 16}, 849 {V2S16, PrivatePtr, 32, 32}, 850 851 {S32, FlatPtr, 32, GlobalAlign32}, 852 {S32, FlatPtr, 16, GlobalAlign16}, 853 {S32, FlatPtr, 8, GlobalAlign8}, 854 {V2S16, FlatPtr, 32, GlobalAlign32}, 855 856 {S32, ConstantPtr, 32, GlobalAlign32}, 857 {V2S32, ConstantPtr, 64, GlobalAlign32}, 858 {V4S32, ConstantPtr, 128, GlobalAlign32}, 859 {S64, ConstantPtr, 64, GlobalAlign32}, 860 {S128, ConstantPtr, 128, GlobalAlign32}, 861 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 862 Actions 863 .customIf(typeIs(1, Constant32Ptr)) 864 // Widen suitably aligned loads by loading extra elements. 
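// (e.g. on subtargets without dwordx3 loads, a 128-bit aligned <3 x s32>
// global load can be widened to <4 x s32> rather than split).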
865 .moreElementsIf([=](const LegalityQuery &Query) { 866 const LLT Ty = Query.Types[0]; 867 return Op == G_LOAD && Ty.isVector() && 868 shouldWidenLoadResult(Query); 869 }, moreElementsToNextPow2(0)) 870 .widenScalarIf([=](const LegalityQuery &Query) { 871 const LLT Ty = Query.Types[0]; 872 return Op == G_LOAD && !Ty.isVector() && 873 shouldWidenLoadResult(Query); 874 }, widenScalarOrEltToNextPow2(0)) 875 .narrowScalarIf( 876 [=](const LegalityQuery &Query) -> bool { 877 return !Query.Types[0].isVector() && 878 needToSplitMemOp(Query, Op == G_LOAD); 879 }, 880 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 881 const LLT DstTy = Query.Types[0]; 882 const LLT PtrTy = Query.Types[1]; 883 884 const unsigned DstSize = DstTy.getSizeInBits(); 885 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 886 887 // Split extloads. 888 if (DstSize > MemSize) 889 return std::make_pair(0, LLT::scalar(MemSize)); 890 891 if (!isPowerOf2_32(DstSize)) { 892 // We're probably decomposing an odd sized store. Try to split 893 // to the widest type. TODO: Account for alignment. As-is it 894 // should be OK, since the new parts will be further legalized. 895 unsigned FloorSize = PowerOf2Floor(DstSize); 896 return std::make_pair(0, LLT::scalar(FloorSize)); 897 } 898 899 if (DstSize > 32 && (DstSize % 32 != 0)) { 900 // FIXME: Need a way to specify non-extload of larger size if 901 // suitably aligned. 902 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 903 } 904 905 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 906 Op == G_LOAD); 907 if (MemSize > MaxSize) 908 return std::make_pair(0, LLT::scalar(MaxSize)); 909 910 unsigned Align = Query.MMODescrs[0].AlignInBits; 911 return std::make_pair(0, LLT::scalar(Align)); 912 }) 913 .fewerElementsIf( 914 [=](const LegalityQuery &Query) -> bool { 915 return Query.Types[0].isVector() && 916 needToSplitMemOp(Query, Op == G_LOAD); 917 }, 918 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 919 const LLT DstTy = Query.Types[0]; 920 const LLT PtrTy = Query.Types[1]; 921 922 LLT EltTy = DstTy.getElementType(); 923 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 924 Op == G_LOAD); 925 926 // FIXME: Handle widened to power of 2 results better. This ends 927 // up scalarizing. 928 // FIXME: 3 element stores scalarized on SI 929 930 // Split if it's too large for the address space. 931 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 932 unsigned NumElts = DstTy.getNumElements(); 933 unsigned EltSize = EltTy.getSizeInBits(); 934 935 if (MaxSize % EltSize == 0) { 936 return std::make_pair( 937 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 938 } 939 940 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 941 942 // FIXME: Refine when odd breakdowns handled 943 // The scalars will need to be re-legalized. 944 if (NumPieces == 1 || NumPieces >= NumElts || 945 NumElts % NumPieces != 0) 946 return std::make_pair(0, EltTy); 947 948 return std::make_pair(0, 949 LLT::vector(NumElts / NumPieces, EltTy)); 950 } 951 952 // FIXME: We could probably handle weird extending loads better. 953 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 954 if (DstTy.getSizeInBits() > MemSize) 955 return std::make_pair(0, EltTy); 956 957 unsigned EltSize = EltTy.getSizeInBits(); 958 unsigned DstSize = DstTy.getSizeInBits(); 959 if (!isPowerOf2_32(DstSize)) { 960 // We're probably decomposing an odd sized store. Try to split 961 // to the widest type. TODO: Account for alignment. 
As-is it 962 // should be OK, since the new parts will be further legalized. 963 unsigned FloorSize = PowerOf2Floor(DstSize); 964 return std::make_pair( 965 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 966 } 967 968 // Need to split because of alignment. 969 unsigned Align = Query.MMODescrs[0].AlignInBits; 970 if (EltSize > Align && 971 (EltSize / Align < DstTy.getNumElements())) { 972 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 973 } 974 975 // May need relegalization for the scalars. 976 return std::make_pair(0, EltTy); 977 }) 978 .minScalar(0, S32); 979 980 if (IsStore) 981 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 982 983 // TODO: Need a bitcast lower option? 984 Actions 985 .legalIf([=](const LegalityQuery &Query) { 986 const LLT Ty0 = Query.Types[0]; 987 unsigned Size = Ty0.getSizeInBits(); 988 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 989 unsigned Align = Query.MMODescrs[0].AlignInBits; 990 991 // FIXME: Widening store from alignment not valid. 992 if (MemSize < Size) 993 MemSize = std::max(MemSize, Align); 994 995 // No extending vector loads. 996 if (Size > MemSize && Ty0.isVector()) 997 return false; 998 999 switch (MemSize) { 1000 case 8: 1001 case 16: 1002 return Size == 32; 1003 case 32: 1004 case 64: 1005 case 128: 1006 return true; 1007 case 96: 1008 return ST.hasDwordx3LoadStores(); 1009 case 256: 1010 case 512: 1011 return true; 1012 default: 1013 return false; 1014 } 1015 }) 1016 .widenScalarToNextPow2(0) 1017 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1018 } 1019 1020 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1021 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1022 {S32, GlobalPtr, 16, 2 * 8}, 1023 {S32, LocalPtr, 8, 8}, 1024 {S32, LocalPtr, 16, 16}, 1025 {S32, PrivatePtr, 8, 8}, 1026 {S32, PrivatePtr, 16, 16}, 1027 {S32, ConstantPtr, 8, 8}, 1028 {S32, ConstantPtr, 16, 2 * 8}}); 1029 if (ST.hasFlatAddressSpace()) { 1030 ExtLoads.legalForTypesWithMemDesc( 1031 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1032 } 1033 1034 ExtLoads.clampScalar(0, S32, S32) 1035 .widenScalarToNextPow2(0) 1036 .unsupportedIfMemSizeNotPow2() 1037 .lower(); 1038 1039 auto &Atomics = getActionDefinitionsBuilder( 1040 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1041 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1042 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1043 G_ATOMICRMW_UMIN}) 1044 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1045 {S64, GlobalPtr}, {S64, LocalPtr}}); 1046 if (ST.hasFlatAddressSpace()) { 1047 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1048 } 1049 1050 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1051 .legalFor({{S32, LocalPtr}}); 1052 1053 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1054 // demarshalling 1055 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1056 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1057 {S32, FlatPtr}, {S64, FlatPtr}}) 1058 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1059 {S32, RegionPtr}, {S64, RegionPtr}}); 1060 // TODO: Pointer types, any 32-bit or 64-bit vector 1061 1062 // Condition should be s32 for scalar, s1 for vector. 
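// This mirrors the G_ICMP rules above: an SCC-produced condition is copied
// into a 32-bit SGPR, while a VALU condition is an s1 in VCC.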
1063 getActionDefinitionsBuilder(G_SELECT) 1064 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1065 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1066 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1067 .clampScalar(0, S16, S64) 1068 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1069 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1070 .scalarize(1) 1071 .clampMaxNumElements(0, S32, 2) 1072 .clampMaxNumElements(0, LocalPtr, 2) 1073 .clampMaxNumElements(0, PrivatePtr, 2) 1074 .scalarize(0) 1075 .widenScalarToNextPow2(0) 1076 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1077 1078 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1079 // be more flexible with the shift amount type. 1080 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1081 .legalFor({{S32, S32}, {S64, S32}}); 1082 if (ST.has16BitInsts()) { 1083 if (ST.hasVOP3PInsts()) { 1084 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1085 .clampMaxNumElements(0, S16, 2); 1086 } else 1087 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1088 1089 // TODO: Support 16-bit shift amounts 1090 Shifts.clampScalar(1, S32, S32); 1091 Shifts.clampScalar(0, S16, S64); 1092 Shifts.widenScalarToNextPow2(0, 16); 1093 } else { 1094 // Make sure we legalize the shift amount type first, as the general 1095 // expansion for the shifted type will produce much worse code if it hasn't 1096 // been truncated already. 1097 Shifts.clampScalar(1, S32, S32); 1098 Shifts.clampScalar(0, S32, S64); 1099 Shifts.widenScalarToNextPow2(0, 32); 1100 } 1101 Shifts.scalarize(0); 1102 1103 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1104 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1105 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1106 unsigned IdxTypeIdx = 2; 1107 1108 getActionDefinitionsBuilder(Op) 1109 .customIf([=](const LegalityQuery &Query) { 1110 const LLT EltTy = Query.Types[EltTypeIdx]; 1111 const LLT VecTy = Query.Types[VecTypeIdx]; 1112 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1113 return (EltTy.getSizeInBits() == 16 || 1114 EltTy.getSizeInBits() % 32 == 0) && 1115 VecTy.getSizeInBits() % 32 == 0 && 1116 VecTy.getSizeInBits() <= 1024 && 1117 IdxTy.getSizeInBits() == 32; 1118 }) 1119 .clampScalar(EltTypeIdx, S32, S64) 1120 .clampScalar(VecTypeIdx, S32, S64) 1121 .clampScalar(IdxTypeIdx, S32, S32); 1122 } 1123 1124 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1125 .unsupportedIf([=](const LegalityQuery &Query) { 1126 const LLT &EltTy = Query.Types[1].getElementType(); 1127 return Query.Types[0] != EltTy; 1128 }); 1129 1130 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1131 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1132 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1133 1134 // FIXME: Doesn't handle extract of illegal sizes. 1135 getActionDefinitionsBuilder(Op) 1136 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1137 // FIXME: Multiples of 16 should not be legal. 
1138 .legalIf([=](const LegalityQuery &Query) { 1139 const LLT BigTy = Query.Types[BigTyIdx]; 1140 const LLT LitTy = Query.Types[LitTyIdx]; 1141 return (BigTy.getSizeInBits() % 32 == 0) && 1142 (LitTy.getSizeInBits() % 16 == 0); 1143 }) 1144 .widenScalarIf( 1145 [=](const LegalityQuery &Query) { 1146 const LLT BigTy = Query.Types[BigTyIdx]; 1147 return (BigTy.getScalarSizeInBits() < 16); 1148 }, 1149 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1150 .widenScalarIf( 1151 [=](const LegalityQuery &Query) { 1152 const LLT LitTy = Query.Types[LitTyIdx]; 1153 return (LitTy.getScalarSizeInBits() < 16); 1154 }, 1155 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1156 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1157 .widenScalarToNextPow2(BigTyIdx, 32); 1158 1159 } 1160 1161 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1162 .legalForCartesianProduct(AllS32Vectors, {S32}) 1163 .legalForCartesianProduct(AllS64Vectors, {S64}) 1164 .clampNumElements(0, V16S32, V32S32) 1165 .clampNumElements(0, V2S64, V16S64) 1166 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1167 1168 if (ST.hasScalarPackInsts()) { 1169 BuildVector 1170 // FIXME: Should probably widen s1 vectors straight to s32 1171 .minScalarOrElt(0, S16) 1172 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1173 .minScalar(1, S32); 1174 1175 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1176 .legalFor({V2S16, S32}) 1177 .lower(); 1178 BuildVector.minScalarOrElt(0, S32); 1179 } else { 1180 BuildVector.customFor({V2S16, S16}); 1181 BuildVector.minScalarOrElt(0, S32); 1182 1183 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1184 .customFor({V2S16, S32}) 1185 .lower(); 1186 } 1187 1188 BuildVector.legalIf(isRegisterType(0)); 1189 1190 // FIXME: Clamp maximum size 1191 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1192 .legalIf(isRegisterType(0)); 1193 1194 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1195 // pre-legalize. 1196 if (ST.hasVOP3PInsts()) { 1197 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1198 .customFor({V2S16, V2S16}) 1199 .lower(); 1200 } else 1201 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1202 1203 // Merge/Unmerge 1204 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1205 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1206 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1207 1208 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1209 const LLT Ty = Query.Types[TypeIdx]; 1210 if (Ty.isVector()) { 1211 const LLT &EltTy = Ty.getElementType(); 1212 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1213 return true; 1214 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1215 return true; 1216 } 1217 return false; 1218 }; 1219 1220 auto &Builder = getActionDefinitionsBuilder(Op) 1221 // Try to widen to s16 first for small types. 1222 // TODO: Only do this on targets with legal s16 shifts 1223 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1224 1225 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1226 .lowerFor({{S16, V2S16}}) 1227 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1228 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1229 elementTypeIs(1, S16)), 1230 changeTo(1, V2S16)) 1231 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1232 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1233 // valid.
1234 .clampScalar(LitTyIdx, S32, S512) 1235 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1236 // Break up vectors with weird elements into scalars 1237 .fewerElementsIf( 1238 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1239 scalarize(0)) 1240 .fewerElementsIf( 1241 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1242 scalarize(1)) 1243 .clampScalar(BigTyIdx, S32, S1024); 1244 1245 if (Op == G_MERGE_VALUES) { 1246 Builder.widenScalarIf( 1247 // TODO: Use 16-bit shifts if legal for 8-bit values? 1248 [=](const LegalityQuery &Query) { 1249 const LLT Ty = Query.Types[LitTyIdx]; 1250 return Ty.getSizeInBits() < 32; 1251 }, 1252 changeTo(LitTyIdx, S32)); 1253 } 1254 1255 Builder.widenScalarIf( 1256 [=](const LegalityQuery &Query) { 1257 const LLT Ty = Query.Types[BigTyIdx]; 1258 return !isPowerOf2_32(Ty.getSizeInBits()) && 1259 Ty.getSizeInBits() % 16 != 0; 1260 }, 1261 [=](const LegalityQuery &Query) { 1262 // Pick the next power of 2, or a multiple of 64 over 128, 1263 // whichever is smaller. 1264 const LLT &Ty = Query.Types[BigTyIdx]; 1265 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1266 if (NewSizeInBits >= 256) { 1267 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1268 if (RoundedTo < NewSizeInBits) 1269 NewSizeInBits = RoundedTo; 1270 } 1271 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1272 }) 1273 .legalIf([=](const LegalityQuery &Query) { 1274 const LLT &BigTy = Query.Types[BigTyIdx]; 1275 const LLT &LitTy = Query.Types[LitTyIdx]; 1276 1277 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1278 return false; 1279 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1280 return false; 1281 1282 return BigTy.getSizeInBits() % 16 == 0 && 1283 LitTy.getSizeInBits() % 16 == 0 && 1284 BigTy.getSizeInBits() <= 1024; 1285 }) 1286 // Any vectors left are the wrong size. Scalarize them. 1287 .scalarize(0) 1288 .scalarize(1); 1289 } 1290 1291 // S64 is only legal on the SALU, and needs to be broken into 32-bit elements in 1292 // RegBankSelect. 1293 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1294 .legalFor({{S32}, {S64}}); 1295 1296 if (ST.hasVOP3PInsts()) { 1297 SextInReg.lowerFor({{V2S16}}) 1298 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1299 // get more vector shift opportunities, since we'll get those when 1300 // expanded. 1301 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1302 } else if (ST.has16BitInsts()) { 1303 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1304 } else { 1305 // Prefer to promote to s32 before lowering if we don't have 16-bit 1306 // shifts. This avoids a lot of intermediate truncate and extend operations.
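// (the generic lowering turns G_SEXT_INREG %x, N into a shift-left /
// arithmetic-shift-right pair, so doing it at s32 keeps both shifts legal).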
1307 SextInReg.lowerFor({{S32}, {S64}}); 1308 } 1309 1310 SextInReg 1311 .scalarize(0) 1312 .clampScalar(0, S32, S64) 1313 .lower(); 1314 1315 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1316 .legalFor({S64}); 1317 1318 getActionDefinitionsBuilder({ 1319 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1320 G_FCOPYSIGN, 1321 1322 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1323 G_READ_REGISTER, 1324 G_WRITE_REGISTER, 1325 1326 G_SADDO, G_SSUBO, 1327 1328 // TODO: Implement 1329 G_FMINIMUM, G_FMAXIMUM 1330 }).lower(); 1331 1332 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1333 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1334 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1335 .unsupported(); 1336 1337 computeTables(); 1338 verify(*ST.getInstrInfo()); 1339 } 1340 1341 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1342 MachineRegisterInfo &MRI, 1343 MachineIRBuilder &B, 1344 GISelChangeObserver &Observer) const { 1345 switch (MI.getOpcode()) { 1346 case TargetOpcode::G_ADDRSPACE_CAST: 1347 return legalizeAddrSpaceCast(MI, MRI, B); 1348 case TargetOpcode::G_FRINT: 1349 return legalizeFrint(MI, MRI, B); 1350 case TargetOpcode::G_FCEIL: 1351 return legalizeFceil(MI, MRI, B); 1352 case TargetOpcode::G_INTRINSIC_TRUNC: 1353 return legalizeIntrinsicTrunc(MI, MRI, B); 1354 case TargetOpcode::G_SITOFP: 1355 return legalizeITOFP(MI, MRI, B, true); 1356 case TargetOpcode::G_UITOFP: 1357 return legalizeITOFP(MI, MRI, B, false); 1358 case TargetOpcode::G_FPTOSI: 1359 return legalizeFPTOI(MI, MRI, B, true); 1360 case TargetOpcode::G_FPTOUI: 1361 return legalizeFPTOI(MI, MRI, B, false); 1362 case TargetOpcode::G_FMINNUM: 1363 case TargetOpcode::G_FMAXNUM: 1364 case TargetOpcode::G_FMINNUM_IEEE: 1365 case TargetOpcode::G_FMAXNUM_IEEE: 1366 return legalizeMinNumMaxNum(MI, MRI, B); 1367 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1368 return legalizeExtractVectorElt(MI, MRI, B); 1369 case TargetOpcode::G_INSERT_VECTOR_ELT: 1370 return legalizeInsertVectorElt(MI, MRI, B); 1371 case TargetOpcode::G_SHUFFLE_VECTOR: 1372 return legalizeShuffleVector(MI, MRI, B); 1373 case TargetOpcode::G_FSIN: 1374 case TargetOpcode::G_FCOS: 1375 return legalizeSinCos(MI, MRI, B); 1376 case TargetOpcode::G_GLOBAL_VALUE: 1377 return legalizeGlobalValue(MI, MRI, B); 1378 case TargetOpcode::G_LOAD: 1379 return legalizeLoad(MI, MRI, B, Observer); 1380 case TargetOpcode::G_FMAD: 1381 return legalizeFMad(MI, MRI, B); 1382 case TargetOpcode::G_FDIV: 1383 return legalizeFDIV(MI, MRI, B); 1384 case TargetOpcode::G_UDIV: 1385 case TargetOpcode::G_UREM: 1386 return legalizeUDIV_UREM(MI, MRI, B); 1387 case TargetOpcode::G_SDIV: 1388 case TargetOpcode::G_SREM: 1389 return legalizeSDIV_SREM(MI, MRI, B); 1390 case TargetOpcode::G_ATOMIC_CMPXCHG: 1391 return legalizeAtomicCmpXChg(MI, MRI, B); 1392 case TargetOpcode::G_FLOG: 1393 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1394 case TargetOpcode::G_FLOG10: 1395 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1396 case TargetOpcode::G_FEXP: 1397 return legalizeFExp(MI, B); 1398 case TargetOpcode::G_FPOW: 1399 return legalizeFPow(MI, B); 1400 case TargetOpcode::G_FFLOOR: 1401 return legalizeFFloor(MI, MRI, B); 1402 case TargetOpcode::G_BUILD_VECTOR: 1403 return legalizeBuildVector(MI, MRI, B); 1404 default: 1405 return false; 1406 } 1407 1408 llvm_unreachable("expected switch to return"); 1409 } 1410 1411 Register AMDGPULegalizerInfo::getSegmentAperture( 1412 unsigned AS, 1413 MachineRegisterInfo &MRI, 1414 MachineIRBuilder &B) const { 
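// Produce a 32-bit register holding the aperture (the high half of the
// segment's base address in the flat address space) for the LDS or private
// segment, either via s_getreg on subtargets with aperture registers or by
// loading it from the queue pointer.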
1415 MachineFunction &MF = B.getMF(); 1416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1417 const LLT S32 = LLT::scalar(32); 1418 1419 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1420 1421 if (ST.hasApertureRegs()) { 1422 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1423 // getreg. 1424 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1425 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1426 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1427 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1428 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1429 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1430 unsigned Encoding = 1431 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1432 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1433 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1434 1435 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1436 1437 B.buildInstr(AMDGPU::S_GETREG_B32) 1438 .addDef(GetReg) 1439 .addImm(Encoding); 1440 MRI.setType(GetReg, S32); 1441 1442 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1443 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1444 } 1445 1446 Register QueuePtr = MRI.createGenericVirtualRegister( 1447 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1448 1449 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1450 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1451 return Register(); 1452 1453 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1454 // private_segment_aperture_base_hi. 1455 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1456 1457 // TODO: can we be smarter about machine pointer info? 1458 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1459 MachineMemOperand *MMO = MF.getMachineMemOperand( 1460 PtrInfo, 1461 MachineMemOperand::MOLoad | 1462 MachineMemOperand::MODereferenceable | 1463 MachineMemOperand::MOInvariant, 1464 4, 1465 MinAlign(64, StructOffset)); 1466 1467 Register LoadAddr; 1468 1469 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1470 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1471 } 1472 1473 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1474 MachineInstr &MI, MachineRegisterInfo &MRI, 1475 MachineIRBuilder &B) const { 1476 MachineFunction &MF = B.getMF(); 1477 1478 B.setInstr(MI); 1479 1480 const LLT S32 = LLT::scalar(32); 1481 Register Dst = MI.getOperand(0).getReg(); 1482 Register Src = MI.getOperand(1).getReg(); 1483 1484 LLT DstTy = MRI.getType(Dst); 1485 LLT SrcTy = MRI.getType(Src); 1486 unsigned DestAS = DstTy.getAddressSpace(); 1487 unsigned SrcAS = SrcTy.getAddressSpace(); 1488 1489 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1490 // vector element. 1491 assert(!DstTy.isVector()); 1492 1493 const AMDGPUTargetMachine &TM 1494 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1495 1496 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1497 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1498 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1499 return true; 1500 } 1501 1502 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1503 // Truncate. 
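// i.e. keep only the low 32 bits of the source pointer; the reverse cast
// below re-attaches the known high bits from get32BitAddressHighBits().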
1504 B.buildExtract(Dst, Src, 0); 1505 MI.eraseFromParent(); 1506 return true; 1507 } 1508 1509 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1511 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1512 1513 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1514 // another. Merge operands are required to be the same type, but creating an 1515 // extra ptrtoint would be kind of pointless. 1516 auto HighAddr = B.buildConstant( 1517 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1518 B.buildMerge(Dst, {Src, HighAddr}); 1519 MI.eraseFromParent(); 1520 return true; 1521 } 1522 1523 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1524 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1525 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1526 unsigned NullVal = TM.getNullPointerValue(DestAS); 1527 1528 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1529 auto FlatNull = B.buildConstant(SrcTy, 0); 1530 1531 // Extract low 32-bits of the pointer. 1532 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1533 1534 auto CmpRes = 1535 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1536 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1537 1538 MI.eraseFromParent(); 1539 return true; 1540 } 1541 1542 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1543 return false; 1544 1545 if (!ST.hasFlatAddressSpace()) 1546 return false; 1547 1548 auto SegmentNull = 1549 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1550 auto FlatNull = 1551 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1552 1553 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1554 if (!ApertureReg.isValid()) 1555 return false; 1556 1557 auto CmpRes = 1558 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1559 1560 // Coerce the type of the low half of the result so we can use merge_values. 1561 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1562 1563 // TODO: Should we allow mismatched types but matching sizes in merges to 1564 // avoid the ptrtoint? 1565 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1566 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1567 1568 MI.eraseFromParent(); 1569 return true; 1570 } 1571 1572 bool AMDGPULegalizerInfo::legalizeFrint( 1573 MachineInstr &MI, MachineRegisterInfo &MRI, 1574 MachineIRBuilder &B) const { 1575 B.setInstr(MI); 1576 1577 Register Src = MI.getOperand(1).getReg(); 1578 LLT Ty = MRI.getType(Src); 1579 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1580 1581 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1582 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1583 1584 auto C1 = B.buildFConstant(Ty, C1Val); 1585 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1586 1587 // TODO: Should this propagate fast-math-flags? 
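// Adding and subtracting copysign(2^52, src) rounds src to an integer in the
// current rounding mode; values with |src| > 0x1.fffffffffffffp+51 are already
// integral and are passed through unchanged by the select below.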
1588 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1589 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1590 1591 auto C2 = B.buildFConstant(Ty, C2Val); 1592 auto Fabs = B.buildFAbs(Ty, Src); 1593 1594 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1595 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1596 return true; 1597 } 1598 1599 bool AMDGPULegalizerInfo::legalizeFceil( 1600 MachineInstr &MI, MachineRegisterInfo &MRI, 1601 MachineIRBuilder &B) const { 1602 B.setInstr(MI); 1603 1604 const LLT S1 = LLT::scalar(1); 1605 const LLT S64 = LLT::scalar(64); 1606 1607 Register Src = MI.getOperand(1).getReg(); 1608 assert(MRI.getType(Src) == S64); 1609 1610 // result = trunc(src) 1611 // if (src > 0.0 && src != result) 1612 // result += 1.0 1613 1614 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1615 1616 const auto Zero = B.buildFConstant(S64, 0.0); 1617 const auto One = B.buildFConstant(S64, 1.0); 1618 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1619 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1620 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1621 auto Add = B.buildSelect(S64, And, One, Zero); 1622 1623 // TODO: Should this propagate fast-math-flags? 1624 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1625 return true; 1626 } 1627 1628 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1629 MachineIRBuilder &B) { 1630 const unsigned FractBits = 52; 1631 const unsigned ExpBits = 11; 1632 LLT S32 = LLT::scalar(32); 1633 1634 auto Const0 = B.buildConstant(S32, FractBits - 32); 1635 auto Const1 = B.buildConstant(S32, ExpBits); 1636 1637 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1638 .addUse(Const0.getReg(0)) 1639 .addUse(Const1.getReg(0)); 1640 1641 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1642 } 1643 1644 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1645 MachineInstr &MI, MachineRegisterInfo &MRI, 1646 MachineIRBuilder &B) const { 1647 B.setInstr(MI); 1648 1649 const LLT S1 = LLT::scalar(1); 1650 const LLT S32 = LLT::scalar(32); 1651 const LLT S64 = LLT::scalar(64); 1652 1653 Register Src = MI.getOperand(1).getReg(); 1654 assert(MRI.getType(Src) == S64); 1655 1656 // TODO: Should this use extract since the low half is unused? 1657 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1658 Register Hi = Unmerge.getReg(1); 1659 1660 // Extract the upper half, since this is where we will find the sign and 1661 // exponent. 1662 auto Exp = extractF64Exponent(Hi, B); 1663 1664 const unsigned FractBits = 52; 1665 1666 // Extract the sign bit. 1667 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1668 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1669 1670 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1671 1672 const auto Zero32 = B.buildConstant(S32, 0); 1673 1674 // Extend back to 64-bits. 
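// G_MERGE_VALUES takes the low part first, so the sign bit ends up in the
// high 32 bits of the 64-bit result.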
1675 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1676 1677 auto Shr = B.buildAShr(S64, FractMask, Exp); 1678 auto Not = B.buildNot(S64, Shr); 1679 auto Tmp0 = B.buildAnd(S64, Src, Not); 1680 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1681 1682 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1683 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1684 1685 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1686 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1687 return true; 1688 } 1689 1690 bool AMDGPULegalizerInfo::legalizeITOFP( 1691 MachineInstr &MI, MachineRegisterInfo &MRI, 1692 MachineIRBuilder &B, bool Signed) const { 1693 B.setInstr(MI); 1694 1695 Register Dst = MI.getOperand(0).getReg(); 1696 Register Src = MI.getOperand(1).getReg(); 1697 1698 const LLT S64 = LLT::scalar(64); 1699 const LLT S32 = LLT::scalar(32); 1700 1701 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1702 1703 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1704 1705 auto CvtHi = Signed ? 1706 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1707 B.buildUITOFP(S64, Unmerge.getReg(1)); 1708 1709 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1710 1711 auto ThirtyTwo = B.buildConstant(S32, 32); 1712 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1713 .addUse(CvtHi.getReg(0)) 1714 .addUse(ThirtyTwo.getReg(0)); 1715 1716 // TODO: Should this propagate fast-math-flags? 1717 B.buildFAdd(Dst, LdExp, CvtLo); 1718 MI.eraseFromParent(); 1719 return true; 1720 } 1721 1722 // TODO: Copied from DAG implementation. Verify logic and document how this 1723 // actually works. 1724 bool AMDGPULegalizerInfo::legalizeFPTOI( 1725 MachineInstr &MI, MachineRegisterInfo &MRI, 1726 MachineIRBuilder &B, bool Signed) const { 1727 B.setInstr(MI); 1728 1729 Register Dst = MI.getOperand(0).getReg(); 1730 Register Src = MI.getOperand(1).getReg(); 1731 1732 const LLT S64 = LLT::scalar(64); 1733 const LLT S32 = LLT::scalar(32); 1734 1735 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1736 1737 unsigned Flags = MI.getFlags(); 1738 1739 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1740 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1741 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1742 1743 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1744 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1745 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1746 1747 auto Hi = Signed ? 
1748 B.buildFPTOSI(S32, FloorMul) : 1749 B.buildFPTOUI(S32, FloorMul); 1750 auto Lo = B.buildFPTOUI(S32, Fma); 1751 1752 B.buildMerge(Dst, { Lo, Hi }); 1753 MI.eraseFromParent(); 1754 1755 return true; 1756 } 1757 1758 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1759 MachineInstr &MI, MachineRegisterInfo &MRI, 1760 MachineIRBuilder &B) const { 1761 MachineFunction &MF = B.getMF(); 1762 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1763 1764 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1765 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1766 1767 // With ieee_mode disabled, the instructions have the correct behavior 1768 // already for G_FMINNUM/G_FMAXNUM 1769 if (!MFI->getMode().IEEE) 1770 return !IsIEEEOp; 1771 1772 if (IsIEEEOp) 1773 return true; 1774 1775 MachineIRBuilder HelperBuilder(MI); 1776 GISelObserverWrapper DummyObserver; 1777 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1778 HelperBuilder.setInstr(MI); 1779 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1780 } 1781 1782 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1783 MachineInstr &MI, MachineRegisterInfo &MRI, 1784 MachineIRBuilder &B) const { 1785 // TODO: Should move some of this into LegalizerHelper. 1786 1787 // TODO: Promote dynamic indexing of s16 to s32 1788 1789 // FIXME: Artifact combiner probably should have replaced the truncated 1790 // constant before this, so we shouldn't need 1791 // getConstantVRegValWithLookThrough. 1792 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1793 MI.getOperand(2).getReg(), MRI); 1794 if (!IdxVal) // Dynamic case will be selected to register indexing. 1795 return true; 1796 1797 Register Dst = MI.getOperand(0).getReg(); 1798 Register Vec = MI.getOperand(1).getReg(); 1799 1800 LLT VecTy = MRI.getType(Vec); 1801 LLT EltTy = VecTy.getElementType(); 1802 assert(EltTy == MRI.getType(Dst)); 1803 1804 B.setInstr(MI); 1805 1806 if (IdxVal->Value < VecTy.getNumElements()) 1807 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1808 else 1809 B.buildUndef(Dst); 1810 1811 MI.eraseFromParent(); 1812 return true; 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 // TODO: Should move some of this into LegalizerHelper. 1819 1820 // TODO: Promote dynamic indexing of s16 to s32 1821 1822 // FIXME: Artifact combiner probably should have replaced the truncated 1823 // constant before this, so we shouldn't need 1824 // getConstantVRegValWithLookThrough. 1825 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1826 MI.getOperand(3).getReg(), MRI); 1827 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1828 return true; 1829 1830 Register Dst = MI.getOperand(0).getReg(); 1831 Register Vec = MI.getOperand(1).getReg(); 1832 Register Ins = MI.getOperand(2).getReg(); 1833 1834 LLT VecTy = MRI.getType(Vec); 1835 LLT EltTy = VecTy.getElementType(); 1836 assert(EltTy == MRI.getType(Ins)); 1837 1838 B.setInstr(MI); 1839 1840 if (IdxVal->Value < VecTy.getNumElements()) 1841 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1842 else 1843 B.buildUndef(Dst); 1844 1845 MI.eraseFromParent(); 1846 return true; 1847 } 1848 1849 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B) const { 1852 const LLT V2S16 = LLT::vector(2, 16); 1853 1854 Register Dst = MI.getOperand(0).getReg(); 1855 Register Src0 = MI.getOperand(1).getReg(); 1856 LLT DstTy = MRI.getType(Dst); 1857 LLT SrcTy = MRI.getType(Src0); 1858 1859 if (SrcTy == V2S16 && DstTy == V2S16 && 1860 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1861 return true; 1862 1863 MachineIRBuilder HelperBuilder(MI); 1864 GISelObserverWrapper DummyObserver; 1865 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1866 HelperBuilder.setInstr(MI); 1867 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1868 } 1869 1870 bool AMDGPULegalizerInfo::legalizeSinCos( 1871 MachineInstr &MI, MachineRegisterInfo &MRI, 1872 MachineIRBuilder &B) const { 1873 B.setInstr(MI); 1874 1875 Register DstReg = MI.getOperand(0).getReg(); 1876 Register SrcReg = MI.getOperand(1).getReg(); 1877 LLT Ty = MRI.getType(DstReg); 1878 unsigned Flags = MI.getFlags(); 1879 1880 Register TrigVal; 1881 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1882 if (ST.hasTrigReducedRange()) { 1883 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1884 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1885 .addUse(MulVal.getReg(0)) 1886 .setMIFlags(Flags).getReg(0); 1887 } else 1888 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1889 1890 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1891 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1892 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1893 .addUse(TrigVal) 1894 .setMIFlags(Flags); 1895 MI.eraseFromParent(); 1896 return true; 1897 } 1898 1899 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1900 Register DstReg, LLT PtrTy, 1901 MachineIRBuilder &B, const GlobalValue *GV, 1902 unsigned Offset, unsigned GAFlags) const { 1903 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1904 // to the following code sequence: 1905 // 1906 // For constant address space: 1907 // s_getpc_b64 s[0:1] 1908 // s_add_u32 s0, s0, $symbol 1909 // s_addc_u32 s1, s1, 0 1910 // 1911 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1912 // a fixup or relocation is emitted to replace $symbol with a literal 1913 // constant, which is a pc-relative offset from the encoding of the $symbol 1914 // operand to the global variable. 
1915 // 1916 // For global address space: 1917 // s_getpc_b64 s[0:1] 1918 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1919 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1920 // 1921 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1922 // fixups or relocations are emitted to replace $symbol@*@lo and 1923 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1924 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1925 // operand to the global variable. 1926 // 1927 // What we want here is an offset from the value returned by s_getpc 1928 // (which is the address of the s_add_u32 instruction) to the global 1929 // variable, but since the encoding of $symbol starts 4 bytes after the start 1930 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1931 // small. This requires us to add 4 to the global variable offset in order to 1932 // compute the correct address. 1933 1934 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1935 1936 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1937 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1938 1939 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1940 .addDef(PCReg); 1941 1942 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1943 if (GAFlags == SIInstrInfo::MO_NONE) 1944 MIB.addImm(0); 1945 else 1946 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1947 1948 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1949 1950 if (PtrTy.getSizeInBits() == 32) 1951 B.buildExtract(DstReg, PCReg, 0); 1952 return true; 1953 } 1954 1955 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1956 MachineInstr &MI, MachineRegisterInfo &MRI, 1957 MachineIRBuilder &B) const { 1958 Register DstReg = MI.getOperand(0).getReg(); 1959 LLT Ty = MRI.getType(DstReg); 1960 unsigned AS = Ty.getAddressSpace(); 1961 1962 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1963 MachineFunction &MF = B.getMF(); 1964 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1965 B.setInstr(MI); 1966 1967 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1968 if (!MFI->isEntryFunction()) { 1969 const Function &Fn = MF.getFunction(); 1970 DiagnosticInfoUnsupported BadLDSDecl( 1971 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1972 DS_Warning); 1973 Fn.getContext().diagnose(BadLDSDecl); 1974 1975 // We currently don't have a way to correctly allocate LDS objects that 1976 // aren't directly associated with a kernel. We do force inlining of 1977 // functions that use local objects. However, if these dead functions are 1978 // not eliminated, we don't want a compile time error. Just emit a warning 1979 // and a trap, since there should be no callable path here. 1980 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1981 B.buildUndef(DstReg); 1982 MI.eraseFromParent(); 1983 return true; 1984 } 1985 1986 // TODO: We could emit code to handle the initialization somewhere. 
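// An LDS global with no initializer is just a compile-time offset into the
// group segment: either leave the G_GLOBAL_VALUE in place with an absolute
// 32-bit symbol, or fold it to the offset returned by allocateLDSGlobal().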
1987 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1988 const SITargetLowering *TLI = ST.getTargetLowering(); 1989 if (!TLI->shouldUseLDSConstAddress(GV)) { 1990 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1991 return true; // Leave in place; 1992 } 1993 1994 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1995 MI.eraseFromParent(); 1996 return true; 1997 } 1998 1999 const Function &Fn = MF.getFunction(); 2000 DiagnosticInfoUnsupported BadInit( 2001 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2002 Fn.getContext().diagnose(BadInit); 2003 return true; 2004 } 2005 2006 const SITargetLowering *TLI = ST.getTargetLowering(); 2007 2008 if (TLI->shouldEmitFixup(GV)) { 2009 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 if (TLI->shouldEmitPCReloc(GV)) { 2015 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2016 MI.eraseFromParent(); 2017 return true; 2018 } 2019 2020 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2021 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2022 2023 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2024 MachinePointerInfo::getGOT(MF), 2025 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2026 MachineMemOperand::MOInvariant, 2027 8 /*Size*/, 8 /*Align*/); 2028 2029 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2030 2031 if (Ty.getSizeInBits() == 32) { 2032 // Truncate if this is a 32-bit constant adrdess. 2033 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2034 B.buildExtract(DstReg, Load, 0); 2035 } else 2036 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2037 2038 MI.eraseFromParent(); 2039 return true; 2040 } 2041 2042 bool AMDGPULegalizerInfo::legalizeLoad( 2043 MachineInstr &MI, MachineRegisterInfo &MRI, 2044 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2045 B.setInstr(MI); 2046 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2047 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2048 Observer.changingInstr(MI); 2049 MI.getOperand(1).setReg(Cast.getReg(0)); 2050 Observer.changedInstr(MI); 2051 return true; 2052 } 2053 2054 bool AMDGPULegalizerInfo::legalizeFMad( 2055 MachineInstr &MI, MachineRegisterInfo &MRI, 2056 MachineIRBuilder &B) const { 2057 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2058 assert(Ty.isScalar()); 2059 2060 MachineFunction &MF = B.getMF(); 2061 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2062 2063 // TODO: Always legal with future ftz flag. 2064 // FIXME: Do we need just output? 
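// G_FMAD is kept legal only when denormals are flushed for the result type,
// since the MAD/MAC instructions it maps to flush denormals anyway; otherwise
// it is expanded to fmul + fadd via lowerFMad below.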
2065 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2066 return true; 2067 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2068 return true; 2069 2070 MachineIRBuilder HelperBuilder(MI); 2071 GISelObserverWrapper DummyObserver; 2072 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2073 HelperBuilder.setMBB(*MI.getParent()); 2074 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2075 } 2076 2077 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2078 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2079 Register DstReg = MI.getOperand(0).getReg(); 2080 Register PtrReg = MI.getOperand(1).getReg(); 2081 Register CmpVal = MI.getOperand(2).getReg(); 2082 Register NewVal = MI.getOperand(3).getReg(); 2083 2084 assert(SITargetLowering::isFlatGlobalAddrSpace( 2085 MRI.getType(PtrReg).getAddressSpace()) && 2086 "this should not have been custom lowered"); 2087 2088 LLT ValTy = MRI.getType(CmpVal); 2089 LLT VecTy = LLT::vector(2, ValTy); 2090 2091 B.setInstr(MI); 2092 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2093 2094 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2095 .addDef(DstReg) 2096 .addUse(PtrReg) 2097 .addUse(PackedVal) 2098 .setMemRefs(MI.memoperands()); 2099 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeFlog( 2105 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2106 Register Dst = MI.getOperand(0).getReg(); 2107 Register Src = MI.getOperand(1).getReg(); 2108 LLT Ty = B.getMRI()->getType(Dst); 2109 unsigned Flags = MI.getFlags(); 2110 B.setInstr(MI); 2111 2112 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2113 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2114 2115 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2121 MachineIRBuilder &B) const { 2122 Register Dst = MI.getOperand(0).getReg(); 2123 Register Src = MI.getOperand(1).getReg(); 2124 unsigned Flags = MI.getFlags(); 2125 LLT Ty = B.getMRI()->getType(Dst); 2126 B.setInstr(MI); 2127 2128 auto K = B.buildFConstant(Ty, numbers::log2e); 2129 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2130 B.buildFExp2(Dst, Mul, Flags); 2131 MI.eraseFromParent(); 2132 return true; 2133 } 2134 2135 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2136 MachineIRBuilder &B) const { 2137 Register Dst = MI.getOperand(0).getReg(); 2138 Register Src0 = MI.getOperand(1).getReg(); 2139 Register Src1 = MI.getOperand(2).getReg(); 2140 unsigned Flags = MI.getFlags(); 2141 LLT Ty = B.getMRI()->getType(Dst); 2142 B.setInstr(MI); 2143 const LLT S16 = LLT::scalar(16); 2144 const LLT S32 = LLT::scalar(32); 2145 2146 if (Ty == S32) { 2147 auto Log = B.buildFLog2(S32, Src0, Flags); 2148 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2149 .addUse(Log.getReg(0)) 2150 .addUse(Src1) 2151 .setMIFlags(Flags); 2152 B.buildFExp2(Dst, Mul, Flags); 2153 } else if (Ty == S16) { 2154 // There's no f16 fmul_legacy, so we need to convert for it. 
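// pow(x, y) is expanded as exp2(y * log2(x)); the multiply is done in f32
// through fmul_legacy and the product is truncated back to f16.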
2155 auto Log = B.buildFLog2(S16, Src0, Flags); 2156 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2157 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2158 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2159 .addUse(Ext0.getReg(0)) 2160 .addUse(Ext1.getReg(0)) 2161 .setMIFlags(Flags); 2162 2163 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2164 } else 2165 return false; 2166 2167 MI.eraseFromParent(); 2168 return true; 2169 } 2170 2171 // Find a source register, ignoring any possible source modifiers. 2172 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2173 Register ModSrc = OrigSrc; 2174 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2175 ModSrc = SrcFNeg->getOperand(1).getReg(); 2176 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2177 ModSrc = SrcFAbs->getOperand(1).getReg(); 2178 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2179 ModSrc = SrcFAbs->getOperand(1).getReg(); 2180 return ModSrc; 2181 } 2182 2183 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2184 MachineRegisterInfo &MRI, 2185 MachineIRBuilder &B) const { 2186 B.setInstr(MI); 2187 2188 const LLT S1 = LLT::scalar(1); 2189 const LLT S64 = LLT::scalar(64); 2190 Register Dst = MI.getOperand(0).getReg(); 2191 Register OrigSrc = MI.getOperand(1).getReg(); 2192 unsigned Flags = MI.getFlags(); 2193 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2194 "this should not have been custom lowered"); 2195 2196 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2197 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2198 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2199 // V_FRACT bug is: 2200 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2201 // 2202 // Convert floor(x) to (x - fract(x)) 2203 2204 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2205 .addUse(OrigSrc) 2206 .setMIFlags(Flags); 2207 2208 // Give source modifier matching some assistance before obscuring a foldable 2209 // pattern. 2210 2211 // TODO: We can avoid the neg on the fract? The input sign to fract 2212 // shouldn't matter? 2213 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2214 2215 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2216 2217 Register Min = MRI.createGenericVirtualRegister(S64); 2218 2219 // We don't need to concern ourselves with the snan handling difference, so 2220 // use the one which will directly select. 2221 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2222 if (MFI->getMode().IEEE) 2223 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2224 else 2225 B.buildFMinNum(Min, Fract, Const, Flags); 2226 2227 Register CorrectedFract = Min; 2228 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2229 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2230 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2231 } 2232 2233 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2234 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2235 2236 MI.eraseFromParent(); 2237 return true; 2238 } 2239 2240 // Turn an illegal packed v2s16 build vector into bit operations. 2241 // TODO: This should probably be a bitcast action in LegalizerHelper. 
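// A v2s16 G_BUILD_VECTOR (lo, hi) occupies one 32-bit register with lo in
// bits [15:0] and hi in bits [31:16], which is exactly what G_MERGE_VALUES
// of the two s16 halves produces, so a merge followed by a bitcast suffices.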
2242 bool AMDGPULegalizerInfo::legalizeBuildVector( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2244 Register Dst = MI.getOperand(0).getReg(); 2245 LLT DstTy = MRI.getType(Dst); 2246 const LLT S32 = LLT::scalar(32); 2247 const LLT V2S16 = LLT::vector(2, 16); 2248 (void)DstTy; 2249 (void)V2S16; 2250 assert(DstTy == V2S16); 2251 2252 Register Src0 = MI.getOperand(1).getReg(); 2253 Register Src1 = MI.getOperand(2).getReg(); 2254 assert(MRI.getType(Src0) == LLT::scalar(16)); 2255 2256 B.setInstr(MI); 2257 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2258 B.buildBitcast(Dst, Merge); 2259 2260 MI.eraseFromParent(); 2261 return true; 2262 } 2263 2264 // Return the use branch instruction, otherwise null if the usage is invalid. 2265 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2266 MachineRegisterInfo &MRI, 2267 MachineInstr *&Br) { 2268 Register CondDef = MI.getOperand(0).getReg(); 2269 if (!MRI.hasOneNonDBGUse(CondDef)) 2270 return nullptr; 2271 2272 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2273 if (UseMI.getParent() != MI.getParent() || 2274 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2275 return nullptr; 2276 2277 // Make sure the cond br is followed by a G_BR 2278 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2279 if (Next != MI.getParent()->end()) { 2280 if (Next->getOpcode() != AMDGPU::G_BR) 2281 return nullptr; 2282 Br = &*Next; 2283 } 2284 2285 return &UseMI; 2286 } 2287 2288 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2289 MachineRegisterInfo &MRI, 2290 Register LiveIn, 2291 Register PhyReg) const { 2292 assert(PhyReg.isPhysical() && "Physical register expected"); 2293 2294 // Insert the live-in copy, if required, by defining destination virtual 2295 // register. 2296 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2297 if (!MRI.getVRegDef(LiveIn)) {
2298 // FIXME: Should have scoped insert pt
2299 MachineBasicBlock &OrigInsBB = B.getMBB();
2300 auto OrigInsPt = B.getInsertPt();
2301 
2302 MachineBasicBlock &EntryMBB = B.getMF().front();
2303 EntryMBB.addLiveIn(PhyReg);
2304 B.setInsertPt(EntryMBB, EntryMBB.begin());
2305 B.buildCopy(LiveIn, PhyReg);
2306 
2307 B.setInsertPt(OrigInsBB, OrigInsPt);
2308 }
2309 
2310 return LiveIn;
2311 }
2312 
2313 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2314 MachineRegisterInfo &MRI,
2315 Register PhyReg, LLT Ty,
2316 bool InsertLiveInCopy) const {
2317 assert(PhyReg.isPhysical() && "Physical register expected");
2318 
2319 // Get or create the virtual live-in register
2320 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2321 if (!LiveIn) {
2322 LiveIn = MRI.createGenericVirtualRegister(Ty);
2323 MRI.addLiveIn(PhyReg, LiveIn);
2324 }
2325 
2326 // When the actual copy required goes from the virtual register to the
2327 // physical register (and is inserted later), the live-in copy from the
2328 // physical register to the virtual register is not needed
2329 if (!InsertLiveInCopy)
2330 return LiveIn;
2331 
2332 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2333 }
2334 
2335 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2336 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2337 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2338 const ArgDescriptor *Arg;
2339 const TargetRegisterClass *RC;
2340 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2341 if (!Arg) {
2342 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2343 return nullptr;
2344 }
2345 return Arg;
2346 }
2347 
2348 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2349 const ArgDescriptor *Arg) const {
2350 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2351 return false; // TODO: Handle these
2352 
2353 Register SrcReg = Arg->getRegister();
2354 assert(SrcReg.isPhysical() && "Physical register expected");
2355 assert(DstReg.isVirtual() && "Virtual register expected");
2356 
2357 MachineRegisterInfo &MRI = *B.getMRI();
2358 
2359 LLT Ty = MRI.getType(DstReg);
2360 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2361 
2362 if (Arg->isMasked()) {
2363 // TODO: Should we try to emit this once in the entry block?
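// A masked argument lives in a bit field of the physical register:
// Value = (Reg & Mask) >> Shift. Since the mask is a contiguous run of bits
// this is emitted as (Reg >> Shift) & (Mask >> Shift); e.g. a 10-bit field
// starting at bit 10 uses Shift = 10 and a final AND mask of 0x3ff.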
2364 const LLT S32 = LLT::scalar(32); 2365 const unsigned Mask = Arg->getMask(); 2366 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2367 2368 Register AndMaskSrc = LiveIn; 2369 2370 if (Shift != 0) { 2371 auto ShiftAmt = B.buildConstant(S32, Shift); 2372 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2373 } 2374 2375 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2376 } else { 2377 B.buildCopy(DstReg, LiveIn); 2378 } 2379 2380 return true; 2381 } 2382 2383 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2384 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2385 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2386 B.setInstr(MI); 2387 2388 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2389 if (!Arg) 2390 return false; 2391 2392 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2393 return false; 2394 2395 MI.eraseFromParent(); 2396 return true; 2397 } 2398 2399 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2400 MachineRegisterInfo &MRI, 2401 MachineIRBuilder &B) const { 2402 B.setInstr(MI); 2403 Register Dst = MI.getOperand(0).getReg(); 2404 LLT DstTy = MRI.getType(Dst); 2405 LLT S16 = LLT::scalar(16); 2406 LLT S32 = LLT::scalar(32); 2407 LLT S64 = LLT::scalar(64); 2408 2409 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2410 return true; 2411 2412 if (DstTy == S16) 2413 return legalizeFDIV16(MI, MRI, B); 2414 if (DstTy == S32) 2415 return legalizeFDIV32(MI, MRI, B); 2416 if (DstTy == S64) 2417 return legalizeFDIV64(MI, MRI, B); 2418 2419 return false; 2420 } 2421 2422 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2423 const LLT S32 = LLT::scalar(32); 2424 2425 auto Cvt0 = B.buildUITOFP(S32, Src); 2426 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2427 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2428 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2429 return B.buildFPTOUI(S32, Mul).getReg(0); 2430 } 2431 2432 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2433 Register DstReg, 2434 Register Num, 2435 Register Den, 2436 bool IsRem) const { 2437 const LLT S1 = LLT::scalar(1); 2438 const LLT S32 = LLT::scalar(32); 2439 2440 // RCP = URECIP(Den) = 2^32 / Den + e 2441 // e is rounding error. 2442 auto RCP = buildDivRCP(B, Den); 2443 2444 // RCP_LO = mul(RCP, Den) 2445 auto RCP_LO = B.buildMul(S32, RCP, Den); 2446 2447 // RCP_HI = mulhu (RCP, Den) */ 2448 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2449 2450 // NEG_RCP_LO = -RCP_LO 2451 auto Zero = B.buildConstant(S32, 0); 2452 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2453 2454 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2455 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2456 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2457 2458 // Calculate the rounding error from the URECIP instruction 2459 // E = mulhu(ABS_RCP_LO, RCP) 2460 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2461 2462 // RCP_A_E = RCP + E 2463 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2464 2465 // RCP_S_E = RCP - E 2466 auto RCP_S_E = B.buildSub(S32, RCP, E); 2467 2468 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2469 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2470 
2471 // Quotient = mulhu(Tmp0, Num)
2472 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2473 
2474 // Num_S_Remainder = Quotient * Den
2475 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2476 
2477 // Remainder = Num - Num_S_Remainder
2478 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2479 
2480 // Remainder_GE_Den = Remainder >= Den
2481 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2482 
2483 // Remainder_GE_Zero = Num >= Num_S_Remainder
2484 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2485 Num, Num_S_Remainder);
2486 
2487 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2488 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2489 
2490 // Calculate Division result:
2491 
2492 // Quotient_A_One = Quotient + 1
2493 auto One = B.buildConstant(S32, 1);
2494 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2495 
2496 // Quotient_S_One = Quotient - 1
2497 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2498 
2499 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2500 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2501 
2502 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2503 if (IsRem) {
2504 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2505 
2506 // Calculate Rem result:
2507 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2508 
2509 // Remainder_A_Den = Remainder + Den
2510 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2511 
2512 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2513 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2514 
2515 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2516 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2517 } else {
2518 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2519 }
2520 }
2521 
2522 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2523 MachineRegisterInfo &MRI,
2524 MachineIRBuilder &B) const {
2525 B.setInstr(MI);
2526 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2527 Register DstReg = MI.getOperand(0).getReg();
2528 Register Num = MI.getOperand(1).getReg();
2529 Register Den = MI.getOperand(2).getReg();
2530 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2531 MI.eraseFromParent();
2532 return true;
2533 }
2534 
2535 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2536 MachineRegisterInfo &MRI,
2537 MachineIRBuilder &B) const {
2538 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2539 return legalizeUDIV_UREM32(MI, MRI, B);
2540 return false;
2541 }
2542 
2543 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2544 MachineRegisterInfo &MRI,
2545 MachineIRBuilder &B) const {
2546 B.setInstr(MI);
2547 const LLT S32 = LLT::scalar(32);
2548 
2549 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2550 Register DstReg = MI.getOperand(0).getReg();
2551 Register LHS = MI.getOperand(1).getReg();
2552 Register RHS = MI.getOperand(2).getReg();
2553 
2554 auto ThirtyOne = B.buildConstant(S32, 31);
2555 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2556 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2557 
2558 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2559 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2560 
2561 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2562 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2563 
2564 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2565 legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2566 2567 if (IsRem) { 2568 auto RSign = LHSign; // Remainder sign is the same as LHS 2569 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2570 B.buildSub(DstReg, UDivRem, RSign); 2571 } else { 2572 auto DSign = B.buildXor(S32, LHSign, RHSign); 2573 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2574 B.buildSub(DstReg, UDivRem, DSign); 2575 } 2576 2577 MI.eraseFromParent(); 2578 return true; 2579 } 2580 2581 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2582 MachineRegisterInfo &MRI, 2583 MachineIRBuilder &B) const { 2584 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2585 return legalizeSDIV_SREM32(MI, MRI, B); 2586 return false; 2587 } 2588 2589 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2590 MachineRegisterInfo &MRI, 2591 MachineIRBuilder &B) const { 2592 Register Res = MI.getOperand(0).getReg(); 2593 Register LHS = MI.getOperand(1).getReg(); 2594 Register RHS = MI.getOperand(2).getReg(); 2595 2596 uint16_t Flags = MI.getFlags(); 2597 2598 LLT ResTy = MRI.getType(Res); 2599 LLT S32 = LLT::scalar(32); 2600 LLT S64 = LLT::scalar(64); 2601 2602 const MachineFunction &MF = B.getMF(); 2603 bool Unsafe = 2604 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2605 2606 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2607 return false; 2608 2609 if (!Unsafe && ResTy == S32 && 2610 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2611 return false; 2612 2613 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2614 // 1 / x -> RCP(x) 2615 if (CLHS->isExactlyValue(1.0)) { 2616 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2617 .addUse(RHS) 2618 .setMIFlags(Flags); 2619 2620 MI.eraseFromParent(); 2621 return true; 2622 } 2623 2624 // -1 / x -> RCP( FNEG(x) ) 2625 if (CLHS->isExactlyValue(-1.0)) { 2626 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2627 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2628 .addUse(FNeg.getReg(0)) 2629 .setMIFlags(Flags); 2630 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 } 2635 2636 // x / y -> x * (1.0 / y) 2637 if (Unsafe) { 2638 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2639 .addUse(RHS) 2640 .setMIFlags(Flags); 2641 B.buildFMul(Res, LHS, RCP, Flags); 2642 2643 MI.eraseFromParent(); 2644 return true; 2645 } 2646 2647 return false; 2648 } 2649 2650 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2651 MachineRegisterInfo &MRI, 2652 MachineIRBuilder &B) const { 2653 B.setInstr(MI); 2654 Register Res = MI.getOperand(0).getReg(); 2655 Register LHS = MI.getOperand(1).getReg(); 2656 Register RHS = MI.getOperand(2).getReg(); 2657 2658 uint16_t Flags = MI.getFlags(); 2659 2660 LLT S16 = LLT::scalar(16); 2661 LLT S32 = LLT::scalar(32); 2662 2663 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2664 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2665 2666 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2667 .addUse(RHSExt.getReg(0)) 2668 .setMIFlags(Flags); 2669 2670 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2671 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2672 2673 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2674 .addUse(RDst.getReg(0)) 2675 .addUse(RHS) 2676 .addUse(LHS) 2677 .setMIFlags(Flags); 2678 2679 MI.eraseFromParent(); 2680 return true; 2681 } 2682 2683 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2684 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
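// On subtargets with S_DENORM_MODE the whole 4-bit denorm field is rewritten,
// preserving the default FP64/FP16 half; otherwise only the two FP32 denorm
// bits of the MODE register (offset 4, width 2) are updated with S_SETREG.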
2685 static void toggleSPDenormMode(bool Enable, 2686 MachineIRBuilder &B, 2687 const GCNSubtarget &ST, 2688 AMDGPU::SIModeRegisterDefaults Mode) { 2689 // Set SP denorm mode to this value. 2690 unsigned SPDenormMode = 2691 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2692 2693 if (ST.hasDenormModeInst()) { 2694 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2695 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2696 2697 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2698 B.buildInstr(AMDGPU::S_DENORM_MODE) 2699 .addImm(NewDenormModeValue); 2700 2701 } else { 2702 // Select FP32 bit field in mode register. 2703 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2704 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2705 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2706 2707 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2708 .addImm(SPDenormMode) 2709 .addImm(SPDenormModeBitField); 2710 } 2711 } 2712 2713 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2714 MachineRegisterInfo &MRI, 2715 MachineIRBuilder &B) const { 2716 B.setInstr(MI); 2717 Register Res = MI.getOperand(0).getReg(); 2718 Register LHS = MI.getOperand(1).getReg(); 2719 Register RHS = MI.getOperand(2).getReg(); 2720 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2721 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2722 2723 uint16_t Flags = MI.getFlags(); 2724 2725 LLT S32 = LLT::scalar(32); 2726 LLT S1 = LLT::scalar(1); 2727 2728 auto One = B.buildFConstant(S32, 1.0f); 2729 2730 auto DenominatorScaled = 2731 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2732 .addUse(RHS) 2733 .addUse(LHS) 2734 .addImm(1) 2735 .setMIFlags(Flags); 2736 auto NumeratorScaled = 2737 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2738 .addUse(LHS) 2739 .addUse(RHS) 2740 .addImm(0) 2741 .setMIFlags(Flags); 2742 2743 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2744 .addUse(DenominatorScaled.getReg(0)) 2745 .setMIFlags(Flags); 2746 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2747 2748 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2749 // aren't modeled as reading it. 
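// The reciprocal refinement below (Fma0..Fma4) needs FP32 denormals preserved
// for accuracy, so denormal flushing is temporarily turned off around it when
// the current mode flushes. The sequence is a Newton-Raphson style refinement:
// e0 = fma(-d, r0, 1.0), r1 = fma(e0, r0, r0), q = n * r1, followed by one
// more fma-based correction before div_fmas/div_fixup finish the division.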
2750 if (!Mode.allFP32Denormals())
2751 toggleSPDenormMode(true, B, ST, Mode);
2752 
2753 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2754 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2755 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2756 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2757 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2758 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2759 
2760 if (!Mode.allFP32Denormals())
2761 toggleSPDenormMode(false, B, ST, Mode);
2762 
2763 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2764 .addUse(Fma4.getReg(0))
2765 .addUse(Fma1.getReg(0))
2766 .addUse(Fma3.getReg(0))
2767 .addUse(NumeratorScaled.getReg(1))
2768 .setMIFlags(Flags);
2769 
2770 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2771 .addUse(Fmas.getReg(0))
2772 .addUse(RHS)
2773 .addUse(LHS)
2774 .setMIFlags(Flags);
2775 
2776 MI.eraseFromParent();
2777 return true;
2778 }
2779 
2780 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2781 MachineRegisterInfo &MRI,
2782 MachineIRBuilder &B) const {
2783 B.setInstr(MI);
2784 Register Res = MI.getOperand(0).getReg();
2785 Register LHS = MI.getOperand(1).getReg();
2786 Register RHS = MI.getOperand(2).getReg();
2787 
2788 uint16_t Flags = MI.getFlags();
2789 
2790 LLT S64 = LLT::scalar(64);
2791 LLT S1 = LLT::scalar(1);
2792 
2793 auto One = B.buildFConstant(S64, 1.0);
2794 
2795 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2796 .addUse(LHS)
2797 .addUse(RHS)
2798 .addImm(1)
2799 .setMIFlags(Flags);
2800 
2801 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2802 
2803 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2804 .addUse(DivScale0.getReg(0))
2805 .setMIFlags(Flags);
2806 
2807 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2808 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2809 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2810 
2811 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2812 .addUse(LHS)
2813 .addUse(RHS)
2814 .addImm(0)
2815 .setMIFlags(Flags);
2816 
2817 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2818 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2819 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2820 
2821 Register Scale;
2822 if (!ST.hasUsableDivScaleConditionOutput()) {
2823 // Workaround a hardware bug on SI where the condition output from div_scale
2824 // is not usable.
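// Reconstruct the missing condition bit by hand: compare the high dwords
// (which contain the exponents) of the div_scale results against the
// original operands to detect whether the numerator or denominator was
// rescaled; div_fmas needs this to decide whether to apply the final scale.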
2825 2826 LLT S32 = LLT::scalar(32); 2827 2828 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2829 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2830 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2831 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2832 2833 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2834 Scale1Unmerge.getReg(1)); 2835 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2836 Scale0Unmerge.getReg(1)); 2837 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2838 } else { 2839 Scale = DivScale1.getReg(1); 2840 } 2841 2842 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2843 .addUse(Fma4.getReg(0)) 2844 .addUse(Fma3.getReg(0)) 2845 .addUse(Mul.getReg(0)) 2846 .addUse(Scale) 2847 .setMIFlags(Flags); 2848 2849 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2850 .addUse(Fmas.getReg(0)) 2851 .addUse(RHS) 2852 .addUse(LHS) 2853 .setMIFlags(Flags); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2860 MachineRegisterInfo &MRI, 2861 MachineIRBuilder &B) const { 2862 B.setInstr(MI); 2863 Register Res = MI.getOperand(0).getReg(); 2864 Register LHS = MI.getOperand(2).getReg(); 2865 Register RHS = MI.getOperand(3).getReg(); 2866 uint16_t Flags = MI.getFlags(); 2867 2868 LLT S32 = LLT::scalar(32); 2869 LLT S1 = LLT::scalar(1); 2870 2871 auto Abs = B.buildFAbs(S32, RHS, Flags); 2872 const APFloat C0Val(1.0f); 2873 2874 auto C0 = B.buildConstant(S32, 0x6f800000); 2875 auto C1 = B.buildConstant(S32, 0x2f800000); 2876 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2877 2878 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2879 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2880 2881 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2882 2883 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2884 .addUse(Mul0.getReg(0)) 2885 .setMIFlags(Flags); 2886 2887 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2888 2889 B.buildFMul(Res, Sel, Mul1, Flags); 2890 2891 MI.eraseFromParent(); 2892 return true; 2893 } 2894 2895 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2896 MachineRegisterInfo &MRI, 2897 MachineIRBuilder &B) const { 2898 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2899 if (!MFI->isEntryFunction()) { 2900 return legalizePreloadedArgIntrin(MI, MRI, B, 2901 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2902 } 2903 2904 B.setInstr(MI); 2905 2906 uint64_t Offset = 2907 ST.getTargetLowering()->getImplicitParameterOffset( 2908 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2909 Register DstReg = MI.getOperand(0).getReg(); 2910 LLT DstTy = MRI.getType(DstReg); 2911 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2912 2913 const ArgDescriptor *Arg; 2914 const TargetRegisterClass *RC; 2915 std::tie(Arg, RC) 2916 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2917 if (!Arg) 2918 return false; 2919 2920 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2921 if (!loadInputValue(KernargPtrReg, B, Arg)) 2922 return false; 2923 2924 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2925 MI.eraseFromParent(); 2926 return true; 2927 } 2928 2929 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2930 MachineRegisterInfo &MRI, 2931 MachineIRBuilder &B, 2932 unsigned AddrSpace) const { 2933 B.setInstr(MI); 2934 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2935 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2936 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2937 MI.eraseFromParent(); 2938 return true; 2939 } 2940 2941 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2942 // offset (the offset that is included in bounds checking and swizzling, to be 2943 // split between the instruction's voffset and immoffset fields) and soffset 2944 // (the offset that is excluded from bounds checking and swizzling, to go in 2945 // the instruction's soffset field). This function takes the first kind of 2946 // offset and figures out how to split it between voffset and immoffset. 2947 std::tuple<Register, unsigned, unsigned> 2948 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2949 Register OrigOffset) const { 2950 const unsigned MaxImm = 4095; 2951 Register BaseReg; 2952 unsigned TotalConstOffset; 2953 MachineInstr *OffsetDef; 2954 const LLT S32 = LLT::scalar(32); 2955 2956 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2957 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2958 2959 unsigned ImmOffset = TotalConstOffset; 2960 2961 // If the immediate value is too big for the immoffset field, put the value 2962 // and -4096 into the immoffset field so that the value that is copied/added 2963 // for the voffset field is a multiple of 4096, and it stands more chance 2964 // of being CSEd with the copy/add for another similar load/store. 2965 // However, do not do that rounding down to a multiple of 4096 if that is a 2966 // negative number, as it appears to be illegal to have a negative offset 2967 // in the vgpr, even if adding the immediate offset makes it positive. 2968 unsigned Overflow = ImmOffset & ~MaxImm; 2969 ImmOffset -= Overflow; 2970 if ((int32_t)Overflow < 0) { 2971 Overflow += ImmOffset; 2972 ImmOffset = 0; 2973 } 2974 2975 if (Overflow != 0) { 2976 if (!BaseReg) { 2977 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2978 } else { 2979 auto OverflowVal = B.buildConstant(S32, Overflow); 2980 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2981 } 2982 } 2983 2984 if (!BaseReg) 2985 BaseReg = B.buildConstant(S32, 0).getReg(0); 2986 2987 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2988 } 2989 2990 /// Handle register layout difference for f16 images for some subtargets. 2991 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2992 MachineRegisterInfo &MRI, 2993 Register Reg) const { 2994 if (!ST.hasUnpackedD16VMem()) 2995 return Reg; 2996 2997 const LLT S16 = LLT::scalar(16); 2998 const LLT S32 = LLT::scalar(32); 2999 LLT StoreVT = MRI.getType(Reg); 3000 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3001 3002 auto Unmerge = B.buildUnmerge(S16, Reg); 3003 3004 SmallVector<Register, 4> WideRegs; 3005 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3006 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3007 3008 int NumElts = StoreVT.getNumElements(); 3009 3010 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3011 } 3012 3013 Register AMDGPULegalizerInfo::fixStoreSourceType( 3014 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3015 MachineRegisterInfo *MRI = B.getMRI(); 3016 LLT Ty = MRI->getType(VData); 3017 3018 const LLT S16 = LLT::scalar(16); 3019 3020 // Fixup illegal register types for i8 stores. 
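// Sub-dword scalar stores are emitted as 32-bit buffer stores of an
// any-extended value; the memory operand still carries the real 1 or 2 byte
// store size, which selects BUFFER_STORE_BYTE/SHORT later on.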
3021 if (Ty == LLT::scalar(8) || Ty == S16) { 3022 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3023 return AnyExt; 3024 } 3025 3026 if (Ty.isVector()) { 3027 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3028 if (IsFormat) 3029 return handleD16VData(B, *MRI, VData); 3030 } 3031 } 3032 3033 return VData; 3034 } 3035 3036 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3037 MachineRegisterInfo &MRI, 3038 MachineIRBuilder &B, 3039 bool IsTyped, 3040 bool IsFormat) const { 3041 B.setInstr(MI); 3042 3043 Register VData = MI.getOperand(1).getReg(); 3044 LLT Ty = MRI.getType(VData); 3045 LLT EltTy = Ty.getScalarType(); 3046 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3047 const LLT S32 = LLT::scalar(32); 3048 3049 VData = fixStoreSourceType(B, VData, IsFormat); 3050 Register RSrc = MI.getOperand(2).getReg(); 3051 3052 MachineMemOperand *MMO = *MI.memoperands_begin(); 3053 const int MemSize = MMO->getSize(); 3054 3055 unsigned ImmOffset; 3056 unsigned TotalOffset; 3057 3058 // The typed intrinsics add an immediate after the registers. 3059 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3060 3061 // The struct intrinsic variants add one additional operand over raw. 3062 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3063 Register VIndex; 3064 int OpOffset = 0; 3065 if (HasVIndex) { 3066 VIndex = MI.getOperand(3).getReg(); 3067 OpOffset = 1; 3068 } 3069 3070 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3071 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3072 3073 unsigned Format = 0; 3074 if (IsTyped) { 3075 Format = MI.getOperand(5 + OpOffset).getImm(); 3076 ++OpOffset; 3077 } 3078 3079 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3080 3081 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3082 if (TotalOffset != 0) 3083 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3084 3085 unsigned Opc; 3086 if (IsTyped) { 3087 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3088 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3089 } else if (IsFormat) { 3090 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3091 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3092 } else { 3093 switch (MemSize) { 3094 case 1: 3095 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3096 break; 3097 case 2: 3098 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3099 break; 3100 default: 3101 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3102 break; 3103 } 3104 } 3105 3106 if (!VIndex) 3107 VIndex = B.buildConstant(S32, 0).getReg(0); 3108 3109 auto MIB = B.buildInstr(Opc) 3110 .addUse(VData) // vdata 3111 .addUse(RSrc) // rsrc 3112 .addUse(VIndex) // vindex 3113 .addUse(VOffset) // voffset 3114 .addUse(SOffset) // soffset 3115 .addImm(ImmOffset); // offset(imm) 3116 3117 if (IsTyped) 3118 MIB.addImm(Format); 3119 3120 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3121 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3122 .addMemOperand(MMO); 3123 3124 MI.eraseFromParent(); 3125 return true; 3126 } 3127 3128 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3129 MachineRegisterInfo &MRI, 3130 MachineIRBuilder &B, 3131 bool IsFormat, 3132 bool IsTyped) const { 3133 B.setInstr(MI); 3134 3135 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
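// Operand layout is: dst, intrinsic ID, rsrc, [vindex], voffset, soffset,
// [format], aux. The struct variants carry the extra vindex operand, which
// is detected below purely by the total operand count.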
3136 MachineMemOperand *MMO = *MI.memoperands_begin(); 3137 const int MemSize = MMO->getSize(); 3138 const LLT S32 = LLT::scalar(32); 3139 3140 Register Dst = MI.getOperand(0).getReg(); 3141 Register RSrc = MI.getOperand(2).getReg(); 3142 3143 // The typed intrinsics add an immediate after the registers. 3144 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3145 3146 // The struct intrinsic variants add one additional operand over raw. 3147 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3148 Register VIndex; 3149 int OpOffset = 0; 3150 if (HasVIndex) { 3151 VIndex = MI.getOperand(3).getReg(); 3152 OpOffset = 1; 3153 } 3154 3155 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3156 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3157 3158 unsigned Format = 0; 3159 if (IsTyped) { 3160 Format = MI.getOperand(5 + OpOffset).getImm(); 3161 ++OpOffset; 3162 } 3163 3164 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3165 unsigned ImmOffset; 3166 unsigned TotalOffset; 3167 3168 LLT Ty = MRI.getType(Dst); 3169 LLT EltTy = Ty.getScalarType(); 3170 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3171 const bool Unpacked = ST.hasUnpackedD16VMem(); 3172 3173 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3174 if (TotalOffset != 0) 3175 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3176 3177 unsigned Opc; 3178 3179 if (IsTyped) { 3180 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3181 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3182 } else if (IsFormat) { 3183 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3184 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3185 } else { 3186 switch (MemSize) { 3187 case 1: 3188 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3189 break; 3190 case 2: 3191 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3192 break; 3193 default: 3194 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3195 break; 3196 } 3197 } 3198 3199 Register LoadDstReg; 3200 3201 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3202 LLT UnpackedTy = Ty.changeElementSize(32); 3203 3204 if (IsExtLoad) 3205 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3206 else if (Unpacked && IsD16 && Ty.isVector()) 3207 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3208 else 3209 LoadDstReg = Dst; 3210 3211 if (!VIndex) 3212 VIndex = B.buildConstant(S32, 0).getReg(0); 3213 3214 auto MIB = B.buildInstr(Opc) 3215 .addDef(LoadDstReg) // vdata 3216 .addUse(RSrc) // rsrc 3217 .addUse(VIndex) // vindex 3218 .addUse(VOffset) // voffset 3219 .addUse(SOffset) // soffset 3220 .addImm(ImmOffset); // offset(imm) 3221 3222 if (IsTyped) 3223 MIB.addImm(Format); 3224 3225 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3226 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3227 .addMemOperand(MMO); 3228 3229 if (LoadDstReg != Dst) { 3230 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3231 3232 // Widen result for extending loads was widened. 
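// (The load was widened to a 32-bit result, so truncate back to the original
// narrow type; the unpacked D16 vector case below instead repacks the 32-bit
// pieces into the 16-bit element vector.)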
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Produce a vector of s16 elements from s32 pieces.
static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
                             ArrayRef<Register> UnmergeParts) {
  const LLT S16 = LLT::scalar(16);

  SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
  for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
    RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

/// Convert a set of s32 registers to a result vector with s16 elements.
static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
                               ArrayRef<Register> UnmergeParts) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT V2S16 = LLT::vector(2, 16);
  LLT TargetTy = MRI.getType(DstReg);
  int NumElts = UnmergeParts.size();

  if (NumElts == 1) {
    assert(TargetTy == V2S16);
    B.buildBitcast(DstReg, UnmergeParts[0]);
    return;
  }

  SmallVector<Register, 4> RemergeParts(NumElts);
  for (int I = 0; I != NumElts; ++I)
    RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);

  if (TargetTy.getSizeInBits() == 32u * NumElts) {
    B.buildConcatVectors(DstReg, RemergeParts);
    return;
  }

  const LLT V3S16 = LLT::vector(3, 16);
  const LLT V6S16 = LLT::vector(6, 16);

  // Widen to v6s16 and unpack v3 parts.
  assert(TargetTy == V3S16);

  RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
  auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
  B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
}

// FIXME: A plain vector trunc should be sufficient, but legalization is
// currently broken.
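// For instance, an unpacked d16 image load with a <4 x s16> result is widened
// to <4 x s32>; this helper unmerges the four s32 pieces, truncates each to
// s16, and rebuilds the original <4 x s16> destination.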
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
                                  Register WideDstReg) {
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  auto Unmerge = B.buildUnmerge(S32, WideDstReg);

  int NumOps = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> RemergeParts(NumOps);
  for (int I = 0; I != NumOps; ++I)
    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  bool IsTFE = MI.getNumExplicitDefs() == 2;

  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Need to handle a16 images too
  // TODO: Do we need to guard against already legalized intrinsics?
  if (!IsTFE && !ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
    return true;

  B.setInstr(MI);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) { // No TFE for stores?
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (IsTFE) {
    // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
    Register Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.

    // The raw dword aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
    LLT RoundedTy;
    LLT TFETy;

    if (IsD16 && ST.hasUnpackedD16VMem()) {
      RoundedTy = LLT::scalarOrVector(NumElts, 32);
      TFETy = LLT::vector(NumElts + 1, 32);
    } else {
      unsigned EltSize = Ty.getScalarSizeInBits();
      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
      unsigned RoundedSize = 32 * RoundedElts;
      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    }

    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
    Observer.changingInstr(MI);

    MI.getOperand(0).setReg(TFEReg);
    MI.RemoveOperand(1);

    Observer.changedInstr(MI);

    // Insert after the instruction.
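    // (The unmerge/repack sequence built below has to consume the combined
    //  TFE result, so the insertion point is moved just past the rewritten
    //  image instruction.)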
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
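  // For example, a 96-bit result is rounded up to the next power of two here:
  // <3 x s32> becomes <4 x s32>, and s96 becomes s128.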
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert a
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
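  // The amdgcn.if/else/loop intrinsics are expected to feed a G_BRCOND
  // (possibly followed by an unconditional G_BR); verifyCFIntrinsic locates
  // that pattern so the pair can be rewritten into SI_IF/SI_ELSE/SI_LOOP.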
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
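      // (A zero constant of the result's pointer type stands in for the
      //  kernarg segment pointer, which non-kernel functions do not have.)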
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}