//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
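// For illustration (not exhaustive): v2s32, v4s32, s64, v2s16 and v4s16 all
// satisfy this predicate, while something like v3s8 does not.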
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
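  // Note that s1 is included below, so plain boolean logic on condition values
  // stays legal as-is; odd and over-wide vector cases are normalized by the
  // moreElements/fewerElements mutations.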
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
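  // The {s64, s64} case takes the custom path (legalizeITOFP below), which
  // converts the two 32-bit halves separately and recombines them with ldexp.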
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
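    // For example, a 96-bit access needs three 32-bit registers, which is only
    // acceptable when the subtarget has dwordx3 load/store; any other
    // non-power-of-2 register count forces a split.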
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
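        // e.g. on a subtarget without dwordx3 load/store, a sufficiently
        // aligned <3 x s32> global load can be widened to <4 x s32> instead of
        // being split (the exact condition is shouldWidenLoadResult above).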
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
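  // (This mirrors the G_ICMP rules above: uniform conditions produced in SCC
  // are promoted to s32, while divergent conditions live in VCC as s1.)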
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
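    // (G_SEXT_INREG is generically lowered to a shift-left/arithmetic
    // shift-right pair on the clamped type, hence the preference for s32 here.)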
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

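// Return the 32-bit aperture value (used as the high half of the 64-bit flat
// pointer) for the given LDS or private address space, read either from the
// hardware aperture registers or loaded from the amd_queue_t structure through
// the queue pointer.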
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
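    // Casting to the 32-bit constant address space keeps only the low 32 bits
    // of the pointer.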
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
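  // Classic rounding trick: adding and then subtracting 2^52 (carrying the
  // sign of the input) pushes the fractional bits out of the significand, so
  // Tmp2 below is Src rounded to an integer. The final select passes through
  // inputs whose magnitude is at least 2^52, which are already integral.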
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
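  // SignBit64 is a signed zero with the sign of the input; it becomes the
  // result below whenever the exponent is negative (i.e. |Src| < 1.0).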
1693 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1694 1695 auto Shr = B.buildAShr(S64, FractMask, Exp); 1696 auto Not = B.buildNot(S64, Shr); 1697 auto Tmp0 = B.buildAnd(S64, Src, Not); 1698 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1699 1700 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1701 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1702 1703 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1704 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1705 return true; 1706 } 1707 1708 bool AMDGPULegalizerInfo::legalizeITOFP( 1709 MachineInstr &MI, MachineRegisterInfo &MRI, 1710 MachineIRBuilder &B, bool Signed) const { 1711 B.setInstr(MI); 1712 1713 Register Dst = MI.getOperand(0).getReg(); 1714 Register Src = MI.getOperand(1).getReg(); 1715 1716 const LLT S64 = LLT::scalar(64); 1717 const LLT S32 = LLT::scalar(32); 1718 1719 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1720 1721 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1722 1723 auto CvtHi = Signed ? 1724 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1725 B.buildUITOFP(S64, Unmerge.getReg(1)); 1726 1727 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1728 1729 auto ThirtyTwo = B.buildConstant(S32, 32); 1730 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1731 .addUse(CvtHi.getReg(0)) 1732 .addUse(ThirtyTwo.getReg(0)); 1733 1734 // TODO: Should this propagate fast-math-flags? 1735 B.buildFAdd(Dst, LdExp, CvtLo); 1736 MI.eraseFromParent(); 1737 return true; 1738 } 1739 1740 // TODO: Copied from DAG implementation. Verify logic and document how this 1741 // actually works. 1742 bool AMDGPULegalizerInfo::legalizeFPTOI( 1743 MachineInstr &MI, MachineRegisterInfo &MRI, 1744 MachineIRBuilder &B, bool Signed) const { 1745 B.setInstr(MI); 1746 1747 Register Dst = MI.getOperand(0).getReg(); 1748 Register Src = MI.getOperand(1).getReg(); 1749 1750 const LLT S64 = LLT::scalar(64); 1751 const LLT S32 = LLT::scalar(32); 1752 1753 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1754 1755 unsigned Flags = MI.getFlags(); 1756 1757 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1758 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1759 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1760 1761 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1762 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1763 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1764 1765 auto Hi = Signed ? 
1766 B.buildFPTOSI(S32, FloorMul) : 1767 B.buildFPTOUI(S32, FloorMul); 1768 auto Lo = B.buildFPTOUI(S32, Fma); 1769 1770 B.buildMerge(Dst, { Lo, Hi }); 1771 MI.eraseFromParent(); 1772 1773 return true; 1774 } 1775 1776 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1777 MachineInstr &MI, MachineRegisterInfo &MRI, 1778 MachineIRBuilder &B) const { 1779 MachineFunction &MF = B.getMF(); 1780 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1781 1782 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1783 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1784 1785 // With ieee_mode disabled, the instructions have the correct behavior 1786 // already for G_FMINNUM/G_FMAXNUM 1787 if (!MFI->getMode().IEEE) 1788 return !IsIEEEOp; 1789 1790 if (IsIEEEOp) 1791 return true; 1792 1793 MachineIRBuilder HelperBuilder(MI); 1794 GISelObserverWrapper DummyObserver; 1795 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1796 HelperBuilder.setInstr(MI); 1797 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1798 } 1799 1800 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1801 MachineInstr &MI, MachineRegisterInfo &MRI, 1802 MachineIRBuilder &B) const { 1803 // TODO: Should move some of this into LegalizerHelper. 1804 1805 // TODO: Promote dynamic indexing of s16 to s32 1806 1807 // FIXME: Artifact combiner probably should have replaced the truncated 1808 // constant before this, so we shouldn't need 1809 // getConstantVRegValWithLookThrough. 1810 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1811 MI.getOperand(2).getReg(), MRI); 1812 if (!IdxVal) // Dynamic case will be selected to register indexing. 1813 return true; 1814 1815 Register Dst = MI.getOperand(0).getReg(); 1816 Register Vec = MI.getOperand(1).getReg(); 1817 1818 LLT VecTy = MRI.getType(Vec); 1819 LLT EltTy = VecTy.getElementType(); 1820 assert(EltTy == MRI.getType(Dst)); 1821 1822 B.setInstr(MI); 1823 1824 if (IdxVal->Value < VecTy.getNumElements()) 1825 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1826 else 1827 B.buildUndef(Dst); 1828 1829 MI.eraseFromParent(); 1830 return true; 1831 } 1832 1833 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1834 MachineInstr &MI, MachineRegisterInfo &MRI, 1835 MachineIRBuilder &B) const { 1836 // TODO: Should move some of this into LegalizerHelper. 1837 1838 // TODO: Promote dynamic indexing of s16 to s32 1839 1840 // FIXME: Artifact combiner probably should have replaced the truncated 1841 // constant before this, so we shouldn't need 1842 // getConstantVRegValWithLookThrough. 1843 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1844 MI.getOperand(3).getReg(), MRI); 1845 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1846 return true; 1847 1848 Register Dst = MI.getOperand(0).getReg(); 1849 Register Vec = MI.getOperand(1).getReg(); 1850 Register Ins = MI.getOperand(2).getReg(); 1851 1852 LLT VecTy = MRI.getType(Vec); 1853 LLT EltTy = VecTy.getElementType(); 1854 assert(EltTy == MRI.getType(Ins)); 1855 1856 B.setInstr(MI); 1857 1858 if (IdxVal->Value < VecTy.getNumElements()) 1859 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1860 else 1861 B.buildUndef(Dst); 1862 1863 MI.eraseFromParent(); 1864 return true; 1865 } 1866 1867 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1868 MachineInstr &MI, MachineRegisterInfo &MRI, 1869 MachineIRBuilder &B) const { 1870 const LLT V2S16 = LLT::vector(2, 16); 1871 1872 Register Dst = MI.getOperand(0).getReg(); 1873 Register Src0 = MI.getOperand(1).getReg(); 1874 LLT DstTy = MRI.getType(Dst); 1875 LLT SrcTy = MRI.getType(Src0); 1876 1877 if (SrcTy == V2S16 && DstTy == V2S16 && 1878 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1879 return true; 1880 1881 MachineIRBuilder HelperBuilder(MI); 1882 GISelObserverWrapper DummyObserver; 1883 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1884 HelperBuilder.setInstr(MI); 1885 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeSinCos( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 B.setInstr(MI); 1892 1893 Register DstReg = MI.getOperand(0).getReg(); 1894 Register SrcReg = MI.getOperand(1).getReg(); 1895 LLT Ty = MRI.getType(DstReg); 1896 unsigned Flags = MI.getFlags(); 1897 1898 Register TrigVal; 1899 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1900 if (ST.hasTrigReducedRange()) { 1901 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1902 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1903 .addUse(MulVal.getReg(0)) 1904 .setMIFlags(Flags).getReg(0); 1905 } else 1906 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1907 1908 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1909 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1910 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1911 .addUse(TrigVal) 1912 .setMIFlags(Flags); 1913 MI.eraseFromParent(); 1914 return true; 1915 } 1916 1917 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1918 Register DstReg, LLT PtrTy, 1919 MachineIRBuilder &B, const GlobalValue *GV, 1920 unsigned Offset, unsigned GAFlags) const { 1921 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1922 // to the following code sequence: 1923 // 1924 // For constant address space: 1925 // s_getpc_b64 s[0:1] 1926 // s_add_u32 s0, s0, $symbol 1927 // s_addc_u32 s1, s1, 0 1928 // 1929 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1930 // a fixup or relocation is emitted to replace $symbol with a literal 1931 // constant, which is a pc-relative offset from the encoding of the $symbol 1932 // operand to the global variable. 
1933 // 1934 // For global address space: 1935 // s_getpc_b64 s[0:1] 1936 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1937 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1938 // 1939 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1940 // fixups or relocations are emitted to replace $symbol@*@lo and 1941 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1942 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1943 // operand to the global variable. 1944 // 1945 // What we want here is an offset from the value returned by s_getpc 1946 // (which is the address of the s_add_u32 instruction) to the global 1947 // variable, but since the encoding of $symbol starts 4 bytes after the start 1948 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1949 // small. This requires us to add 4 to the global variable offset in order to 1950 // compute the correct address. 1951 1952 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1953 1954 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1955 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1956 1957 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1958 .addDef(PCReg); 1959 1960 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1961 if (GAFlags == SIInstrInfo::MO_NONE) 1962 MIB.addImm(0); 1963 else 1964 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1965 1966 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1967 1968 if (PtrTy.getSizeInBits() == 32) 1969 B.buildExtract(DstReg, PCReg, 0); 1970 return true; 1971 } 1972 1973 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1974 MachineInstr &MI, MachineRegisterInfo &MRI, 1975 MachineIRBuilder &B) const { 1976 Register DstReg = MI.getOperand(0).getReg(); 1977 LLT Ty = MRI.getType(DstReg); 1978 unsigned AS = Ty.getAddressSpace(); 1979 1980 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1981 MachineFunction &MF = B.getMF(); 1982 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1983 B.setInstr(MI); 1984 1985 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1986 if (!MFI->isEntryFunction()) { 1987 const Function &Fn = MF.getFunction(); 1988 DiagnosticInfoUnsupported BadLDSDecl( 1989 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1990 DS_Warning); 1991 Fn.getContext().diagnose(BadLDSDecl); 1992 1993 // We currently don't have a way to correctly allocate LDS objects that 1994 // aren't directly associated with a kernel. We do force inlining of 1995 // functions that use local objects. However, if these dead functions are 1996 // not eliminated, we don't want a compile time error. Just emit a warning 1997 // and a trap, since there should be no callable path here. 1998 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1999 B.buildUndef(DstReg); 2000 MI.eraseFromParent(); 2001 return true; 2002 } 2003 2004 // TODO: We could emit code to handle the initialization somewhere. 
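// An LDS global without a defined initializer is lowered below to a constant
// byte offset into the kernel's LDS allocation, e.g. (hypothetical layout):
//   @a = addrspace(3) global [64 x i32]  -> offset 0
//   @b = addrspace(3) global i32         -> offset 256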
2005 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2006 const SITargetLowering *TLI = ST.getTargetLowering();
2007 if (!TLI->shouldUseLDSConstAddress(GV)) {
2008 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2009 return true; // Leave in place.
2010 }
2011
2012 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2013 MI.eraseFromParent();
2014 return true;
2015 }
2016
2017 const Function &Fn = MF.getFunction();
2018 DiagnosticInfoUnsupported BadInit(
2019 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2020 Fn.getContext().diagnose(BadInit);
2021 return true;
2022 }
2023
2024 const SITargetLowering *TLI = ST.getTargetLowering();
2025
2026 if (TLI->shouldEmitFixup(GV)) {
2027 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2028 MI.eraseFromParent();
2029 return true;
2030 }
2031
2032 if (TLI->shouldEmitPCReloc(GV)) {
2033 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2034 MI.eraseFromParent();
2035 return true;
2036 }
2037
2038 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2039 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2040
2041 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2042 MachinePointerInfo::getGOT(MF),
2043 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2044 MachineMemOperand::MOInvariant,
2045 8 /*Size*/, Align(8));
2046
2047 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2048
2049 if (Ty.getSizeInBits() == 32) {
2050 // Truncate if this is a 32-bit constant address.
2051 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2052 B.buildExtract(DstReg, Load, 0);
2053 } else
2054 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2055
2056 MI.eraseFromParent();
2057 return true;
2058 }
2059
2060 bool AMDGPULegalizerInfo::legalizeLoad(
2061 MachineInstr &MI, MachineRegisterInfo &MRI,
2062 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2063 B.setInstr(MI);
2064 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2065 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2066 Observer.changingInstr(MI);
2067 MI.getOperand(1).setReg(Cast.getReg(0));
2068 Observer.changedInstr(MI);
2069 return true;
2070 }
2071
2072 bool AMDGPULegalizerInfo::legalizeFMad(
2073 MachineInstr &MI, MachineRegisterInfo &MRI,
2074 MachineIRBuilder &B) const {
2075 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2076 assert(Ty.isScalar());
2077
2078 MachineFunction &MF = B.getMF();
2079 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2080
2081 // TODO: Always legal with future ftz flag.
2082 // FIXME: Do we need just output?
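// G_FMAD only matches the hardware mad/mac behaviour when the corresponding
// denormals are flushed; otherwise it is expanded into separate fmul and fadd
// operations by the LegalizerHelper below.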
2083 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2084 return true; 2085 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2086 return true; 2087 2088 MachineIRBuilder HelperBuilder(MI); 2089 GISelObserverWrapper DummyObserver; 2090 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2091 HelperBuilder.setInstr(MI); 2092 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2093 } 2094 2095 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2096 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2097 Register DstReg = MI.getOperand(0).getReg(); 2098 Register PtrReg = MI.getOperand(1).getReg(); 2099 Register CmpVal = MI.getOperand(2).getReg(); 2100 Register NewVal = MI.getOperand(3).getReg(); 2101 2102 assert(SITargetLowering::isFlatGlobalAddrSpace( 2103 MRI.getType(PtrReg).getAddressSpace()) && 2104 "this should not have been custom lowered"); 2105 2106 LLT ValTy = MRI.getType(CmpVal); 2107 LLT VecTy = LLT::vector(2, ValTy); 2108 2109 B.setInstr(MI); 2110 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2111 2112 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2113 .addDef(DstReg) 2114 .addUse(PtrReg) 2115 .addUse(PackedVal) 2116 .setMemRefs(MI.memoperands()); 2117 2118 MI.eraseFromParent(); 2119 return true; 2120 } 2121 2122 bool AMDGPULegalizerInfo::legalizeFlog( 2123 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2124 Register Dst = MI.getOperand(0).getReg(); 2125 Register Src = MI.getOperand(1).getReg(); 2126 LLT Ty = B.getMRI()->getType(Dst); 2127 unsigned Flags = MI.getFlags(); 2128 B.setInstr(MI); 2129 2130 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2131 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2132 2133 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2139 MachineIRBuilder &B) const { 2140 Register Dst = MI.getOperand(0).getReg(); 2141 Register Src = MI.getOperand(1).getReg(); 2142 unsigned Flags = MI.getFlags(); 2143 LLT Ty = B.getMRI()->getType(Dst); 2144 B.setInstr(MI); 2145 2146 auto K = B.buildFConstant(Ty, numbers::log2e); 2147 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2148 B.buildFExp2(Dst, Mul, Flags); 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2154 MachineIRBuilder &B) const { 2155 Register Dst = MI.getOperand(0).getReg(); 2156 Register Src0 = MI.getOperand(1).getReg(); 2157 Register Src1 = MI.getOperand(2).getReg(); 2158 unsigned Flags = MI.getFlags(); 2159 LLT Ty = B.getMRI()->getType(Dst); 2160 B.setInstr(MI); 2161 const LLT S16 = LLT::scalar(16); 2162 const LLT S32 = LLT::scalar(32); 2163 2164 if (Ty == S32) { 2165 auto Log = B.buildFLog2(S32, Src0, Flags); 2166 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2167 .addUse(Log.getReg(0)) 2168 .addUse(Src1) 2169 .setMIFlags(Flags); 2170 B.buildFExp2(Dst, Mul, Flags); 2171 } else if (Ty == S16) { 2172 // There's no f16 fmul_legacy, so we need to convert for it. 
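// Same identity as the f32 path above, roughly pow(x, y) = exp2(y * log2(x)),
// with the multiply done in f32 through fmul_legacy and the result truncated
// back to f16.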
2173 auto Log = B.buildFLog2(S16, Src0, Flags); 2174 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2175 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2176 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2177 .addUse(Ext0.getReg(0)) 2178 .addUse(Ext1.getReg(0)) 2179 .setMIFlags(Flags); 2180 2181 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2182 } else 2183 return false; 2184 2185 MI.eraseFromParent(); 2186 return true; 2187 } 2188 2189 // Find a source register, ignoring any possible source modifiers. 2190 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2191 Register ModSrc = OrigSrc; 2192 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2193 ModSrc = SrcFNeg->getOperand(1).getReg(); 2194 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2195 ModSrc = SrcFAbs->getOperand(1).getReg(); 2196 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2197 ModSrc = SrcFAbs->getOperand(1).getReg(); 2198 return ModSrc; 2199 } 2200 2201 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2202 MachineRegisterInfo &MRI, 2203 MachineIRBuilder &B) const { 2204 B.setInstr(MI); 2205 2206 const LLT S1 = LLT::scalar(1); 2207 const LLT S64 = LLT::scalar(64); 2208 Register Dst = MI.getOperand(0).getReg(); 2209 Register OrigSrc = MI.getOperand(1).getReg(); 2210 unsigned Flags = MI.getFlags(); 2211 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2212 "this should not have been custom lowered"); 2213 2214 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2215 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2216 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2217 // V_FRACT bug is: 2218 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2219 // 2220 // Convert floor(x) to (x - fract(x)) 2221 2222 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2223 .addUse(OrigSrc) 2224 .setMIFlags(Flags); 2225 2226 // Give source modifier matching some assistance before obscuring a foldable 2227 // pattern. 2228 2229 // TODO: We can avoid the neg on the fract? The input sign to fract 2230 // shouldn't matter? 2231 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2232 2233 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2234 2235 Register Min = MRI.createGenericVirtualRegister(S64); 2236 2237 // We don't need to concern ourselves with the snan handling difference, so 2238 // use the one which will directly select. 2239 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2240 if (MFI->getMode().IEEE) 2241 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2242 else 2243 B.buildFMinNum(Min, Fract, Const, Flags); 2244 2245 Register CorrectedFract = Min; 2246 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2247 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2248 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2249 } 2250 2251 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2252 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2253 2254 MI.eraseFromParent(); 2255 return true; 2256 } 2257 2258 // Turn an illegal packed v2s16 build vector into bit operations. 2259 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2260 bool AMDGPULegalizerInfo::legalizeBuildVector( 2261 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2262 Register Dst = MI.getOperand(0).getReg(); 2263 const LLT S32 = LLT::scalar(32); 2264 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2265 2266 Register Src0 = MI.getOperand(1).getReg(); 2267 Register Src1 = MI.getOperand(2).getReg(); 2268 assert(MRI.getType(Src0) == LLT::scalar(16)); 2269 2270 B.setInstr(MI); 2271 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2272 B.buildBitcast(Dst, Merge); 2273 2274 MI.eraseFromParent(); 2275 return true; 2276 } 2277 2278 // Return the use branch instruction, otherwise null if the usage is invalid. 2279 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2280 MachineRegisterInfo &MRI, 2281 MachineInstr *&Br, 2282 MachineBasicBlock *&UncondBrTarget) { 2283 Register CondDef = MI.getOperand(0).getReg(); 2284 if (!MRI.hasOneNonDBGUse(CondDef)) 2285 return nullptr; 2286 2287 MachineBasicBlock *Parent = MI.getParent(); 2288 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2289 if (UseMI.getParent() != Parent || 2290 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2291 return nullptr; 2292 2293 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2294 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2295 if (Next == Parent->end()) { 2296 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2297 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2298 return nullptr; 2299 UncondBrTarget = &*NextMBB; 2300 } else { 2301 if (Next->getOpcode() != AMDGPU::G_BR) 2302 return nullptr; 2303 Br = &*Next; 2304 UncondBrTarget = Br->getOperand(0).getMBB(); 2305 } 2306 2307 return &UseMI; 2308 } 2309 2310 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2311 MachineRegisterInfo &MRI, 2312 Register LiveIn, 2313 Register PhyReg) const { 2314 assert(PhyReg.isPhysical() && "Physical register expected"); 2315 2316 // Insert the live-in copy, if required, by defining destination virtual 2317 // register. 2318 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
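// If the live-in virtual register has no definition yet, materialize the copy
// from the physical argument register at the top of the entry block, e.g.
// (illustrative MIR, register names are hypothetical):
//   bb.0.entry:
//     liveins: $sgpr8_sgpr9
//     %arg:_(p4) = COPY $sgpr8_sgpr9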
2319 if (!MRI.getVRegDef(LiveIn)) {
2320 // FIXME: Should have scoped insert pt
2321 MachineBasicBlock &OrigInsBB = B.getMBB();
2322 auto OrigInsPt = B.getInsertPt();
2323
2324 MachineBasicBlock &EntryMBB = B.getMF().front();
2325 EntryMBB.addLiveIn(PhyReg);
2326 B.setInsertPt(EntryMBB, EntryMBB.begin());
2327 B.buildCopy(LiveIn, PhyReg);
2328
2329 B.setInsertPt(OrigInsBB, OrigInsPt);
2330 }
2331
2332 return LiveIn;
2333 }
2334
2335 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2336 MachineRegisterInfo &MRI,
2337 Register PhyReg, LLT Ty,
2338 bool InsertLiveInCopy) const {
2339 assert(PhyReg.isPhysical() && "Physical register expected");
2340
2341 // Get or create the virtual live-in register.
2342 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2343 if (!LiveIn) {
2344 LiveIn = MRI.createGenericVirtualRegister(Ty);
2345 MRI.addLiveIn(PhyReg, LiveIn);
2346 }
2347
2348 // When the copy that is actually required goes from the virtual register to
2349 // a physical register (and will be inserted later), there is no need to
2350 // insert a live-in copy from the physical register to the virtual register.
2351 if (!InsertLiveInCopy)
2352 return LiveIn;
2353
2354 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2355 }
2356
2357 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2358 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2359 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2360 const ArgDescriptor *Arg;
2361 const TargetRegisterClass *RC;
2362 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2363 if (!Arg) {
2364 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2365 return nullptr;
2366 }
2367 return Arg;
2368 }
2369
2370 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2371 const ArgDescriptor *Arg) const {
2372 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2373 return false; // TODO: Handle these
2374
2375 Register SrcReg = Arg->getRegister();
2376 assert(SrcReg.isPhysical() && "Physical register expected");
2377 assert(DstReg.isVirtual() && "Virtual register expected");
2378
2379 MachineRegisterInfo &MRI = *B.getMRI();
2380
2381 LLT Ty = MRI.getType(DstReg);
2382 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2383
2384 if (Arg->isMasked()) {
2385 // TODO: Should we try to emit this once in the entry block?
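// Masked arguments are packed into a 32-bit register; the field is recovered
// as (reg & mask) >> shift, emitted below as a right shift followed by an and
// with the shifted mask. E.g. for a hypothetical field at bits [19:10]
// (mask 0x000ffc00):
//   %shifted = G_LSHR %livein, 10
//   %field   = G_AND %shifted, 0x3ff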
2386 const LLT S32 = LLT::scalar(32); 2387 const unsigned Mask = Arg->getMask(); 2388 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2389 2390 Register AndMaskSrc = LiveIn; 2391 2392 if (Shift != 0) { 2393 auto ShiftAmt = B.buildConstant(S32, Shift); 2394 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2395 } 2396 2397 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2398 } else { 2399 B.buildCopy(DstReg, LiveIn); 2400 } 2401 2402 return true; 2403 } 2404 2405 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2406 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2407 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2408 B.setInstr(MI); 2409 2410 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2411 if (!Arg) 2412 return false; 2413 2414 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2415 return false; 2416 2417 MI.eraseFromParent(); 2418 return true; 2419 } 2420 2421 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2422 MachineRegisterInfo &MRI, 2423 MachineIRBuilder &B) const { 2424 B.setInstr(MI); 2425 Register Dst = MI.getOperand(0).getReg(); 2426 LLT DstTy = MRI.getType(Dst); 2427 LLT S16 = LLT::scalar(16); 2428 LLT S32 = LLT::scalar(32); 2429 LLT S64 = LLT::scalar(64); 2430 2431 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2432 return true; 2433 2434 if (DstTy == S16) 2435 return legalizeFDIV16(MI, MRI, B); 2436 if (DstTy == S32) 2437 return legalizeFDIV32(MI, MRI, B); 2438 if (DstTy == S64) 2439 return legalizeFDIV64(MI, MRI, B); 2440 2441 return false; 2442 } 2443 2444 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2445 const LLT S32 = LLT::scalar(32); 2446 2447 auto Cvt0 = B.buildUITOFP(S32, Src); 2448 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2449 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2450 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2451 return B.buildFPTOUI(S32, Mul).getReg(0); 2452 } 2453 2454 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2455 Register DstReg, 2456 Register Num, 2457 Register Den, 2458 bool IsRem) const { 2459 const LLT S1 = LLT::scalar(1); 2460 const LLT S32 = LLT::scalar(32); 2461 2462 // RCP = URECIP(Den) = 2^32 / Den + e 2463 // e is rounding error. 2464 auto RCP = buildDivRCP(B, Den); 2465 2466 // RCP_LO = mul(RCP, Den) 2467 auto RCP_LO = B.buildMul(S32, RCP, Den); 2468 2469 // RCP_HI = mulhu (RCP, Den) */ 2470 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2471 2472 // NEG_RCP_LO = -RCP_LO 2473 auto Zero = B.buildConstant(S32, 0); 2474 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2475 2476 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2477 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2478 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2479 2480 // Calculate the rounding error from the URECIP instruction 2481 // E = mulhu(ABS_RCP_LO, RCP) 2482 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2483 2484 // RCP_A_E = RCP + E 2485 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2486 2487 // RCP_S_E = RCP - E 2488 auto RCP_S_E = B.buildSub(S32, RCP, E); 2489 2490 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2491 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2492
2493 // Quotient = mulhu(Tmp0, Num)
2494 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2495
2496 // Num_S_Remainder = Quotient * Den
2497 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2498
2499 // Remainder = Num - Num_S_Remainder
2500 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2501
2502 // Remainder_GE_Den = Remainder >= Den
2503 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2504
2505 // Remainder_GE_Zero = Num >= Num_S_Remainder
2506 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2507 Num, Num_S_Remainder);
2508
2509 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2510 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2511
2512 // Calculate Division result:
2513
2514 // Quotient_A_One = Quotient + 1
2515 auto One = B.buildConstant(S32, 1);
2516 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2517
2518 // Quotient_S_One = Quotient - 1
2519 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2520
2521 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2522 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2523
2524 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2525 if (IsRem) {
2526 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2527
2528 // Calculate Rem result:
2529 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2530
2531 // Remainder_A_Den = Remainder + Den
2532 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2533
2534 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2535 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2536
2537 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2538 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2539 } else {
2540 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2541 }
2542 }
2543
2544 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2545 MachineRegisterInfo &MRI,
2546 MachineIRBuilder &B) const {
2547 B.setInstr(MI);
2548 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2549 Register DstReg = MI.getOperand(0).getReg();
2550 Register Num = MI.getOperand(1).getReg();
2551 Register Den = MI.getOperand(2).getReg();
2552 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2553 MI.eraseFromParent();
2554 return true;
2555 }
2556
2557 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2558 //
2559 // Return lo, hi of result
2560 //
2561 // %cvt.lo = G_UITOFP Val.lo
2562 // %cvt.hi = G_UITOFP Val.hi
2563 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2564 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2565 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2566 // %mul2 = G_FMUL %mul1, 2**(-32)
2567 // %trunc = G_INTRINSIC_TRUNC %mul2
2568 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2569 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2570 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2571 Register Val) {
2572 const LLT S32 = LLT::scalar(32);
2573 auto Unmerge = B.buildUnmerge(S32, Val);
2574
2575 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2576 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2577
2578 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2579 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2580
2581 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2582 auto Mul1 =
2583 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2584
2585 // 2**(-32)
2586 auto Mul2 =
2587 B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2588 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2589 2590 // -(2**32) 2591 auto Mad2 = B.buildFMAD(S32, Trunc, 2592 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2593 2594 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2595 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2596 2597 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2598 } 2599 2600 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2601 MachineRegisterInfo &MRI, 2602 MachineIRBuilder &B) const { 2603 B.setInstr(MI); 2604 2605 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2606 const LLT S32 = LLT::scalar(32); 2607 const LLT S64 = LLT::scalar(64); 2608 const LLT S1 = LLT::scalar(1); 2609 Register Numer = MI.getOperand(1).getReg(); 2610 Register Denom = MI.getOperand(2).getReg(); 2611 Register RcpLo, RcpHi; 2612 2613 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2614 2615 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2616 2617 auto Zero64 = B.buildConstant(S64, 0); 2618 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2619 2620 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2621 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2622 2623 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2624 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2625 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2626 2627 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2628 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2629 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2630 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2631 2632 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2633 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2634 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2635 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2636 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2637 2638 auto Zero32 = B.buildConstant(S32, 0); 2639 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2640 auto Add2_HiC = 2641 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2642 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2643 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2644 2645 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2646 Register NumerLo = UnmergeNumer.getReg(0); 2647 Register NumerHi = UnmergeNumer.getReg(1); 2648 2649 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2650 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2651 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2652 Register Mul3_Lo = UnmergeMul3.getReg(0); 2653 Register Mul3_Hi = UnmergeMul3.getReg(1); 2654 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2655 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2656 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2657 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2658 2659 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2660 Register DenomLo = UnmergeDenom.getReg(0); 2661 Register DenomHi = UnmergeDenom.getReg(1); 2662 2663 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2664 auto C1 = B.buildSExt(S32, CmpHi); 2665 2666 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2667 auto C2 = B.buildSExt(S32, CmpLo); 2668 2669 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2670 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2671 2672 // TODO: Here and below portions of the code can be enclosed into if/endif. 
2673 // Currently control flow is unconditional and we have 4 selects after
2674 // potential endif to substitute PHIs.
2675
2676 // if C3 != 0 ...
2677 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2678 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2679 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2680 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2681
2682 auto One64 = B.buildConstant(S64, 1);
2683 auto Add3 = B.buildAdd(S64, MulHi3, One64);
2684
2685 auto C4 =
2686 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2687 auto C5 =
2688 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2689 auto C6 = B.buildSelect(
2690 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2691
2692 // if (C6 != 0)
2693 auto Add4 = B.buildAdd(S64, Add3, One64);
2694 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2695
2696 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2697 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2698 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2699
2700 // endif C6
2701 // endif C3
2702
2703 if (IsDiv) {
2704 auto Sel1 = B.buildSelect(
2705 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2706 B.buildSelect(MI.getOperand(0),
2707 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2708 } else {
2709 auto Sel2 = B.buildSelect(
2710 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2711 B.buildSelect(MI.getOperand(0),
2712 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2713 }
2714
2715 MI.eraseFromParent();
2716 return true;
2717 }
2718
2719 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2720 MachineRegisterInfo &MRI,
2721 MachineIRBuilder &B) const {
2722 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2723 if (Ty == LLT::scalar(32))
2724 return legalizeUDIV_UREM32(MI, MRI, B);
2725 if (Ty == LLT::scalar(64))
2726 return legalizeUDIV_UREM64(MI, MRI, B);
2727 return false;
2728 }
2729
2730 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2731 MachineRegisterInfo &MRI,
2732 MachineIRBuilder &B) const {
2733 B.setInstr(MI);
2734 const LLT S32 = LLT::scalar(32);
2735
2736 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2737 Register DstReg = MI.getOperand(0).getReg();
2738 Register LHS = MI.getOperand(1).getReg();
2739 Register RHS = MI.getOperand(2).getReg();
2740
2741 auto ThirtyOne = B.buildConstant(S32, 31);
2742 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2743 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2744
2745 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2746 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2747
2748 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2749 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2750
2751 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2752 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2753
2754 if (IsRem) {
2755 auto RSign = LHSign; // Remainder sign is the same as LHS
2756 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2757 B.buildSub(DstReg, UDivRem, RSign);
2758 } else {
2759 auto DSign = B.buildXor(S32, LHSign, RHSign);
2760 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2761 B.buildSub(DstReg, UDivRem, DSign);
2762 }
2763
2764 MI.eraseFromParent();
2765 return true;
2766 }
2767
2768 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2769 MachineRegisterInfo &MRI,
2770 MachineIRBuilder &B) const {
2771 if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2772 return legalizeSDIV_SREM32(MI, MRI, B); 2773 return false; 2774 } 2775 2776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2777 MachineRegisterInfo &MRI, 2778 MachineIRBuilder &B) const { 2779 Register Res = MI.getOperand(0).getReg(); 2780 Register LHS = MI.getOperand(1).getReg(); 2781 Register RHS = MI.getOperand(2).getReg(); 2782 2783 uint16_t Flags = MI.getFlags(); 2784 2785 LLT ResTy = MRI.getType(Res); 2786 LLT S32 = LLT::scalar(32); 2787 LLT S64 = LLT::scalar(64); 2788 2789 const MachineFunction &MF = B.getMF(); 2790 bool Unsafe = 2791 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2792 2793 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2794 return false; 2795 2796 if (!Unsafe && ResTy == S32 && 2797 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2798 return false; 2799 2800 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2801 // 1 / x -> RCP(x) 2802 if (CLHS->isExactlyValue(1.0)) { 2803 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2804 .addUse(RHS) 2805 .setMIFlags(Flags); 2806 2807 MI.eraseFromParent(); 2808 return true; 2809 } 2810 2811 // -1 / x -> RCP( FNEG(x) ) 2812 if (CLHS->isExactlyValue(-1.0)) { 2813 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2814 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2815 .addUse(FNeg.getReg(0)) 2816 .setMIFlags(Flags); 2817 2818 MI.eraseFromParent(); 2819 return true; 2820 } 2821 } 2822 2823 // x / y -> x * (1.0 / y) 2824 if (Unsafe) { 2825 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2826 .addUse(RHS) 2827 .setMIFlags(Flags); 2828 B.buildFMul(Res, LHS, RCP, Flags); 2829 2830 MI.eraseFromParent(); 2831 return true; 2832 } 2833 2834 return false; 2835 } 2836 2837 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2838 MachineRegisterInfo &MRI, 2839 MachineIRBuilder &B) const { 2840 B.setInstr(MI); 2841 Register Res = MI.getOperand(0).getReg(); 2842 Register LHS = MI.getOperand(1).getReg(); 2843 Register RHS = MI.getOperand(2).getReg(); 2844 2845 uint16_t Flags = MI.getFlags(); 2846 2847 LLT S16 = LLT::scalar(16); 2848 LLT S32 = LLT::scalar(32); 2849 2850 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2851 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2852 2853 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2854 .addUse(RHSExt.getReg(0)) 2855 .setMIFlags(Flags); 2856 2857 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2858 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2859 2860 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2861 .addUse(RDst.getReg(0)) 2862 .addUse(RHS) 2863 .addUse(LHS) 2864 .setMIFlags(Flags); 2865 2866 MI.eraseFromParent(); 2867 return true; 2868 } 2869 2870 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2871 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2872 static void toggleSPDenormMode(bool Enable, 2873 MachineIRBuilder &B, 2874 const GCNSubtarget &ST, 2875 AMDGPU::SIModeRegisterDefaults Mode) { 2876 // Set SP denorm mode to this value. 2877 unsigned SPDenormMode = 2878 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2879 2880 if (ST.hasDenormModeInst()) { 2881 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
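// The immediate built below packs the FP32 denorm field in bits [1:0] and the
// FP64/FP16 denorm field in bits [3:2]; e.g. FP32 = FP_DENORM_FLUSH_NONE (3)
// with the FP64/FP16 field left at 0 would give an immediate of 0x3.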
2882 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2883 2884 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2885 B.buildInstr(AMDGPU::S_DENORM_MODE) 2886 .addImm(NewDenormModeValue); 2887 2888 } else { 2889 // Select FP32 bit field in mode register. 2890 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2891 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2892 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2893 2894 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2895 .addImm(SPDenormMode) 2896 .addImm(SPDenormModeBitField); 2897 } 2898 } 2899 2900 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2901 MachineRegisterInfo &MRI, 2902 MachineIRBuilder &B) const { 2903 B.setInstr(MI); 2904 Register Res = MI.getOperand(0).getReg(); 2905 Register LHS = MI.getOperand(1).getReg(); 2906 Register RHS = MI.getOperand(2).getReg(); 2907 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2908 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2909 2910 uint16_t Flags = MI.getFlags(); 2911 2912 LLT S32 = LLT::scalar(32); 2913 LLT S1 = LLT::scalar(1); 2914 2915 auto One = B.buildFConstant(S32, 1.0f); 2916 2917 auto DenominatorScaled = 2918 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2919 .addUse(LHS) 2920 .addUse(RHS) 2921 .addImm(0) 2922 .setMIFlags(Flags); 2923 auto NumeratorScaled = 2924 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2925 .addUse(LHS) 2926 .addUse(RHS) 2927 .addImm(1) 2928 .setMIFlags(Flags); 2929 2930 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2931 .addUse(DenominatorScaled.getReg(0)) 2932 .setMIFlags(Flags); 2933 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2934 2935 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2936 // aren't modeled as reading it. 
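// The FMAs below implement the usual Newton-Raphson refinement around the
// hardware reciprocal, roughly:
//   e0 = 1 - den_scaled * rcp             (Fma0)
//   r1 = rcp + rcp * e0                   (Fma1)
//   q0 = num_scaled * r1                  (Mul)
//   e1 = num_scaled - den_scaled * q0     (Fma2)
//   q1 = q0 + r1 * e1                     (Fma3)
//   e2 = num_scaled - den_scaled * q1     (Fma4)
// div_fmas and div_fixup then combine these into the final rounded quotient.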
2937 if (!Mode.allFP32Denormals()) 2938 toggleSPDenormMode(true, B, ST, Mode); 2939 2940 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2941 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2942 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2943 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2944 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2945 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2946 2947 if (!Mode.allFP32Denormals()) 2948 toggleSPDenormMode(false, B, ST, Mode); 2949 2950 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2951 .addUse(Fma4.getReg(0)) 2952 .addUse(Fma1.getReg(0)) 2953 .addUse(Fma3.getReg(0)) 2954 .addUse(NumeratorScaled.getReg(1)) 2955 .setMIFlags(Flags); 2956 2957 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2958 .addUse(Fmas.getReg(0)) 2959 .addUse(RHS) 2960 .addUse(LHS) 2961 .setMIFlags(Flags); 2962 2963 MI.eraseFromParent(); 2964 return true; 2965 } 2966 2967 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2968 MachineRegisterInfo &MRI, 2969 MachineIRBuilder &B) const { 2970 B.setInstr(MI); 2971 Register Res = MI.getOperand(0).getReg(); 2972 Register LHS = MI.getOperand(1).getReg(); 2973 Register RHS = MI.getOperand(2).getReg(); 2974 2975 uint16_t Flags = MI.getFlags(); 2976 2977 LLT S64 = LLT::scalar(64); 2978 LLT S1 = LLT::scalar(1); 2979 2980 auto One = B.buildFConstant(S64, 1.0); 2981 2982 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2983 .addUse(LHS) 2984 .addUse(RHS) 2985 .addImm(0) 2986 .setMIFlags(Flags); 2987 2988 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2989 2990 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2991 .addUse(DivScale0.getReg(0)) 2992 .setMIFlags(Flags); 2993 2994 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2995 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2996 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2997 2998 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2999 .addUse(LHS) 3000 .addUse(RHS) 3001 .addImm(1) 3002 .setMIFlags(Flags); 3003 3004 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3005 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3006 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3007 3008 Register Scale; 3009 if (!ST.hasUsableDivScaleConditionOutput()) { 3010 // Workaround a hardware bug on SI where the condition output from div_scale 3011 // is not usable. 
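// Recompute the condition bit manually instead: compare the high halves of the
// numerator and denominator against the corresponding div_scale results and
// xor the two compares.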
3012 3013 LLT S32 = LLT::scalar(32); 3014 3015 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3016 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3017 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3018 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3019 3020 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3021 Scale1Unmerge.getReg(1)); 3022 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3023 Scale0Unmerge.getReg(1)); 3024 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3025 } else { 3026 Scale = DivScale1.getReg(1); 3027 } 3028 3029 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3030 .addUse(Fma4.getReg(0)) 3031 .addUse(Fma3.getReg(0)) 3032 .addUse(Mul.getReg(0)) 3033 .addUse(Scale) 3034 .setMIFlags(Flags); 3035 3036 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3037 .addUse(Fmas.getReg(0)) 3038 .addUse(RHS) 3039 .addUse(LHS) 3040 .setMIFlags(Flags); 3041 3042 MI.eraseFromParent(); 3043 return true; 3044 } 3045 3046 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3047 MachineRegisterInfo &MRI, 3048 MachineIRBuilder &B) const { 3049 B.setInstr(MI); 3050 Register Res = MI.getOperand(0).getReg(); 3051 Register LHS = MI.getOperand(2).getReg(); 3052 Register RHS = MI.getOperand(3).getReg(); 3053 uint16_t Flags = MI.getFlags(); 3054 3055 LLT S32 = LLT::scalar(32); 3056 LLT S1 = LLT::scalar(1); 3057 3058 auto Abs = B.buildFAbs(S32, RHS, Flags); 3059 const APFloat C0Val(1.0f); 3060 3061 auto C0 = B.buildConstant(S32, 0x6f800000); 3062 auto C1 = B.buildConstant(S32, 0x2f800000); 3063 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3064 3065 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3066 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3067 3068 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3069 3070 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3071 .addUse(Mul0.getReg(0)) 3072 .setMIFlags(Flags); 3073 3074 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3075 3076 B.buildFMul(Res, Sel, Mul1, Flags); 3077 3078 MI.eraseFromParent(); 3079 return true; 3080 } 3081 3082 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3083 MachineRegisterInfo &MRI, 3084 MachineIRBuilder &B) const { 3085 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3086 if (!MFI->isEntryFunction()) { 3087 return legalizePreloadedArgIntrin(MI, MRI, B, 3088 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3089 } 3090 3091 B.setInstr(MI); 3092 3093 uint64_t Offset = 3094 ST.getTargetLowering()->getImplicitParameterOffset( 3095 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3096 Register DstReg = MI.getOperand(0).getReg(); 3097 LLT DstTy = MRI.getType(DstReg); 3098 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3099 3100 const ArgDescriptor *Arg; 3101 const TargetRegisterClass *RC; 3102 std::tie(Arg, RC) 3103 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3104 if (!Arg) 3105 return false; 3106 3107 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3108 if (!loadInputValue(KernargPtrReg, B, Arg)) 3109 return false; 3110 3111 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3112 MI.eraseFromParent(); 3113 return true; 3114 } 3115 3116 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3117 MachineRegisterInfo &MRI, 3118 MachineIRBuilder &B, 3119 unsigned AddrSpace) const { 3120 B.setInstr(MI); 3121 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3122 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3123 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3124 MI.eraseFromParent(); 3125 return true; 3126 } 3127 3128 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3129 // offset (the offset that is included in bounds checking and swizzling, to be 3130 // split between the instruction's voffset and immoffset fields) and soffset 3131 // (the offset that is excluded from bounds checking and swizzling, to go in 3132 // the instruction's soffset field). This function takes the first kind of 3133 // offset and figures out how to split it between voffset and immoffset. 3134 std::tuple<Register, unsigned, unsigned> 3135 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3136 Register OrigOffset) const { 3137 const unsigned MaxImm = 4095; 3138 Register BaseReg; 3139 unsigned TotalConstOffset; 3140 MachineInstr *OffsetDef; 3141 const LLT S32 = LLT::scalar(32); 3142 3143 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3144 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3145 3146 unsigned ImmOffset = TotalConstOffset; 3147 3148 // If the immediate value is too big for the immoffset field, put the value 3149 // and -4096 into the immoffset field so that the value that is copied/added 3150 // for the voffset field is a multiple of 4096, and it stands more chance 3151 // of being CSEd with the copy/add for another similar load/store. 3152 // However, do not do that rounding down to a multiple of 4096 if that is a 3153 // negative number, as it appears to be illegal to have a negative offset 3154 // in the vgpr, even if adding the immediate offset makes it positive. 3155 unsigned Overflow = ImmOffset & ~MaxImm; 3156 ImmOffset -= Overflow; 3157 if ((int32_t)Overflow < 0) { 3158 Overflow += ImmOffset; 3159 ImmOffset = 0; 3160 } 3161 3162 if (Overflow != 0) { 3163 if (!BaseReg) { 3164 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3165 } else { 3166 auto OverflowVal = B.buildConstant(S32, Overflow); 3167 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3168 } 3169 } 3170 3171 if (!BaseReg) 3172 BaseReg = B.buildConstant(S32, 0).getReg(0); 3173 3174 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3175 } 3176 3177 /// Handle register layout difference for f16 images for some subtargets. 3178 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3179 MachineRegisterInfo &MRI, 3180 Register Reg) const { 3181 if (!ST.hasUnpackedD16VMem()) 3182 return Reg; 3183 3184 const LLT S16 = LLT::scalar(16); 3185 const LLT S32 = LLT::scalar(32); 3186 LLT StoreVT = MRI.getType(Reg); 3187 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3188 3189 auto Unmerge = B.buildUnmerge(S16, Reg); 3190 3191 SmallVector<Register, 4> WideRegs; 3192 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3193 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3194 3195 int NumElts = StoreVT.getNumElements(); 3196 3197 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3198 } 3199 3200 Register AMDGPULegalizerInfo::fixStoreSourceType( 3201 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3202 MachineRegisterInfo *MRI = B.getMRI(); 3203 LLT Ty = MRI->getType(VData); 3204 3205 const LLT S16 = LLT::scalar(16); 3206 3207 // Fixup illegal register types for i8 stores. 
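// Byte and short stores still take their data in a 32-bit register, so
// any-extend the value here; the MMO's memory size is what later selects the
// byte/short store opcode.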
3208 if (Ty == LLT::scalar(8) || Ty == S16) { 3209 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3210 return AnyExt; 3211 } 3212 3213 if (Ty.isVector()) { 3214 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3215 if (IsFormat) 3216 return handleD16VData(B, *MRI, VData); 3217 } 3218 } 3219 3220 return VData; 3221 } 3222 3223 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3224 MachineRegisterInfo &MRI, 3225 MachineIRBuilder &B, 3226 bool IsTyped, 3227 bool IsFormat) const { 3228 B.setInstr(MI); 3229 3230 Register VData = MI.getOperand(1).getReg(); 3231 LLT Ty = MRI.getType(VData); 3232 LLT EltTy = Ty.getScalarType(); 3233 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3234 const LLT S32 = LLT::scalar(32); 3235 3236 VData = fixStoreSourceType(B, VData, IsFormat); 3237 Register RSrc = MI.getOperand(2).getReg(); 3238 3239 MachineMemOperand *MMO = *MI.memoperands_begin(); 3240 const int MemSize = MMO->getSize(); 3241 3242 unsigned ImmOffset; 3243 unsigned TotalOffset; 3244 3245 // The typed intrinsics add an immediate after the registers. 3246 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3247 3248 // The struct intrinsic variants add one additional operand over raw. 3249 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3250 Register VIndex; 3251 int OpOffset = 0; 3252 if (HasVIndex) { 3253 VIndex = MI.getOperand(3).getReg(); 3254 OpOffset = 1; 3255 } 3256 3257 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3258 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3259 3260 unsigned Format = 0; 3261 if (IsTyped) { 3262 Format = MI.getOperand(5 + OpOffset).getImm(); 3263 ++OpOffset; 3264 } 3265 3266 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3267 3268 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3269 if (TotalOffset != 0) 3270 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3271 3272 unsigned Opc; 3273 if (IsTyped) { 3274 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3275 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3276 } else if (IsFormat) { 3277 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3278 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3279 } else { 3280 switch (MemSize) { 3281 case 1: 3282 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3283 break; 3284 case 2: 3285 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3286 break; 3287 default: 3288 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3289 break; 3290 } 3291 } 3292 3293 if (!VIndex) 3294 VIndex = B.buildConstant(S32, 0).getReg(0); 3295 3296 auto MIB = B.buildInstr(Opc) 3297 .addUse(VData) // vdata 3298 .addUse(RSrc) // rsrc 3299 .addUse(VIndex) // vindex 3300 .addUse(VOffset) // voffset 3301 .addUse(SOffset) // soffset 3302 .addImm(ImmOffset); // offset(imm) 3303 3304 if (IsTyped) 3305 MIB.addImm(Format); 3306 3307 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3308 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3309 .addMemOperand(MMO); 3310 3311 MI.eraseFromParent(); 3312 return true; 3313 } 3314 3315 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3316 MachineRegisterInfo &MRI, 3317 MachineIRBuilder &B, 3318 bool IsFormat, 3319 bool IsTyped) const { 3320 B.setInstr(MI); 3321 3322 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
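// Expected operand layout being parsed below (a sketch; bracketed operands are
// only present for the struct and typed variants):
//   dst = G_INTRINSIC llvm.amdgcn.{raw,struct}.[t]buffer.load,
//         rsrc, [vindex], voffset, soffset, [format], aux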
3323 MachineMemOperand *MMO = *MI.memoperands_begin(); 3324 const int MemSize = MMO->getSize(); 3325 const LLT S32 = LLT::scalar(32); 3326 3327 Register Dst = MI.getOperand(0).getReg(); 3328 Register RSrc = MI.getOperand(2).getReg(); 3329 3330 // The typed intrinsics add an immediate after the registers. 3331 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3332 3333 // The struct intrinsic variants add one additional operand over raw. 3334 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3335 Register VIndex; 3336 int OpOffset = 0; 3337 if (HasVIndex) { 3338 VIndex = MI.getOperand(3).getReg(); 3339 OpOffset = 1; 3340 } 3341 3342 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3343 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3344 3345 unsigned Format = 0; 3346 if (IsTyped) { 3347 Format = MI.getOperand(5 + OpOffset).getImm(); 3348 ++OpOffset; 3349 } 3350 3351 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3352 unsigned ImmOffset; 3353 unsigned TotalOffset; 3354 3355 LLT Ty = MRI.getType(Dst); 3356 LLT EltTy = Ty.getScalarType(); 3357 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3358 const bool Unpacked = ST.hasUnpackedD16VMem(); 3359 3360 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3361 if (TotalOffset != 0) 3362 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3363 3364 unsigned Opc; 3365 3366 if (IsTyped) { 3367 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3368 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3369 } else if (IsFormat) { 3370 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3371 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3372 } else { 3373 switch (MemSize) { 3374 case 1: 3375 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3376 break; 3377 case 2: 3378 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3379 break; 3380 default: 3381 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3382 break; 3383 } 3384 } 3385 3386 Register LoadDstReg; 3387 3388 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3389 LLT UnpackedTy = Ty.changeElementSize(32); 3390 3391 if (IsExtLoad) 3392 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3393 else if (Unpacked && IsD16 && Ty.isVector()) 3394 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3395 else 3396 LoadDstReg = Dst; 3397 3398 if (!VIndex) 3399 VIndex = B.buildConstant(S32, 0).getReg(0); 3400 3401 auto MIB = B.buildInstr(Opc) 3402 .addDef(LoadDstReg) // vdata 3403 .addUse(RSrc) // rsrc 3404 .addUse(VIndex) // vindex 3405 .addUse(VOffset) // voffset 3406 .addUse(SOffset) // soffset 3407 .addImm(ImmOffset); // offset(imm) 3408 3409 if (IsTyped) 3410 MIB.addImm(Format); 3411 3412 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3413 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3414 .addMemOperand(MMO); 3415 3416 if (LoadDstReg != Dst) { 3417 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3418 3419 // Widen result for extending loads was widened. 
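    // For sub-dword extending loads this is a plain G_TRUNC of the s32
    // temporary; for unpacked d16 vectors, each s32 element is truncated
    // back to s16 and re-merged into the requested vector type.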
3420 if (IsExtLoad) 3421 B.buildTrunc(Dst, LoadDstReg); 3422 else { 3423 // Repack to original 16-bit vector result 3424 // FIXME: G_TRUNC should work, but legalization currently fails 3425 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3426 SmallVector<Register, 4> Repack; 3427 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3428 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3429 B.buildMerge(Dst, Repack); 3430 } 3431 } 3432 3433 MI.eraseFromParent(); 3434 return true; 3435 } 3436 3437 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3438 MachineIRBuilder &B, 3439 bool IsInc) const { 3440 B.setInstr(MI); 3441 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3442 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3443 B.buildInstr(Opc) 3444 .addDef(MI.getOperand(0).getReg()) 3445 .addUse(MI.getOperand(2).getReg()) 3446 .addUse(MI.getOperand(3).getReg()) 3447 .cloneMemRefs(MI); 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3453 switch (IntrID) { 3454 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3455 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3456 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3457 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3458 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3459 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3460 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3461 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3462 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3463 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3464 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3466 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3467 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3468 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3469 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3470 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3471 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3472 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3473 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3474 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3475 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3476 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3477 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3478 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3479 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3481 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3482 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3483 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3484 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3485 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3486 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3487 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3488 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3489 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3490 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3491 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3492 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3493 default: 3494 llvm_unreachable("unhandled atomic opcode"); 3495 } 3496 } 3497 3498 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3499 MachineIRBuilder &B, 3500 Intrinsic::ID IID) const { 3501 B.setInstr(MI); 3502 3503 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3504 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3505 3506 Register Dst = MI.getOperand(0).getReg(); 3507 Register VData = 
MI.getOperand(2).getReg(); 3508 3509 Register CmpVal; 3510 int OpOffset = 0; 3511 3512 if (IsCmpSwap) { 3513 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3514 ++OpOffset; 3515 } 3516 3517 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3518 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3519 3520 // The struct intrinsic variants add one additional operand over raw. 3521 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3522 Register VIndex; 3523 if (HasVIndex) { 3524 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3525 ++OpOffset; 3526 } 3527 3528 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3529 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3530 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3531 3532 MachineMemOperand *MMO = *MI.memoperands_begin(); 3533 3534 unsigned ImmOffset; 3535 unsigned TotalOffset; 3536 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3537 if (TotalOffset != 0) 3538 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3539 3540 if (!VIndex) 3541 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3542 3543 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3544 .addDef(Dst) 3545 .addUse(VData); // vdata 3546 3547 if (IsCmpSwap) 3548 MIB.addReg(CmpVal); 3549 3550 MIB.addUse(RSrc) // rsrc 3551 .addUse(VIndex) // vindex 3552 .addUse(VOffset) // voffset 3553 .addUse(SOffset) // soffset 3554 .addImm(ImmOffset) // offset(imm) 3555 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3556 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3557 .addMemOperand(MMO); 3558 3559 MI.eraseFromParent(); 3560 return true; 3561 } 3562 3563 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3564 /// vector with s16 typed elements. 3565 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3566 SmallVectorImpl<Register> &PackedAddrs, 3567 int AddrIdx, int DimIdx, int NumVAddrs, 3568 int NumGradients) { 3569 const LLT S16 = LLT::scalar(16); 3570 const LLT V2S16 = LLT::vector(2, 16); 3571 3572 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3573 MachineOperand &SrcOp = MI.getOperand(I); 3574 if (!SrcOp.isReg()) 3575 continue; // _L to _LZ may have eliminated this. 3576 3577 Register AddrReg = SrcOp.getReg(); 3578 3579 if (I < DimIdx) { 3580 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3581 PackedAddrs.push_back(AddrReg); 3582 } else { 3583 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3584 // derivatives dx/dh and dx/dv are packed with undef. 3585 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3586 ((NumGradients / 2) % 2 == 1 && 3587 (I == DimIdx + (NumGradients / 2) - 1 || 3588 I == DimIdx + NumGradients - 1)) || 3589 // Check for _L to _LZ optimization 3590 !MI.getOperand(I + 1).isReg()) { 3591 PackedAddrs.push_back( 3592 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3593 .getReg(0)); 3594 } else { 3595 PackedAddrs.push_back( 3596 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3597 .getReg(0)); 3598 ++I; 3599 } 3600 } 3601 } 3602 } 3603 3604 /// Convert from separate vaddr components to a single vector address register, 3605 /// and replace the remaining operands with $noreg. 
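/// For example, three s32 address components become a single <3 x s32>
/// build_vector placed in the first vaddr slot, while five to seven
/// components are padded with undef up to <8 x s32>; the trailing vaddr
/// operands are then cleared to $noreg.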
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3607                                      int DimIdx, int NumVAddrs) {
3608   const LLT S32 = LLT::scalar(32);
3609
3610   SmallVector<Register, 8> AddrRegs;
3611   for (int I = 0; I != NumVAddrs; ++I) {
3612     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3613     if (SrcOp.isReg()) {
3614       AddrRegs.push_back(SrcOp.getReg());
3615       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3616     }
3617   }
3618
3619   int NumAddrRegs = AddrRegs.size();
3620   if (NumAddrRegs != 1) {
3621     // Round up to 8 elements for v5-v7
3622     // FIXME: Missing intermediate sized register classes and instructions.
3623     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3624       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3625       auto Undef = B.buildUndef(S32);
3626       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3627       NumAddrRegs = RoundedNumRegs;
3628     }
3629
3630     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3631     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3632   }
3633
3634   for (int I = 1; I != NumVAddrs; ++I) {
3635     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3636     if (SrcOp.isReg())
3637       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3638   }
3639 }
3640
3641 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3642 ///
3643 /// Depending on the subtarget, loads and stores with 16-bit element data need
3644 /// to be rewritten to use the low half of 32-bit registers, or directly use a
3645 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
3646 /// registers.
3647 ///
3648 /// We don't want to directly select image instructions just yet, but also want
3649 /// to expose all register repacking to the legalizer/combiners. We also don't
3650 /// want a selected instruction entering RegBankSelect. In order to avoid
3651 /// defining a multitude of intermediate image instructions, directly hack on
3652 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3653 /// padding the now unnecessary arguments with $noreg.
3654 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3655     MachineInstr &MI, MachineIRBuilder &B,
3656     GISelChangeObserver &Observer,
3657     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3658   B.setInstr(MI);
3659
3660   const int NumDefs = MI.getNumExplicitDefs();
3661   bool IsTFE = NumDefs == 2;
3662   // We are only processing the operands of d16 image operations on subtargets
3663   // that use the unpacked register layout, or need to repack the TFE result.
3664
3665   // TODO: Do we need to guard against already legalized intrinsics?
3666   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3667       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3668
3669   MachineRegisterInfo *MRI = B.getMRI();
3670   const LLT S32 = LLT::scalar(32);
3671   const LLT S16 = LLT::scalar(16);
3672   const LLT V2S16 = LLT::vector(2, 16);
3673
3674   // Index of first address argument
3675   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3676
3677   // Check for 16 bit addresses and pack if true.
3678   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3679   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3680   const bool IsA16 = AddrTy == S16;
3681
3682   int NumVAddrs, NumGradients;
3683   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3684   const int DMaskIdx = BaseOpcode->Atomic ?
-1 :
3685     getDMaskIdx(BaseOpcode, NumDefs);
3686   unsigned DMask = 0;
3687
3688   int DMaskLanes = 0;
3689   if (!BaseOpcode->Atomic) {
3690     DMask = MI.getOperand(DMaskIdx).getImm();
3691     if (BaseOpcode->Gather4) {
3692       DMaskLanes = 4;
3693     } else if (DMask != 0) {
3694       DMaskLanes = countPopulation(DMask);
3695     } else if (!IsTFE && !BaseOpcode->Store) {
3696       // If dmask is 0, this is a no-op load. This can be eliminated.
3697       B.buildUndef(MI.getOperand(0));
3698       MI.eraseFromParent();
3699       return true;
3700     }
3701   }
3702
3703   Observer.changingInstr(MI);
3704   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3705
3706   unsigned NewOpcode = NumDefs == 0 ?
3707     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3708
3709   // Track that we legalized this
3710   MI.setDesc(B.getTII().get(NewOpcode));
3711
3712   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
3713   // to be at least 1, otherwise the instruction will fail.
3714   if (IsTFE && DMask == 0) {
3715     DMask = 0x1;
3716     DMaskLanes = 1;
3717     MI.getOperand(DMaskIdx).setImm(DMask);
3718   }
3719
3720   if (BaseOpcode->Atomic) {
3721     Register VData0 = MI.getOperand(2).getReg();
3722     LLT Ty = MRI->getType(VData0);
3723
3724     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3725     if (Ty.isVector())
3726       return false;
3727
3728     if (BaseOpcode->AtomicX2) {
3729       Register VData1 = MI.getOperand(3).getReg();
3730       // The two values are packed in one register.
3731       LLT PackedTy = LLT::vector(2, Ty);
3732       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3733       MI.getOperand(2).setReg(Concat.getReg(0));
3734       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3735     }
3736   }
3737
3738   int CorrectedNumVAddrs = NumVAddrs;
3739
3740   // Optimize _L to _LZ when _L is zero
3741   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3742         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3743     const ConstantFP *ConstantLod;
3744     const int LodIdx = AddrIdx + NumVAddrs - 1;
3745
3746     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3747       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3748         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3749         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3750           LZMappingInfo->LZ, ImageDimIntr->Dim);
3751
3752         // The starting indexes should remain in the same place.
3753         --NumVAddrs;
3754         --CorrectedNumVAddrs;
3755
3756         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3757           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3758         MI.RemoveOperand(LodIdx);
3759       }
3760     }
3761   }
3762
3763   // Optimize _mip away when 'lod' is zero.
3764   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3765     int64_t ConstantLod;
3766     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767
3768     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3769       if (ConstantLod == 0) {
3770         // TODO: Change the intrinsic opcode and remove the operand instead of
3771         // replacing it with 0, as is done in the _L to _LZ handling above.
3772         MI.getOperand(LodIdx).ChangeToImmediate(0);
3773         --CorrectedNumVAddrs;
3774       }
3775     }
3776   }
3777
3778   // If the register allocator cannot place the address registers contiguously
3779   // without introducing moves, then using the non-sequential address encoding
3780   // is always preferable, since it saves VALU instructions and is usually a
3781   // wash in terms of code size or even better.
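  // (NSA here is the non-sequential address MIMG encoding introduced with
  // gfx10, gated by ST.hasNSAEncoding().)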
3782 // 3783 // However, we currently have no way of hinting to the register allocator 3784 // that MIMG addresses should be placed contiguously when it is possible to 3785 // do so, so force non-NSA for the common 2-address case as a heuristic. 3786 // 3787 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3788 // allocation when possible. 3789 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3790 3791 // Rewrite the addressing register layout before doing anything else. 3792 if (IsA16) { 3793 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3794 // should be introduced. 3795 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3796 return false; 3797 3798 if (NumVAddrs > 1) { 3799 SmallVector<Register, 4> PackedRegs; 3800 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3801 NumGradients); 3802 3803 if (!UseNSA && PackedRegs.size() > 1) { 3804 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3805 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3806 PackedRegs[0] = Concat.getReg(0); 3807 PackedRegs.resize(1); 3808 } 3809 3810 const int NumPacked = PackedRegs.size(); 3811 for (int I = 0; I != NumVAddrs; ++I) { 3812 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3813 if (!SrcOp.isReg()) { 3814 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3815 continue; 3816 } 3817 3818 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3819 3820 if (I < NumPacked) 3821 SrcOp.setReg(PackedRegs[I]); 3822 else 3823 SrcOp.setReg(AMDGPU::NoRegister); 3824 } 3825 } 3826 } else if (!UseNSA && NumVAddrs > 1) { 3827 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3828 } 3829 3830 3831 if (BaseOpcode->Store) { // No TFE for stores? 3832 // TODO: Handle dmask trim 3833 Register VData = MI.getOperand(1).getReg(); 3834 LLT Ty = MRI->getType(VData); 3835 if (!Ty.isVector() || Ty.getElementType() != S16) 3836 return true; 3837 3838 B.setInstr(MI); 3839 3840 Register RepackedReg = handleD16VData(B, *MRI, VData); 3841 if (RepackedReg != VData) { 3842 MI.getOperand(1).setReg(RepackedReg); 3843 } 3844 3845 return true; 3846 } 3847 3848 Register DstReg = MI.getOperand(0).getReg(); 3849 LLT Ty = MRI->getType(DstReg); 3850 const LLT EltTy = Ty.getScalarType(); 3851 const bool IsD16 = Ty.getScalarType() == S16; 3852 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3853 3854 // Confirm that the return type is large enough for the dmask specified 3855 if (NumElts < DMaskLanes) 3856 return false; 3857 3858 if (NumElts > 4 || DMaskLanes > 4) 3859 return false; 3860 3861 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3862 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3863 3864 // The raw dword aligned data component of the load. The only legal cases 3865 // where this matters should be when using the packed D16 format, for 3866 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3867 LLT RoundedTy; 3868 3869 // S32 vector to to cover all data, plus TFE result element. 3870 LLT TFETy; 3871 3872 // Register type to use for each loaded component. Will be S32 or V2S16. 
  LLT RegTy;
3874
3875   if (IsD16 && ST.hasUnpackedD16VMem()) {
3876     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3877     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3878     RegTy = S32;
3879   } else {
3880     unsigned EltSize = EltTy.getSizeInBits();
3881     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3882     unsigned RoundedSize = 32 * RoundedElts;
3883     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3884     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3885     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3886   }
3887
3888   // The return type does not need adjustment.
3889   // TODO: Should we change s16 case to s32 or <2 x s16>?
3890   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3891     return true;
3892
3893   Register Dst1Reg;
3894
3895   // Insert after the instruction.
3896   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3897
3898   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3899   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3900   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3901   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3902
3903   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3904
3905   MI.getOperand(0).setReg(NewResultReg);
3906
3907   // In the IR, TFE is supposed to be used with a 2 element struct return
3908   // type. The instruction really returns these two values in one contiguous
3909   // register, with one additional dword beyond the loaded data. Rewrite the
3910   // return type to use a single register result.
3911
3912   if (IsTFE) {
3913     Dst1Reg = MI.getOperand(1).getReg();
3914     if (MRI->getType(Dst1Reg) != S32)
3915       return false;
3916
3917     // TODO: Make sure the TFE operand bit is set.
3918     MI.RemoveOperand(1);
3919
3920     // Handle the easy case that requires no repack instructions.
3921     if (Ty == S32) {
3922       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3923       return true;
3924     }
3925   }
3926
3927   // Now figure out how to copy the new result register back into the old
3928   // result.
3929   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3930
3931   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3932
3933   if (ResultNumRegs == 1) {
3934     assert(!IsTFE);
3935     ResultRegs[0] = NewResultReg;
3936   } else {
3937     // We have to repack into a new vector of some kind.
3938     for (int I = 0; I != NumDataRegs; ++I)
3939       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3940     B.buildUnmerge(ResultRegs, NewResultReg);
3941
3942     // Drop the final TFE element to get the data part. The TFE result is
3943     // directly written to the right place already.
3944     if (IsTFE)
3945       ResultRegs.resize(NumDataRegs);
3946   }
3947
3948   // For an s16 scalar result, we form an s32 result with a truncate regardless
3949   // of packed vs. unpacked.
3950   if (IsD16 && !Ty.isVector()) {
3951     B.buildTrunc(DstReg, ResultRegs[0]);
3952     return true;
3953   }
3954
3955   // Avoid a build/concat_vector of 1 entry.
3956   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3957     B.buildBitcast(DstReg, ResultRegs[0]);
3958     return true;
3959   }
3960
3961   assert(Ty.isVector());
3962
3963   if (IsD16) {
3964     // For packed D16 results with TFE enabled, all the data components are
3965     // S32. Cast back to the expected type.
3966     //
3967     // TODO: We don't really need to load s32 elements. We would only need one
3968     // cast for the TFE result if a multiple of v2s16 was used.
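    // Packed d16 layouts bitcast each s32 data register back to <2 x s16>;
    // unpacked layouts truncate each s32 register down to s16 instead.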
3969 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3970 for (Register &Reg : ResultRegs) 3971 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3972 } else if (ST.hasUnpackedD16VMem()) { 3973 for (Register &Reg : ResultRegs) 3974 Reg = B.buildTrunc(S16, Reg).getReg(0); 3975 } 3976 } 3977 3978 auto padWithUndef = [&](LLT Ty, int NumElts) { 3979 if (NumElts == 0) 3980 return; 3981 Register Undef = B.buildUndef(Ty).getReg(0); 3982 for (int I = 0; I != NumElts; ++I) 3983 ResultRegs.push_back(Undef); 3984 }; 3985 3986 // Pad out any elements eliminated due to the dmask. 3987 LLT ResTy = MRI->getType(ResultRegs[0]); 3988 if (!ResTy.isVector()) { 3989 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3990 B.buildBuildVector(DstReg, ResultRegs); 3991 return true; 3992 } 3993 3994 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3995 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3996 3997 // Deal with the one annoying legal case. 3998 const LLT V3S16 = LLT::vector(3, 16); 3999 if (Ty == V3S16) { 4000 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4001 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4002 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4003 return true; 4004 } 4005 4006 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4007 B.buildConcatVectors(DstReg, ResultRegs); 4008 return true; 4009 } 4010 4011 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4012 MachineInstr &MI, MachineIRBuilder &B, 4013 GISelChangeObserver &Observer) const { 4014 Register Dst = MI.getOperand(0).getReg(); 4015 LLT Ty = B.getMRI()->getType(Dst); 4016 unsigned Size = Ty.getSizeInBits(); 4017 MachineFunction &MF = B.getMF(); 4018 4019 Observer.changingInstr(MI); 4020 4021 // FIXME: We don't really need this intermediate instruction. The intrinsic 4022 // should be fixed to have a memory operand. Since it's readnone, we're not 4023 // allowed to add one. 4024 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4025 MI.RemoveOperand(1); // Remove intrinsic ID 4026 4027 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4028 // TODO: Should this use datalayout alignment? 4029 const unsigned MemSize = (Size + 7) / 8; 4030 const Align MemAlign(4); 4031 MachineMemOperand *MMO = MF.getMachineMemOperand( 4032 MachinePointerInfo(), 4033 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4034 MachineMemOperand::MOInvariant, 4035 MemSize, MemAlign); 4036 MI.addMemOperand(MF, MMO); 4037 4038 // There are no 96-bit result scalar loads, but widening to 128-bit should 4039 // always be legal. We may need to restore this to a 96-bit result if it turns 4040 // out this needs to be converted to a vector load during RegBankSelect. 4041 if (!isPowerOf2_32(Size)) { 4042 LegalizerHelper Helper(MF, *this, Observer, B); 4043 B.setInstr(MI); 4044 4045 if (Ty.isVector()) 4046 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4047 else 4048 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4049 } 4050 4051 Observer.changedInstr(MI); 4052 return true; 4053 } 4054 4055 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4056 MachineRegisterInfo &MRI, 4057 MachineIRBuilder &B) const { 4058 B.setInstr(MI); 4059 4060 // Is non-HSA path or trap-handler disabled? 
// If so, insert an s_endpgm instruction.
4061   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4062       !ST.isTrapHandlerEnabled()) {
4063     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4064   } else {
4065     // Pass queue pointer to trap handler as input, and insert trap instruction
4066     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4067     const ArgDescriptor *Arg =
4068         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4069     if (!Arg)
4070       return false;
4071     MachineRegisterInfo &MRI = *B.getMRI();
4072     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4073     Register LiveIn = getLiveInRegister(
4074         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4075         /*InsertLiveInCopy=*/false);
4076     if (!loadInputValue(LiveIn, B, Arg))
4077       return false;
4078     B.buildCopy(SGPR01, LiveIn);
4079     B.buildInstr(AMDGPU::S_TRAP)
4080         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4081         .addReg(SGPR01, RegState::Implicit);
4082   }
4083
4084   MI.eraseFromParent();
4085   return true;
4086 }
4087
4088 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4089     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4090   B.setInstr(MI);
4091
4092   // Is non-HSA path or trap-handler disabled? If so, report a warning
4093   // accordingly.
4094   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4095       !ST.isTrapHandlerEnabled()) {
4096     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4097                                      "debugtrap handler not supported",
4098                                      MI.getDebugLoc(), DS_Warning);
4099     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4100     Ctx.diagnose(NoTrap);
4101   } else {
4102     // Insert debug-trap instruction
4103     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4104   }
4105
4106   MI.eraseFromParent();
4107   return true;
4108 }
4109
4110 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4111                                             MachineIRBuilder &B,
4112                                             GISelChangeObserver &Observer) const {
4113   MachineRegisterInfo &MRI = *B.getMRI();
4114
4115   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4116   auto IntrID = MI.getIntrinsicID();
4117   switch (IntrID) {
4118   case Intrinsic::amdgcn_if:
4119   case Intrinsic::amdgcn_else: {
4120     MachineInstr *Br = nullptr;
4121     MachineBasicBlock *UncondBrTarget = nullptr;
4122     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4123       const SIRegisterInfo *TRI
4124         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4125
4126       B.setInstr(*BrCond);
4127       Register Def = MI.getOperand(1).getReg();
4128       Register Use = MI.getOperand(3).getReg();
4129
4130       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4131       if (IntrID == Intrinsic::amdgcn_if) {
4132         B.buildInstr(AMDGPU::SI_IF)
4133           .addDef(Def)
4134           .addUse(Use)
4135           .addMBB(UncondBrTarget);
4136       } else {
4137         B.buildInstr(AMDGPU::SI_ELSE)
4138           .addDef(Def)
4139           .addUse(Use)
4140           .addMBB(UncondBrTarget)
4141           .addImm(0);
4142       }
4143
4144       if (Br) {
4145         Br->getOperand(0).setMBB(CondBrTarget);
4146       } else {
4147         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4148         // since we're swapping branch targets it needs to be reinserted.
4149 // FIXME: IRTranslator should probably not do this 4150 B.buildBr(*CondBrTarget); 4151 } 4152 4153 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4154 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4155 MI.eraseFromParent(); 4156 BrCond->eraseFromParent(); 4157 return true; 4158 } 4159 4160 return false; 4161 } 4162 case Intrinsic::amdgcn_loop: { 4163 MachineInstr *Br = nullptr; 4164 MachineBasicBlock *UncondBrTarget = nullptr; 4165 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4166 const SIRegisterInfo *TRI 4167 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4168 4169 B.setInstr(*BrCond); 4170 4171 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4172 Register Reg = MI.getOperand(2).getReg(); 4173 B.buildInstr(AMDGPU::SI_LOOP) 4174 .addUse(Reg) 4175 .addMBB(UncondBrTarget); 4176 4177 if (Br) 4178 Br->getOperand(0).setMBB(CondBrTarget); 4179 else 4180 B.buildBr(*CondBrTarget); 4181 4182 MI.eraseFromParent(); 4183 BrCond->eraseFromParent(); 4184 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4185 return true; 4186 } 4187 4188 return false; 4189 } 4190 case Intrinsic::amdgcn_kernarg_segment_ptr: 4191 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4192 B.setInstr(MI); 4193 // This only makes sense to call in a kernel, so just lower to null. 4194 B.buildConstant(MI.getOperand(0).getReg(), 0); 4195 MI.eraseFromParent(); 4196 return true; 4197 } 4198 4199 return legalizePreloadedArgIntrin( 4200 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4201 case Intrinsic::amdgcn_implicitarg_ptr: 4202 return legalizeImplicitArgPtr(MI, MRI, B); 4203 case Intrinsic::amdgcn_workitem_id_x: 4204 return legalizePreloadedArgIntrin(MI, MRI, B, 4205 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4206 case Intrinsic::amdgcn_workitem_id_y: 4207 return legalizePreloadedArgIntrin(MI, MRI, B, 4208 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4209 case Intrinsic::amdgcn_workitem_id_z: 4210 return legalizePreloadedArgIntrin(MI, MRI, B, 4211 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4212 case Intrinsic::amdgcn_workgroup_id_x: 4213 return legalizePreloadedArgIntrin(MI, MRI, B, 4214 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4215 case Intrinsic::amdgcn_workgroup_id_y: 4216 return legalizePreloadedArgIntrin(MI, MRI, B, 4217 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4218 case Intrinsic::amdgcn_workgroup_id_z: 4219 return legalizePreloadedArgIntrin(MI, MRI, B, 4220 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4221 case Intrinsic::amdgcn_dispatch_ptr: 4222 return legalizePreloadedArgIntrin(MI, MRI, B, 4223 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4224 case Intrinsic::amdgcn_queue_ptr: 4225 return legalizePreloadedArgIntrin(MI, MRI, B, 4226 AMDGPUFunctionArgInfo::QUEUE_PTR); 4227 case Intrinsic::amdgcn_implicit_buffer_ptr: 4228 return legalizePreloadedArgIntrin( 4229 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4230 case Intrinsic::amdgcn_dispatch_id: 4231 return legalizePreloadedArgIntrin(MI, MRI, B, 4232 AMDGPUFunctionArgInfo::DISPATCH_ID); 4233 case Intrinsic::amdgcn_fdiv_fast: 4234 return legalizeFDIVFastIntrin(MI, MRI, B); 4235 case Intrinsic::amdgcn_is_shared: 4236 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4237 case Intrinsic::amdgcn_is_private: 4238 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4239 case Intrinsic::amdgcn_wavefrontsize: { 4240 B.setInstr(MI); 4241 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4242 MI.eraseFromParent(); 4243 return true; 4244 } 4245 case 
Intrinsic::amdgcn_s_buffer_load: 4246 return legalizeSBufferLoad(MI, B, Observer); 4247 case Intrinsic::amdgcn_raw_buffer_store: 4248 case Intrinsic::amdgcn_struct_buffer_store: 4249 return legalizeBufferStore(MI, MRI, B, false, false); 4250 case Intrinsic::amdgcn_raw_buffer_store_format: 4251 case Intrinsic::amdgcn_struct_buffer_store_format: 4252 return legalizeBufferStore(MI, MRI, B, false, true); 4253 case Intrinsic::amdgcn_raw_tbuffer_store: 4254 case Intrinsic::amdgcn_struct_tbuffer_store: 4255 return legalizeBufferStore(MI, MRI, B, true, true); 4256 case Intrinsic::amdgcn_raw_buffer_load: 4257 case Intrinsic::amdgcn_struct_buffer_load: 4258 return legalizeBufferLoad(MI, MRI, B, false, false); 4259 case Intrinsic::amdgcn_raw_buffer_load_format: 4260 case Intrinsic::amdgcn_struct_buffer_load_format: 4261 return legalizeBufferLoad(MI, MRI, B, true, false); 4262 case Intrinsic::amdgcn_raw_tbuffer_load: 4263 case Intrinsic::amdgcn_struct_tbuffer_load: 4264 return legalizeBufferLoad(MI, MRI, B, true, true); 4265 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4266 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4267 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4268 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4269 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4270 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4271 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4272 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4273 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4274 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4275 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4276 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4277 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4278 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4279 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4280 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4281 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4282 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4283 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4284 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4285 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4286 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4287 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4288 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4289 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4290 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4291 return legalizeBufferAtomic(MI, B, IntrID); 4292 case Intrinsic::amdgcn_atomic_inc: 4293 return legalizeAtomicIncDec(MI, B, true); 4294 case Intrinsic::amdgcn_atomic_dec: 4295 return legalizeAtomicIncDec(MI, B, false); 4296 case Intrinsic::trap: 4297 return legalizeTrapIntrinsic(MI, MRI, B); 4298 case Intrinsic::debugtrap: 4299 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4300 default: { 4301 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4302 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4303 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4304 return true; 4305 } 4306 } 4307 4308 return true; 4309 } 4310