//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

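// Mutation that splits a vector so that each resulting piece is at most 64
// bits wide; for example, a 128-bit v4s32 is broken into v2s32 pieces.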
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
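// For example s64, v2s32 and v4s16 qualify, while v3s16 and s48 do not.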
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr =
      GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                 S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_TRUNC)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ?
        512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
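  // (G_SEXT_INREG sign-extends from a given bit width in place, e.g.
  // %dst:_(s32) = G_SEXT_INREG %src, 8 treats the low 8 bits of %src as a
  // signed value.)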
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f /
                        numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
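  // Adding and then subtracting 2^52 (with the sign of Src) rounds the value
  // to an integer: once the magnitude reaches 2^52 a double has no fraction
  // bits, so the add discards the fraction and the sub recovers the rounded
  // result.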
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // The ubfe intrinsic takes (src, offset, width); pass the high half of the
  // double as the source.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1729 B.buildFPTOSI(S32, FloorMul) : 1730 B.buildFPTOUI(S32, FloorMul); 1731 auto Lo = B.buildFPTOUI(S32, Fma); 1732 1733 B.buildMerge(Dst, { Lo, Hi }); 1734 MI.eraseFromParent(); 1735 1736 return true; 1737 } 1738 1739 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1740 MachineInstr &MI, MachineRegisterInfo &MRI, 1741 MachineIRBuilder &B) const { 1742 MachineFunction &MF = B.getMF(); 1743 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1744 1745 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1746 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1747 1748 // With ieee_mode disabled, the instructions have the correct behavior 1749 // already for G_FMINNUM/G_FMAXNUM 1750 if (!MFI->getMode().IEEE) 1751 return !IsIEEEOp; 1752 1753 if (IsIEEEOp) 1754 return true; 1755 1756 MachineIRBuilder HelperBuilder(MI); 1757 GISelObserverWrapper DummyObserver; 1758 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1759 HelperBuilder.setInstr(MI); 1760 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1761 } 1762 1763 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1764 MachineInstr &MI, MachineRegisterInfo &MRI, 1765 MachineIRBuilder &B) const { 1766 // TODO: Should move some of this into LegalizerHelper. 1767 1768 // TODO: Promote dynamic indexing of s16 to s32 1769 1770 // FIXME: Artifact combiner probably should have replaced the truncated 1771 // constant before this, so we shouldn't need 1772 // getConstantVRegValWithLookThrough. 1773 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1774 MI.getOperand(2).getReg(), MRI); 1775 if (!IdxVal) // Dynamic case will be selected to register indexing. 1776 return true; 1777 1778 Register Dst = MI.getOperand(0).getReg(); 1779 Register Vec = MI.getOperand(1).getReg(); 1780 1781 LLT VecTy = MRI.getType(Vec); 1782 LLT EltTy = VecTy.getElementType(); 1783 assert(EltTy == MRI.getType(Dst)); 1784 1785 B.setInstr(MI); 1786 1787 if (IdxVal->Value < VecTy.getNumElements()) 1788 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1789 else 1790 B.buildUndef(Dst); 1791 1792 MI.eraseFromParent(); 1793 return true; 1794 } 1795 1796 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1797 MachineInstr &MI, MachineRegisterInfo &MRI, 1798 MachineIRBuilder &B) const { 1799 // TODO: Should move some of this into LegalizerHelper. 1800 1801 // TODO: Promote dynamic indexing of s16 to s32 1802 1803 // FIXME: Artifact combiner probably should have replaced the truncated 1804 // constant before this, so we shouldn't need 1805 // getConstantVRegValWithLookThrough. 1806 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1807 MI.getOperand(3).getReg(), MRI); 1808 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1809 return true; 1810 1811 Register Dst = MI.getOperand(0).getReg(); 1812 Register Vec = MI.getOperand(1).getReg(); 1813 Register Ins = MI.getOperand(2).getReg(); 1814 1815 LLT VecTy = MRI.getType(Vec); 1816 LLT EltTy = VecTy.getElementType(); 1817 assert(EltTy == MRI.getType(Ins)); 1818 1819 B.setInstr(MI); 1820 1821 if (IdxVal->Value < VecTy.getNumElements()) 1822 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1823 else 1824 B.buildUndef(Dst); 1825 1826 MI.eraseFromParent(); 1827 return true; 1828 } 1829 1830 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1831 MachineInstr &MI, MachineRegisterInfo &MRI, 1832 MachineIRBuilder &B) const { 1833 const LLT V2S16 = LLT::vector(2, 16); 1834 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Src0 = MI.getOperand(1).getReg(); 1837 LLT DstTy = MRI.getType(Dst); 1838 LLT SrcTy = MRI.getType(Src0); 1839 1840 if (SrcTy == V2S16 && DstTy == V2S16 && 1841 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1842 return true; 1843 1844 MachineIRBuilder HelperBuilder(MI); 1845 GISelObserverWrapper DummyObserver; 1846 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1847 HelperBuilder.setInstr(MI); 1848 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1849 } 1850 1851 bool AMDGPULegalizerInfo::legalizeSinCos( 1852 MachineInstr &MI, MachineRegisterInfo &MRI, 1853 MachineIRBuilder &B) const { 1854 B.setInstr(MI); 1855 1856 Register DstReg = MI.getOperand(0).getReg(); 1857 Register SrcReg = MI.getOperand(1).getReg(); 1858 LLT Ty = MRI.getType(DstReg); 1859 unsigned Flags = MI.getFlags(); 1860 1861 Register TrigVal; 1862 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1863 if (ST.hasTrigReducedRange()) { 1864 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1865 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1866 .addUse(MulVal.getReg(0)) 1867 .setMIFlags(Flags).getReg(0); 1868 } else 1869 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1870 1871 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1872 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1873 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1874 .addUse(TrigVal) 1875 .setMIFlags(Flags); 1876 MI.eraseFromParent(); 1877 return true; 1878 } 1879 1880 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1881 Register DstReg, LLT PtrTy, 1882 MachineIRBuilder &B, const GlobalValue *GV, 1883 unsigned Offset, unsigned GAFlags) const { 1884 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1885 // to the following code sequence: 1886 // 1887 // For constant address space: 1888 // s_getpc_b64 s[0:1] 1889 // s_add_u32 s0, s0, $symbol 1890 // s_addc_u32 s1, s1, 0 1891 // 1892 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1893 // a fixup or relocation is emitted to replace $symbol with a literal 1894 // constant, which is a pc-relative offset from the encoding of the $symbol 1895 // operand to the global variable. 
1896 // 1897 // For global address space: 1898 // s_getpc_b64 s[0:1] 1899 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1900 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1901 // 1902 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1903 // fixups or relocations are emitted to replace $symbol@*@lo and 1904 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1905 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1906 // operand to the global variable. 1907 // 1908 // What we want here is an offset from the value returned by s_getpc 1909 // (which is the address of the s_add_u32 instruction) to the global 1910 // variable, but since the encoding of $symbol starts 4 bytes after the start 1911 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1912 // small. This requires us to add 4 to the global variable offset in order to 1913 // compute the correct address. 1914 1915 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1916 1917 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1918 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1919 1920 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1921 .addDef(PCReg); 1922 1923 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1924 if (GAFlags == SIInstrInfo::MO_NONE) 1925 MIB.addImm(0); 1926 else 1927 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1928 1929 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1930 1931 if (PtrTy.getSizeInBits() == 32) 1932 B.buildExtract(DstReg, PCReg, 0); 1933 return true; 1934 } 1935 1936 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1937 MachineInstr &MI, MachineRegisterInfo &MRI, 1938 MachineIRBuilder &B) const { 1939 Register DstReg = MI.getOperand(0).getReg(); 1940 LLT Ty = MRI.getType(DstReg); 1941 unsigned AS = Ty.getAddressSpace(); 1942 1943 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1944 MachineFunction &MF = B.getMF(); 1945 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1946 B.setInstr(MI); 1947 1948 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1949 if (!MFI->isEntryFunction()) { 1950 const Function &Fn = MF.getFunction(); 1951 DiagnosticInfoUnsupported BadLDSDecl( 1952 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1953 Fn.getContext().diagnose(BadLDSDecl); 1954 } 1955 1956 // TODO: We could emit code to handle the initialization somewhere. 
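// Sketch of the strategy below: an LDS global with no defined initializer is
// either left in place carrying an absolute-address relocation, or folded to
// the constant byte offset that allocateLDSGlobal assigns it within the
// kernel's LDS block.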
1957 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1958 const SITargetLowering *TLI = ST.getTargetLowering();
1959 if (!TLI->shouldUseLDSConstAddress(GV)) {
1960 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1961 return true; // Leave in place;
1962 }
1963
1964 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1965 MI.eraseFromParent();
1966 return true;
1967 }
1968
1969 const Function &Fn = MF.getFunction();
1970 DiagnosticInfoUnsupported BadInit(
1971 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1972 Fn.getContext().diagnose(BadInit);
1973 return true;
1974 }
1975
1976 const SITargetLowering *TLI = ST.getTargetLowering();
1977
1978 if (TLI->shouldEmitFixup(GV)) {
1979 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1980 MI.eraseFromParent();
1981 return true;
1982 }
1983
1984 if (TLI->shouldEmitPCReloc(GV)) {
1985 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1986 MI.eraseFromParent();
1987 return true;
1988 }
1989
1990 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1991 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1992
1993 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1994 MachinePointerInfo::getGOT(MF),
1995 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1996 MachineMemOperand::MOInvariant,
1997 8 /*Size*/, 8 /*Align*/);
1998
1999 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2000
2001 if (Ty.getSizeInBits() == 32) {
2002 // Truncate if this is a 32-bit constant address.
2003 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2004 B.buildExtract(DstReg, Load, 0);
2005 } else
2006 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2007
2008 MI.eraseFromParent();
2009 return true;
2010 }
2011
2012 bool AMDGPULegalizerInfo::legalizeLoad(
2013 MachineInstr &MI, MachineRegisterInfo &MRI,
2014 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2015 B.setInstr(MI);
2016 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2017 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2018 Observer.changingInstr(MI);
2019 MI.getOperand(1).setReg(Cast.getReg(0));
2020 Observer.changedInstr(MI);
2021 return true;
2022 }
2023
2024 bool AMDGPULegalizerInfo::legalizeFMad(
2025 MachineInstr &MI, MachineRegisterInfo &MRI,
2026 MachineIRBuilder &B) const {
2027 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2028 assert(Ty.isScalar());
2029
2030 MachineFunction &MF = B.getMF();
2031 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032
2033 // TODO: Always legal with future ftz flag.
2034 // FIXME: Do we need just output?
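// Note (assumption, not verified here): the MAD/FMAC instructions selected for
// G_FMAD are only expected to match fmul+fadd when denormals are flushed,
// which is why legality below is keyed off the relevant denormal mode.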
2035 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2036 return true; 2037 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2038 return true; 2039 2040 MachineIRBuilder HelperBuilder(MI); 2041 GISelObserverWrapper DummyObserver; 2042 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2043 HelperBuilder.setMBB(*MI.getParent()); 2044 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2049 Register DstReg = MI.getOperand(0).getReg(); 2050 Register PtrReg = MI.getOperand(1).getReg(); 2051 Register CmpVal = MI.getOperand(2).getReg(); 2052 Register NewVal = MI.getOperand(3).getReg(); 2053 2054 assert(SITargetLowering::isFlatGlobalAddrSpace( 2055 MRI.getType(PtrReg).getAddressSpace()) && 2056 "this should not have been custom lowered"); 2057 2058 LLT ValTy = MRI.getType(CmpVal); 2059 LLT VecTy = LLT::vector(2, ValTy); 2060 2061 B.setInstr(MI); 2062 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2063 2064 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2065 .addDef(DstReg) 2066 .addUse(PtrReg) 2067 .addUse(PackedVal) 2068 .setMemRefs(MI.memoperands()); 2069 2070 MI.eraseFromParent(); 2071 return true; 2072 } 2073 2074 bool AMDGPULegalizerInfo::legalizeFlog( 2075 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2076 Register Dst = MI.getOperand(0).getReg(); 2077 Register Src = MI.getOperand(1).getReg(); 2078 LLT Ty = B.getMRI()->getType(Dst); 2079 unsigned Flags = MI.getFlags(); 2080 B.setInstr(MI); 2081 2082 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2083 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2084 2085 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2086 MI.eraseFromParent(); 2087 return true; 2088 } 2089 2090 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2091 MachineIRBuilder &B) const { 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Src = MI.getOperand(1).getReg(); 2094 unsigned Flags = MI.getFlags(); 2095 LLT Ty = B.getMRI()->getType(Dst); 2096 B.setInstr(MI); 2097 2098 auto K = B.buildFConstant(Ty, numbers::log2e); 2099 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2100 B.buildFExp2(Dst, Mul, Flags); 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2106 MachineIRBuilder &B) const { 2107 Register Dst = MI.getOperand(0).getReg(); 2108 Register Src0 = MI.getOperand(1).getReg(); 2109 Register Src1 = MI.getOperand(2).getReg(); 2110 unsigned Flags = MI.getFlags(); 2111 LLT Ty = B.getMRI()->getType(Dst); 2112 B.setInstr(MI); 2113 const LLT S16 = LLT::scalar(16); 2114 const LLT S32 = LLT::scalar(32); 2115 2116 if (Ty == S32) { 2117 auto Log = B.buildFLog2(S32, Src0, Flags); 2118 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2119 .addUse(Log.getReg(0)) 2120 .addUse(Src1) 2121 .setMIFlags(Flags); 2122 B.buildFExp2(Dst, Mul, Flags); 2123 } else if (Ty == S16) { 2124 // There's no f16 fmul_legacy, so we need to convert for it. 
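// Sketch: the same pow(x, y) = exp2(y * log2(x)) expansion as the f32 path is
// used, but the multiply is performed in f32 (extend both operands, do the
// legacy multiply, truncate back to f16) before the final exp2.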
2125 auto Log = B.buildFLog2(S16, Src0, Flags); 2126 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2127 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2128 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2129 .addUse(Ext0.getReg(0)) 2130 .addUse(Ext1.getReg(0)) 2131 .setMIFlags(Flags); 2132 2133 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2134 } else 2135 return false; 2136 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 // Find a source register, ignoring any possible source modifiers. 2142 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2143 Register ModSrc = OrigSrc; 2144 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2145 ModSrc = SrcFNeg->getOperand(1).getReg(); 2146 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2147 ModSrc = SrcFAbs->getOperand(1).getReg(); 2148 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2149 ModSrc = SrcFAbs->getOperand(1).getReg(); 2150 return ModSrc; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2154 MachineRegisterInfo &MRI, 2155 MachineIRBuilder &B) const { 2156 B.setInstr(MI); 2157 2158 const LLT S1 = LLT::scalar(1); 2159 const LLT S64 = LLT::scalar(64); 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register OrigSrc = MI.getOperand(1).getReg(); 2162 unsigned Flags = MI.getFlags(); 2163 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2164 "this should not have been custom lowered"); 2165 2166 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2167 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2168 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2169 // V_FRACT bug is: 2170 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2171 // 2172 // Convert floor(x) to (x - fract(x)) 2173 2174 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2175 .addUse(OrigSrc) 2176 .setMIFlags(Flags); 2177 2178 // Give source modifier matching some assistance before obscuring a foldable 2179 // pattern. 2180 2181 // TODO: We can avoid the neg on the fract? The input sign to fract 2182 // shouldn't matter? 2183 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2184 2185 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2186 2187 Register Min = MRI.createGenericVirtualRegister(S64); 2188 2189 // We don't need to concern ourselves with the snan handling difference, so 2190 // use the one which will directly select. 2191 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2192 if (MFI->getMode().IEEE) 2193 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2194 else 2195 B.buildFMinNum(Min, Fract, Const, Flags); 2196 2197 Register CorrectedFract = Min; 2198 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2199 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2200 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2201 } 2202 2203 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2204 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2205 2206 MI.eraseFromParent(); 2207 return true; 2208 } 2209 2210 // Turn an illegal packed v2s16 build vector into bit operations. 2211 // TODO: This should probably be a bitcast action in LegalizerHelper. 
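// Illustrative expansion performed below (sketch):
//   %m:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %m:_(s32)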
2212 bool AMDGPULegalizerInfo::legalizeBuildVector( 2213 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2214 Register Dst = MI.getOperand(0).getReg(); 2215 LLT DstTy = MRI.getType(Dst); 2216 const LLT S32 = LLT::scalar(32); 2217 const LLT V2S16 = LLT::vector(2, 16); 2218 (void)DstTy; 2219 (void)V2S16; 2220 assert(DstTy == V2S16); 2221 2222 Register Src0 = MI.getOperand(1).getReg(); 2223 Register Src1 = MI.getOperand(2).getReg(); 2224 assert(MRI.getType(Src0) == LLT::scalar(16)); 2225 2226 B.setInstr(MI); 2227 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2228 B.buildBitcast(Dst, Merge); 2229 2230 MI.eraseFromParent(); 2231 return true; 2232 } 2233 2234 // Return the use branch instruction, otherwise null if the usage is invalid. 2235 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2236 MachineRegisterInfo &MRI, 2237 MachineInstr *&Br) { 2238 Register CondDef = MI.getOperand(0).getReg(); 2239 if (!MRI.hasOneNonDBGUse(CondDef)) 2240 return nullptr; 2241 2242 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2243 if (UseMI.getParent() != MI.getParent() || 2244 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2245 return nullptr; 2246 2247 // Make sure the cond br is followed by a G_BR 2248 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2249 if (Next != MI.getParent()->end()) { 2250 if (Next->getOpcode() != AMDGPU::G_BR) 2251 return nullptr; 2252 Br = &*Next; 2253 } 2254 2255 return &UseMI; 2256 } 2257 2258 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2259 MachineRegisterInfo &MRI, 2260 Register LiveIn, 2261 Register PhyReg) const { 2262 assert(PhyReg.isPhysical() && "Physical register expected"); 2263 2264 // Insert the live-in copy, if required, by defining destination virtual 2265 // register. 2266 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2267 if (!MRI.getVRegDef(LiveIn)) {
2268 // FIXME: Should have scoped insert pt
2269 MachineBasicBlock &OrigInsBB = B.getMBB();
2270 auto OrigInsPt = B.getInsertPt();
2271
2272 MachineBasicBlock &EntryMBB = B.getMF().front();
2273 EntryMBB.addLiveIn(PhyReg);
2274 B.setInsertPt(EntryMBB, EntryMBB.begin());
2275 B.buildCopy(LiveIn, PhyReg);
2276
2277 B.setInsertPt(OrigInsBB, OrigInsPt);
2278 }
2279
2280 return LiveIn;
2281 }
2282
2283 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2284 MachineRegisterInfo &MRI,
2285 Register PhyReg, LLT Ty,
2286 bool InsertLiveInCopy) const {
2287 assert(PhyReg.isPhysical() && "Physical register expected");
2288
2289 // Get or create the virtual live-in register
2290 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2291 if (!LiveIn) {
2292 LiveIn = MRI.createGenericVirtualRegister(Ty);
2293 MRI.addLiveIn(PhyReg, LiveIn);
2294 }
2295
2296 // When the actual copy required is from virtual register to physical
2297 // register (to be inserted later), live-in copy insertion from physical
2298 // to virtual register is not required
2299 if (!InsertLiveInCopy)
2300 return LiveIn;
2301
2302 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2303 }
2304
2305 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2306 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2307 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2308 const ArgDescriptor *Arg;
2309 const TargetRegisterClass *RC;
2310 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2311 if (!Arg) {
2312 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2313 return nullptr;
2314 }
2315 return Arg;
2316 }
2317
2318 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2319 const ArgDescriptor *Arg) const {
2320 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2321 return false; // TODO: Handle these
2322
2323 Register SrcReg = Arg->getRegister();
2324 assert(SrcReg.isPhysical() && "Physical register expected");
2325 assert(DstReg.isVirtual() && "Virtual register expected");
2326
2327 MachineRegisterInfo &MRI = *B.getMRI();
2328
2329 LLT Ty = MRI.getType(DstReg);
2330 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2331
2332 if (Arg->isMasked()) {
2333 // TODO: Should we try to emit this once in the entry block?
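// Worked example (illustrative values): for an argument packed at bits
// [19:10] the descriptor carries Mask = 0xffc00, so Shift = 10 and the value
// is recovered below as (LiveIn >> 10) & 0x3ff.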
2334 const LLT S32 = LLT::scalar(32); 2335 const unsigned Mask = Arg->getMask(); 2336 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2337 2338 Register AndMaskSrc = LiveIn; 2339 2340 if (Shift != 0) { 2341 auto ShiftAmt = B.buildConstant(S32, Shift); 2342 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2343 } 2344 2345 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2346 } else { 2347 B.buildCopy(DstReg, LiveIn); 2348 } 2349 2350 return true; 2351 } 2352 2353 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2354 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2355 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2356 B.setInstr(MI); 2357 2358 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2359 if (!Arg) 2360 return false; 2361 2362 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2363 return false; 2364 2365 MI.eraseFromParent(); 2366 return true; 2367 } 2368 2369 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2370 MachineRegisterInfo &MRI, 2371 MachineIRBuilder &B) const { 2372 B.setInstr(MI); 2373 Register Dst = MI.getOperand(0).getReg(); 2374 LLT DstTy = MRI.getType(Dst); 2375 LLT S16 = LLT::scalar(16); 2376 LLT S32 = LLT::scalar(32); 2377 LLT S64 = LLT::scalar(64); 2378 2379 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2380 return true; 2381 2382 if (DstTy == S16) 2383 return legalizeFDIV16(MI, MRI, B); 2384 if (DstTy == S32) 2385 return legalizeFDIV32(MI, MRI, B); 2386 if (DstTy == S64) 2387 return legalizeFDIV64(MI, MRI, B); 2388 2389 return false; 2390 } 2391 2392 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2393 const LLT S32 = LLT::scalar(32); 2394 2395 auto Cvt0 = B.buildUITOFP(S32, Src); 2396 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2397 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2398 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2399 return B.buildFPTOUI(S32, Mul).getReg(0); 2400 } 2401 2402 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2403 Register DstReg, 2404 Register Num, 2405 Register Den, 2406 bool IsRem) const { 2407 const LLT S1 = LLT::scalar(1); 2408 const LLT S32 = LLT::scalar(32); 2409 2410 // RCP = URECIP(Den) = 2^32 / Den + e 2411 // e is rounding error. 2412 auto RCP = buildDivRCP(B, Den); 2413 2414 // RCP_LO = mul(RCP, Den) 2415 auto RCP_LO = B.buildMul(S32, RCP, Den); 2416 2417 // RCP_HI = mulhu (RCP, Den) */ 2418 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2419 2420 // NEG_RCP_LO = -RCP_LO 2421 auto Zero = B.buildConstant(S32, 0); 2422 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2423 2424 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2425 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2426 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2427 2428 // Calculate the rounding error from the URECIP instruction 2429 // E = mulhu(ABS_RCP_LO, RCP) 2430 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2431 2432 // RCP_A_E = RCP + E 2433 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2434 2435 // RCP_S_E = RCP - E 2436 auto RCP_S_E = B.buildSub(S32, RCP, E); 2437 2438 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
2439 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2440
2441 // Quotient = mulhu(Tmp0, Num)
2442 auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2443
2444 // Num_S_Remainder = Quotient * Den
2445 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2446
2447 // Remainder = Num - Num_S_Remainder
2448 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2449
2450 // Remainder_GE_Den = Remainder >= Den
2451 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2452
2453 // Remainder_GE_Zero = Num >= Num_S_Remainder;
2454 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2455 Num, Num_S_Remainder);
2456
2457 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2458 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2459
2460 // Calculate Division result:
2461
2462 // Quotient_A_One = Quotient + 1
2463 auto One = B.buildConstant(S32, 1);
2464 auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2465
2466 // Quotient_S_One = Quotient - 1
2467 auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2468
2469 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2470 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2471
2472 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2473 if (IsRem) {
2474 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2475
2476 // Calculate Rem result:
2477 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2478
2479 // Remainder_A_Den = Remainder + Den
2480 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2481
2482 // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2483 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2484
2485 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2486 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2487 } else {
2488 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2489 }
2490 }
2491
2492 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2493 MachineRegisterInfo &MRI,
2494 MachineIRBuilder &B) const {
2495 B.setInstr(MI);
2496 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2497 Register DstReg = MI.getOperand(0).getReg();
2498 Register Num = MI.getOperand(1).getReg();
2499 Register Den = MI.getOperand(2).getReg();
2500 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2501 MI.eraseFromParent();
2502 return true;
2503 }
2504
2505 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2506 MachineRegisterInfo &MRI,
2507 MachineIRBuilder &B) const {
2508 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2509 return legalizeUDIV_UREM32(MI, MRI, B);
2510 return false;
2511 }
2512
2513 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2514 MachineRegisterInfo &MRI,
2515 MachineIRBuilder &B) const {
2516 B.setInstr(MI);
2517 const LLT S32 = LLT::scalar(32);
2518
2519 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2520 Register DstReg = MI.getOperand(0).getReg();
2521 Register LHS = MI.getOperand(1).getReg();
2522 Register RHS = MI.getOperand(2).getReg();
2523
2524 auto ThirtyOne = B.buildConstant(S32, 31);
2525 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2526 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2527
2528 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2529 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2530
2531 LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2532 RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2533
2534 Register UDivRem = MRI.createGenericVirtualRegister(S32);
2535 legalizeUDIV_UREM32Impl(B,
UDivRem, LHS, RHS, IsRem); 2536 2537 if (IsRem) { 2538 auto RSign = LHSign; // Remainder sign is the same as LHS 2539 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2540 B.buildSub(DstReg, UDivRem, RSign); 2541 } else { 2542 auto DSign = B.buildXor(S32, LHSign, RHSign); 2543 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2544 B.buildSub(DstReg, UDivRem, DSign); 2545 } 2546 2547 MI.eraseFromParent(); 2548 return true; 2549 } 2550 2551 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2552 MachineRegisterInfo &MRI, 2553 MachineIRBuilder &B) const { 2554 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2555 return legalizeSDIV_SREM32(MI, MRI, B); 2556 return false; 2557 } 2558 2559 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2560 MachineRegisterInfo &MRI, 2561 MachineIRBuilder &B) const { 2562 Register Res = MI.getOperand(0).getReg(); 2563 Register LHS = MI.getOperand(1).getReg(); 2564 Register RHS = MI.getOperand(2).getReg(); 2565 2566 uint16_t Flags = MI.getFlags(); 2567 2568 LLT ResTy = MRI.getType(Res); 2569 LLT S32 = LLT::scalar(32); 2570 LLT S64 = LLT::scalar(64); 2571 2572 const MachineFunction &MF = B.getMF(); 2573 bool Unsafe = 2574 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2575 2576 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2577 return false; 2578 2579 if (!Unsafe && ResTy == S32 && 2580 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2581 return false; 2582 2583 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2584 // 1 / x -> RCP(x) 2585 if (CLHS->isExactlyValue(1.0)) { 2586 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2587 .addUse(RHS) 2588 .setMIFlags(Flags); 2589 2590 MI.eraseFromParent(); 2591 return true; 2592 } 2593 2594 // -1 / x -> RCP( FNEG(x) ) 2595 if (CLHS->isExactlyValue(-1.0)) { 2596 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2597 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2598 .addUse(FNeg.getReg(0)) 2599 .setMIFlags(Flags); 2600 2601 MI.eraseFromParent(); 2602 return true; 2603 } 2604 } 2605 2606 // x / y -> x * (1.0 / y) 2607 if (Unsafe) { 2608 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2609 .addUse(RHS) 2610 .setMIFlags(Flags); 2611 B.buildFMul(Res, LHS, RCP, Flags); 2612 2613 MI.eraseFromParent(); 2614 return true; 2615 } 2616 2617 return false; 2618 } 2619 2620 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2621 MachineRegisterInfo &MRI, 2622 MachineIRBuilder &B) const { 2623 B.setInstr(MI); 2624 Register Res = MI.getOperand(0).getReg(); 2625 Register LHS = MI.getOperand(1).getReg(); 2626 Register RHS = MI.getOperand(2).getReg(); 2627 2628 uint16_t Flags = MI.getFlags(); 2629 2630 LLT S16 = LLT::scalar(16); 2631 LLT S32 = LLT::scalar(32); 2632 2633 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2634 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2635 2636 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2637 .addUse(RHSExt.getReg(0)) 2638 .setMIFlags(Flags); 2639 2640 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2641 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2642 2643 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2644 .addUse(RDst.getReg(0)) 2645 .addUse(RHS) 2646 .addUse(LHS) 2647 .setMIFlags(Flags); 2648 2649 MI.eraseFromParent(); 2650 return true; 2651 } 2652 2653 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2654 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 
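// On subtargets with S_DENORM_MODE (hasDenormModeInst()), the FP32 field is
// written directly while the FP64/FP16 default is preserved; otherwise the
// same bits go through S_SETREG into the MODE register. Which generations
// provide S_DENORM_MODE is not spelled out here.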
2655 static void toggleSPDenormMode(bool Enable, 2656 MachineIRBuilder &B, 2657 const GCNSubtarget &ST, 2658 AMDGPU::SIModeRegisterDefaults Mode) { 2659 // Set SP denorm mode to this value. 2660 unsigned SPDenormMode = 2661 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2662 2663 if (ST.hasDenormModeInst()) { 2664 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2665 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2666 2667 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2668 B.buildInstr(AMDGPU::S_DENORM_MODE) 2669 .addImm(NewDenormModeValue); 2670 2671 } else { 2672 // Select FP32 bit field in mode register. 2673 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2674 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2675 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2676 2677 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2678 .addImm(SPDenormMode) 2679 .addImm(SPDenormModeBitField); 2680 } 2681 } 2682 2683 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2684 MachineRegisterInfo &MRI, 2685 MachineIRBuilder &B) const { 2686 B.setInstr(MI); 2687 Register Res = MI.getOperand(0).getReg(); 2688 Register LHS = MI.getOperand(1).getReg(); 2689 Register RHS = MI.getOperand(2).getReg(); 2690 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2691 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2692 2693 uint16_t Flags = MI.getFlags(); 2694 2695 LLT S32 = LLT::scalar(32); 2696 LLT S1 = LLT::scalar(1); 2697 2698 auto One = B.buildFConstant(S32, 1.0f); 2699 2700 auto DenominatorScaled = 2701 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2702 .addUse(RHS) 2703 .addUse(LHS) 2704 .addImm(1) 2705 .setMIFlags(Flags); 2706 auto NumeratorScaled = 2707 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2708 .addUse(LHS) 2709 .addUse(RHS) 2710 .addImm(0) 2711 .setMIFlags(Flags); 2712 2713 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2714 .addUse(DenominatorScaled.getReg(0)) 2715 .setMIFlags(Flags); 2716 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2717 2718 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2719 // aren't modeled as reading it. 
2720 if (!Mode.allFP32Denormals())
2721 toggleSPDenormMode(true, B, ST, Mode);
2722
2723 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2724 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2725 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2726 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2727 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2728 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2729
2730 if (!Mode.allFP32Denormals())
2731 toggleSPDenormMode(false, B, ST, Mode);
2732
2733 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2734 .addUse(Fma4.getReg(0))
2735 .addUse(Fma1.getReg(0))
2736 .addUse(Fma3.getReg(0))
2737 .addUse(NumeratorScaled.getReg(1))
2738 .setMIFlags(Flags);
2739
2740 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2741 .addUse(Fmas.getReg(0))
2742 .addUse(RHS)
2743 .addUse(LHS)
2744 .setMIFlags(Flags);
2745
2746 MI.eraseFromParent();
2747 return true;
2748 }
2749
2750 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2751 MachineRegisterInfo &MRI,
2752 MachineIRBuilder &B) const {
2753 B.setInstr(MI);
2754 Register Res = MI.getOperand(0).getReg();
2755 Register LHS = MI.getOperand(1).getReg();
2756 Register RHS = MI.getOperand(2).getReg();
2757
2758 uint16_t Flags = MI.getFlags();
2759
2760 LLT S64 = LLT::scalar(64);
2761 LLT S1 = LLT::scalar(1);
2762
2763 auto One = B.buildFConstant(S64, 1.0);
2764
2765 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2766 .addUse(LHS)
2767 .addUse(RHS)
2768 .addImm(1)
2769 .setMIFlags(Flags);
2770
2771 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2772
2773 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2774 .addUse(DivScale0.getReg(0))
2775 .setMIFlags(Flags);
2776
2777 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2778 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2779 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2780
2781 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2782 .addUse(LHS)
2783 .addUse(RHS)
2784 .addImm(0)
2785 .setMIFlags(Flags);
2786
2787 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2788 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2789 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2790
2791 Register Scale;
2792 if (!ST.hasUsableDivScaleConditionOutput()) {
2793 // Workaround a hardware bug on SI where the condition output from div_scale
2794 // is not usable.
2795 2796 LLT S32 = LLT::scalar(32); 2797 2798 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2799 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2800 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2801 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2802 2803 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2804 Scale1Unmerge.getReg(1)); 2805 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2806 Scale0Unmerge.getReg(1)); 2807 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2808 } else { 2809 Scale = DivScale1.getReg(1); 2810 } 2811 2812 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2813 .addUse(Fma4.getReg(0)) 2814 .addUse(Fma3.getReg(0)) 2815 .addUse(Mul.getReg(0)) 2816 .addUse(Scale) 2817 .setMIFlags(Flags); 2818 2819 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2820 .addUse(Fmas.getReg(0)) 2821 .addUse(RHS) 2822 .addUse(LHS) 2823 .setMIFlags(Flags); 2824 2825 MI.eraseFromParent(); 2826 return true; 2827 } 2828 2829 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2830 MachineRegisterInfo &MRI, 2831 MachineIRBuilder &B) const { 2832 B.setInstr(MI); 2833 Register Res = MI.getOperand(0).getReg(); 2834 Register LHS = MI.getOperand(2).getReg(); 2835 Register RHS = MI.getOperand(3).getReg(); 2836 uint16_t Flags = MI.getFlags(); 2837 2838 LLT S32 = LLT::scalar(32); 2839 LLT S1 = LLT::scalar(1); 2840 2841 auto Abs = B.buildFAbs(S32, RHS, Flags); 2842 const APFloat C0Val(1.0f); 2843 2844 auto C0 = B.buildConstant(S32, 0x6f800000); 2845 auto C1 = B.buildConstant(S32, 0x2f800000); 2846 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2847 2848 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2849 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2850 2851 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2852 2853 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2854 .addUse(Mul0.getReg(0)) 2855 .setMIFlags(Flags); 2856 2857 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2858 2859 B.buildFMul(Res, Sel, Mul1, Flags); 2860 2861 MI.eraseFromParent(); 2862 return true; 2863 } 2864 2865 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2866 MachineRegisterInfo &MRI, 2867 MachineIRBuilder &B) const { 2868 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2869 if (!MFI->isEntryFunction()) { 2870 return legalizePreloadedArgIntrin(MI, MRI, B, 2871 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2872 } 2873 2874 B.setInstr(MI); 2875 2876 uint64_t Offset = 2877 ST.getTargetLowering()->getImplicitParameterOffset( 2878 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2879 Register DstReg = MI.getOperand(0).getReg(); 2880 LLT DstTy = MRI.getType(DstReg); 2881 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2882 2883 const ArgDescriptor *Arg; 2884 const TargetRegisterClass *RC; 2885 std::tie(Arg, RC) 2886 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2887 if (!Arg) 2888 return false; 2889 2890 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2891 if (!loadInputValue(KernargPtrReg, B, Arg)) 2892 return false; 2893 2894 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2895 MI.eraseFromParent(); 2896 return true; 2897 } 2898 2899 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2900 MachineRegisterInfo &MRI, 2901 MachineIRBuilder &B, 2902 unsigned AddrSpace) const { 2903 B.setInstr(MI); 2904 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2905 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2906 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2907 MI.eraseFromParent(); 2908 return true; 2909 } 2910 2911 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2912 // offset (the offset that is included in bounds checking and swizzling, to be 2913 // split between the instruction's voffset and immoffset fields) and soffset 2914 // (the offset that is excluded from bounds checking and swizzling, to go in 2915 // the instruction's soffset field). This function takes the first kind of 2916 // offset and figures out how to split it between voffset and immoffset. 2917 std::tuple<Register, unsigned, unsigned> 2918 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2919 Register OrigOffset) const { 2920 const unsigned MaxImm = 4095; 2921 Register BaseReg; 2922 unsigned TotalConstOffset; 2923 MachineInstr *OffsetDef; 2924 const LLT S32 = LLT::scalar(32); 2925 2926 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2927 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2928 2929 unsigned ImmOffset = TotalConstOffset; 2930 2931 // If the immediate value is too big for the immoffset field, put the value 2932 // and -4096 into the immoffset field so that the value that is copied/added 2933 // for the voffset field is a multiple of 4096, and it stands more chance 2934 // of being CSEd with the copy/add for another similar load/store. 2935 // However, do not do that rounding down to a multiple of 4096 if that is a 2936 // negative number, as it appears to be illegal to have a negative offset 2937 // in the vgpr, even if adding the immediate offset makes it positive. 2938 unsigned Overflow = ImmOffset & ~MaxImm; 2939 ImmOffset -= Overflow; 2940 if ((int32_t)Overflow < 0) { 2941 Overflow += ImmOffset; 2942 ImmOffset = 0; 2943 } 2944 2945 if (Overflow != 0) { 2946 if (!BaseReg) { 2947 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2948 } else { 2949 auto OverflowVal = B.buildConstant(S32, Overflow); 2950 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2951 } 2952 } 2953 2954 if (!BaseReg) 2955 BaseReg = B.buildConstant(S32, 0).getReg(0); 2956 2957 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2958 } 2959 2960 /// Handle register layout difference for f16 images for some subtargets. 2961 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2962 MachineRegisterInfo &MRI, 2963 Register Reg) const { 2964 if (!ST.hasUnpackedD16VMem()) 2965 return Reg; 2966 2967 const LLT S16 = LLT::scalar(16); 2968 const LLT S32 = LLT::scalar(32); 2969 LLT StoreVT = MRI.getType(Reg); 2970 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2971 2972 auto Unmerge = B.buildUnmerge(S16, Reg); 2973 2974 SmallVector<Register, 4> WideRegs; 2975 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2976 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2977 2978 int NumElts = StoreVT.getNumElements(); 2979 2980 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2981 } 2982 2983 Register AMDGPULegalizerInfo::fixStoreSourceType( 2984 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2985 MachineRegisterInfo *MRI = B.getMRI(); 2986 LLT Ty = MRI->getType(VData); 2987 2988 const LLT S16 = LLT::scalar(16); 2989 2990 // Fixup illegal register types for i8 stores. 
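// e.g. an s8 or s16 store value is any-extended to s32 here; the byte/short
// buffer store variants chosen from MemSize later only consume the low bits
// of that register (assumption based on the opcode selection below).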
2991 if (Ty == LLT::scalar(8) || Ty == S16) { 2992 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2993 return AnyExt; 2994 } 2995 2996 if (Ty.isVector()) { 2997 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2998 if (IsFormat) 2999 return handleD16VData(B, *MRI, VData); 3000 } 3001 } 3002 3003 return VData; 3004 } 3005 3006 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3007 MachineRegisterInfo &MRI, 3008 MachineIRBuilder &B, 3009 bool IsTyped, 3010 bool IsFormat) const { 3011 B.setInstr(MI); 3012 3013 Register VData = MI.getOperand(1).getReg(); 3014 LLT Ty = MRI.getType(VData); 3015 LLT EltTy = Ty.getScalarType(); 3016 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3017 const LLT S32 = LLT::scalar(32); 3018 3019 VData = fixStoreSourceType(B, VData, IsFormat); 3020 Register RSrc = MI.getOperand(2).getReg(); 3021 3022 MachineMemOperand *MMO = *MI.memoperands_begin(); 3023 const int MemSize = MMO->getSize(); 3024 3025 unsigned ImmOffset; 3026 unsigned TotalOffset; 3027 3028 // The typed intrinsics add an immediate after the registers. 3029 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3030 3031 // The struct intrinsic variants add one additional operand over raw. 3032 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3033 Register VIndex; 3034 int OpOffset = 0; 3035 if (HasVIndex) { 3036 VIndex = MI.getOperand(3).getReg(); 3037 OpOffset = 1; 3038 } 3039 3040 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3041 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3042 3043 unsigned Format = 0; 3044 if (IsTyped) { 3045 Format = MI.getOperand(5 + OpOffset).getImm(); 3046 ++OpOffset; 3047 } 3048 3049 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3050 3051 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3052 if (TotalOffset != 0) 3053 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3054 3055 unsigned Opc; 3056 if (IsTyped) { 3057 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3058 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3059 } else if (IsFormat) { 3060 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3061 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3062 } else { 3063 switch (MemSize) { 3064 case 1: 3065 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3066 break; 3067 case 2: 3068 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3069 break; 3070 default: 3071 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3072 break; 3073 } 3074 } 3075 3076 if (!VIndex) 3077 VIndex = B.buildConstant(S32, 0).getReg(0); 3078 3079 auto MIB = B.buildInstr(Opc) 3080 .addUse(VData) // vdata 3081 .addUse(RSrc) // rsrc 3082 .addUse(VIndex) // vindex 3083 .addUse(VOffset) // voffset 3084 .addUse(SOffset) // soffset 3085 .addImm(ImmOffset); // offset(imm) 3086 3087 if (IsTyped) 3088 MIB.addImm(Format); 3089 3090 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3091 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3092 .addMemOperand(MMO); 3093 3094 MI.eraseFromParent(); 3095 return true; 3096 } 3097 3098 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3099 MachineRegisterInfo &MRI, 3100 MachineIRBuilder &B, 3101 bool IsFormat, 3102 bool IsTyped) const { 3103 B.setInstr(MI); 3104 3105 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
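// Operand layout assumed by the code below (raw form; the struct form adds a
// vindex after rsrc): dst, intrinsic id, rsrc, [vindex (struct only),]
// voffset, soffset, [format (typed only),] aux.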
3106 MachineMemOperand *MMO = *MI.memoperands_begin(); 3107 const int MemSize = MMO->getSize(); 3108 const LLT S32 = LLT::scalar(32); 3109 3110 Register Dst = MI.getOperand(0).getReg(); 3111 Register RSrc = MI.getOperand(2).getReg(); 3112 3113 // The typed intrinsics add an immediate after the registers. 3114 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3115 3116 // The struct intrinsic variants add one additional operand over raw. 3117 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3118 Register VIndex; 3119 int OpOffset = 0; 3120 if (HasVIndex) { 3121 VIndex = MI.getOperand(3).getReg(); 3122 OpOffset = 1; 3123 } 3124 3125 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3126 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3127 3128 unsigned Format = 0; 3129 if (IsTyped) { 3130 Format = MI.getOperand(5 + OpOffset).getImm(); 3131 ++OpOffset; 3132 } 3133 3134 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3135 unsigned ImmOffset; 3136 unsigned TotalOffset; 3137 3138 LLT Ty = MRI.getType(Dst); 3139 LLT EltTy = Ty.getScalarType(); 3140 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3141 const bool Unpacked = ST.hasUnpackedD16VMem(); 3142 3143 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3144 if (TotalOffset != 0) 3145 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3146 3147 unsigned Opc; 3148 3149 if (IsTyped) { 3150 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3151 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3152 } else if (IsFormat) { 3153 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3154 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3155 } else { 3156 switch (MemSize) { 3157 case 1: 3158 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3159 break; 3160 case 2: 3161 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3162 break; 3163 default: 3164 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3165 break; 3166 } 3167 } 3168 3169 Register LoadDstReg; 3170 3171 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3172 LLT UnpackedTy = Ty.changeElementSize(32); 3173 3174 if (IsExtLoad) 3175 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3176 else if (Unpacked && IsD16 && Ty.isVector()) 3177 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3178 else 3179 LoadDstReg = Dst; 3180 3181 if (!VIndex) 3182 VIndex = B.buildConstant(S32, 0).getReg(0); 3183 3184 auto MIB = B.buildInstr(Opc) 3185 .addDef(LoadDstReg) // vdata 3186 .addUse(RSrc) // rsrc 3187 .addUse(VIndex) // vindex 3188 .addUse(VOffset) // voffset 3189 .addUse(SOffset) // soffset 3190 .addImm(ImmOffset); // offset(imm) 3191 3192 if (IsTyped) 3193 MIB.addImm(Format); 3194 3195 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3196 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3197 .addMemOperand(MMO); 3198 3199 if (LoadDstReg != Dst) { 3200 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3201 3202 // Widen result for extending loads was widened. 
3203 if (IsExtLoad) 3204 B.buildTrunc(Dst, LoadDstReg); 3205 else { 3206 // Repack to original 16-bit vector result 3207 // FIXME: G_TRUNC should work, but legalization currently fails 3208 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3209 SmallVector<Register, 4> Repack; 3210 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3211 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3212 B.buildMerge(Dst, Repack); 3213 } 3214 } 3215 3216 MI.eraseFromParent(); 3217 return true; 3218 } 3219 3220 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3221 MachineIRBuilder &B, 3222 bool IsInc) const { 3223 B.setInstr(MI); 3224 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3225 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3226 B.buildInstr(Opc) 3227 .addDef(MI.getOperand(0).getReg()) 3228 .addUse(MI.getOperand(2).getReg()) 3229 .addUse(MI.getOperand(3).getReg()) 3230 .cloneMemRefs(MI); 3231 MI.eraseFromParent(); 3232 return true; 3233 } 3234 3235 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3236 switch (IntrID) { 3237 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3238 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3239 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3240 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3241 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3242 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3243 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3244 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3245 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3246 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3247 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3248 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3249 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3250 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3251 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3252 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3253 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3254 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3255 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3256 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3257 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3258 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3259 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3260 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3261 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3262 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3263 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3264 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3265 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3266 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3267 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3268 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3270 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3271 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3273 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3274 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3275 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3276 default: 3277 llvm_unreachable("unhandled atomic opcode"); 3278 } 3279 } 3280 3281 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3282 MachineIRBuilder &B, 3283 Intrinsic::ID IID) const { 3284 B.setInstr(MI); 3285 3286 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3287 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3288 3289 Register Dst = MI.getOperand(0).getReg(); 3290 Register VData = 
MI.getOperand(2).getReg();
3291
3292 Register CmpVal;
3293 int OpOffset = 0;
3294
3295 if (IsCmpSwap) {
3296 CmpVal = MI.getOperand(3 + OpOffset).getReg();
3297 ++OpOffset;
3298 }
3299
3300 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3301 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3302
3303 // The struct intrinsic variants add one additional operand over raw.
3304 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3305 Register VIndex;
3306 if (HasVIndex) {
3307 VIndex = MI.getOperand(4 + OpOffset).getReg();
3308 ++OpOffset;
3309 }
3310
3311 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3312 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3313 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3314
3315 MachineMemOperand *MMO = *MI.memoperands_begin();
3316
3317 unsigned ImmOffset;
3318 unsigned TotalOffset;
3319 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3320 if (TotalOffset != 0)
3321 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3322
3323 if (!VIndex)
3324 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3325
3326 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3327 .addDef(Dst)
3328 .addUse(VData); // vdata
3329
3330 if (IsCmpSwap)
3331 MIB.addReg(CmpVal);
3332
3333 MIB.addUse(RSrc) // rsrc
3334 .addUse(VIndex) // vindex
3335 .addUse(VOffset) // voffset
3336 .addUse(SOffset) // soffset
3337 .addImm(ImmOffset) // offset(imm)
3338 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3339 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3340 .addMemOperand(MMO);
3341
3342 MI.eraseFromParent();
3343 return true;
3344 }
3345
3346 // Produce a vector of s16 elements from s32 pieces.
3347 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3348 ArrayRef<Register> UnmergeParts) {
3349 const LLT S16 = LLT::scalar(16);
3350
3351 SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3352 for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3353 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3354
3355 B.buildBuildVector(DstReg, RemergeParts);
3356 }
3357
3358 /// Convert a set of s32 registers to a result vector with s16 elements.
3359 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3360 ArrayRef<Register> UnmergeParts) {
3361 MachineRegisterInfo &MRI = *B.getMRI();
3362 const LLT V2S16 = LLT::vector(2, 16);
3363 LLT TargetTy = MRI.getType(DstReg);
3364 int NumElts = UnmergeParts.size();
3365
3366 if (NumElts == 1) {
3367 assert(TargetTy == V2S16);
3368 B.buildBitcast(DstReg, UnmergeParts[0]);
3369 return;
3370 }
3371
3372 SmallVector<Register, 4> RemergeParts(NumElts);
3373 for (int I = 0; I != NumElts; ++I)
3374 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3375
3376 if (TargetTy.getSizeInBits() == 32u * NumElts) {
3377 B.buildConcatVectors(DstReg, RemergeParts);
3378 return;
3379 }
3380
3381 const LLT V3S16 = LLT::vector(3, 16);
3382 const LLT V6S16 = LLT::vector(6, 16);
3383
3384 // Widen to v6s16 and unpack v3 parts.
3385 assert(TargetTy == V3S16);
3386
3387 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3388 auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3389 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3390 }
3391
3392 // FIXME: Just vector trunc should be sufficient, but legalization currently
3393 // broken.
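// Sketch of the repack done below: unmerge the widened result into s32
// pieces, truncate each piece to s16, and rebuild the original 16-bit vector,
// e.g. <2 x s32> -> two s16 values -> <2 x s16>.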

// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
                                  Register WideDstReg) {
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  auto Unmerge = B.buildUnmerge(S32, WideDstReg);

  int NumOps = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> RemergeParts(NumOps);
  for (int I = 0; I != NumOps; ++I)
    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  bool IsTFE = MI.getNumExplicitDefs() == 2;

  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Need to handle a16 images too
  // TODO: Do we need to guard against already legalized intrinsics?
  if (!IsTFE && !ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
    return true;

  B.setInstr(MI);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) { // No TFE for stores?
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (IsTFE) {
    // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
    Register Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.

    // The raw dword aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
    LLT RoundedTy;
    LLT TFETy;

    if (IsD16 && ST.hasUnpackedD16VMem()) {
      RoundedTy = LLT::scalarOrVector(NumElts, 32);
      TFETy = LLT::vector(NumElts + 1, 32);
    } else {
      unsigned EltSize = Ty.getScalarSizeInBits();
      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
      unsigned RoundedSize = 32 * RoundedElts;
      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    }
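
    // Worked examples of the rounding above (illustrative only):
    //   unpacked d16, <2 x s16> result: RoundedTy = <2 x s32>, TFETy = <3 x s32>
    //   packed d16,   <2 x s16> result: RoundedTy = <2 x s16>, TFETy = <2 x s32>
    //   packed d16,   <3 x s16> result: RoundedTy = <4 x s16>, TFETy = <3 x s32>
    //   s32 result:                     RoundedTy = s32,       TFETy = <2 x s32>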

    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
    Observer.changingInstr(MI);

    MI.getOperand(0).setReg(TFEReg);
    MI.RemoveOperand(1);

    Observer.changedInstr(MI);

    // Insert after the instruction.
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
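  // For example (illustrative), an s96 result is widened to s128 here, and a
  // <3 x s32> result gains a fourth element to become <4 x s32>, via the
  // getPow2ScalarType / getPow2VectorType helpers.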
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert a
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert a debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating and branch pseudos.
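  // Informal summary of the control-flow cases below: each of these intrinsics
  // defines an s1 value that is consumed by a following G_BRCOND, optionally
  // followed by an unconditional G_BR. verifyCFIntrinsic locates that branch
  // pair, and the intrinsic together with the branch is folded into a single
  // SI_IF, SI_ELSE, or SI_LOOP pseudo that both manipulates exec and branches.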
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}