//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
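
// Editorial note (illustrative, not part of the original source): as an
// example of the mutation above, fewerEltsToSize64Vector maps a <5 x s16>
// operand (80 bits, i.e. two 64-bit pieces) to <3 x s16>, and a <4 x s32>
// operand (128 bits) to <2 x s32>.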
// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32- or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 426 .legalFor({S32, S16, V2S16}) 427 .clampScalar(0, S16, S32) 428 .clampMaxNumElements(0, S16, 2) 429 .scalarize(0) 430 .widenScalarToNextPow2(0, 32); 431 } else if (ST.has16BitInsts()) { 432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 433 .legalFor({S32, S16}) 434 .clampScalar(0, S16, S32) 435 .scalarize(0) 436 .widenScalarToNextPow2(0, 32); 437 } else { 438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 439 .legalFor({S32}) 440 .clampScalar(0, S32, S32) 441 .scalarize(0); 442 } 443 444 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 445 .customFor({S32, S64}) 446 .clampScalar(0, S32, S64) 447 .widenScalarToNextPow2(0, 32) 448 .scalarize(0); 449 450 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 451 .legalFor({S32}) 452 .clampScalar(0, S32, S32) 453 .scalarize(0); 454 455 // Report legal for any types we can handle anywhere. For the cases only legal 456 // on the SALU, RegBankSelect will be able to re-legalize. 457 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 458 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 459 .clampScalar(0, S32, S64) 460 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 461 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 462 .widenScalarToNextPow2(0) 463 .scalarize(0); 464 465 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 466 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 467 .legalFor({{S32, S1}, {S32, S32}}) 468 .minScalar(0, S32) 469 // TODO: .scalarize(0) 470 .lower(); 471 472 getActionDefinitionsBuilder(G_BITCAST) 473 // Don't worry about the size constraint. 
474 .legalIf(all(isRegisterType(0), isRegisterType(1))) 475 .lower(); 476 477 478 getActionDefinitionsBuilder(G_CONSTANT) 479 .legalFor({S1, S32, S64, S16, GlobalPtr, 480 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 481 .clampScalar(0, S32, S64) 482 .widenScalarToNextPow2(0) 483 .legalIf(isPointer(0)); 484 485 getActionDefinitionsBuilder(G_FCONSTANT) 486 .legalFor({S32, S64, S16}) 487 .clampScalar(0, S16, S64); 488 489 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 490 .legalIf(isRegisterType(0)) 491 // s1 and s16 are special cases because they have legal operations on 492 // them, but don't really occupy registers in the normal way. 493 .legalFor({S1, S16}) 494 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 495 .clampScalarOrElt(0, S32, MaxScalar) 496 .widenScalarToNextPow2(0, 32) 497 .clampMaxNumElements(0, S32, 16); 498 499 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 500 501 // If the amount is divergent, we have to do a wave reduction to get the 502 // maximum value, so this is expanded during RegBankSelect. 503 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 504 .legalFor({{PrivatePtr, S32}}); 505 506 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 507 .unsupportedFor({PrivatePtr}) 508 .custom(); 509 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 510 511 auto &FPOpActions = getActionDefinitionsBuilder( 512 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 513 .legalFor({S32, S64}); 514 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 515 .customFor({S32, S64}); 516 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 517 .customFor({S32, S64}); 518 519 if (ST.has16BitInsts()) { 520 if (ST.hasVOP3PInsts()) 521 FPOpActions.legalFor({S16, V2S16}); 522 else 523 FPOpActions.legalFor({S16}); 524 525 TrigActions.customFor({S16}); 526 FDIVActions.customFor({S16}); 527 } 528 529 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 530 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 531 532 if (ST.hasVOP3PInsts()) { 533 MinNumMaxNum.customFor(FPTypesPK16) 534 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 535 .clampMaxNumElements(0, S16, 2) 536 .clampScalar(0, S16, S64) 537 .scalarize(0); 538 } else if (ST.has16BitInsts()) { 539 MinNumMaxNum.customFor(FPTypes16) 540 .clampScalar(0, S16, S64) 541 .scalarize(0); 542 } else { 543 MinNumMaxNum.customFor(FPTypesBase) 544 .clampScalar(0, S32, S64) 545 .scalarize(0); 546 } 547 548 if (ST.hasVOP3PInsts()) 549 FPOpActions.clampMaxNumElements(0, S16, 2); 550 551 FPOpActions 552 .scalarize(0) 553 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 554 555 TrigActions 556 .scalarize(0) 557 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 558 559 FDIVActions 560 .scalarize(0) 561 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 562 563 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 564 .legalFor(FPTypesPK16) 565 .clampMaxNumElements(0, S16, 2) 566 .scalarize(0) 567 .clampScalar(0, S16, S64); 568 569 if (ST.has16BitInsts()) { 570 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 571 .legalFor({S32, S64, S16}) 572 .scalarize(0) 573 .clampScalar(0, S16, S64); 574 } else { 575 getActionDefinitionsBuilder(G_FSQRT) 576 .legalFor({S32, S64}) 577 .scalarize(0) 578 .clampScalar(0, S32, S64); 579 580 if (ST.hasFractBug()) { 581 getActionDefinitionsBuilder(G_FFLOOR) 582 .customFor({S64}) 583 .legalFor({S32, S64}) 584 .scalarize(0) 585 .clampScalar(0, S32, S64); 586 } else { 587 getActionDefinitionsBuilder(G_FFLOOR) 588 .legalFor({S32, S64}) 589 .scalarize(0) 590 .clampScalar(0, S32, S64); 591 } 592 } 593 594 getActionDefinitionsBuilder(G_FPTRUNC) 595 .legalFor({{S32, S64}, {S16, S32}}) 596 .scalarize(0) 597 .lower(); 598 599 getActionDefinitionsBuilder(G_FPEXT) 600 .legalFor({{S64, S32}, {S32, S16}}) 601 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 602 .scalarize(0); 603 604 getActionDefinitionsBuilder(G_FSUB) 605 // Use actual fsub instruction 606 .legalFor({S32}) 607 // Must use fadd + fneg 608 .lowerFor({S64, S16, V2S16}) 609 .scalarize(0) 610 .clampScalar(0, S32, S64); 611 612 // Whether this is legal depends on the floating point mode for the function. 613 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 614 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 615 FMad.customFor({S32, S16}); 616 else if (ST.hasMadMacF32Insts()) 617 FMad.customFor({S32}); 618 else if (ST.hasMadF16()) 619 FMad.customFor({S16}); 620 FMad.scalarize(0) 621 .lower(); 622 623 // TODO: Do we need to clamp maximum bitwidth? 624 getActionDefinitionsBuilder(G_TRUNC) 625 .legalIf(isScalar(0)) 626 .legalFor({{V2S16, V2S32}}) 627 .clampMaxNumElements(0, S16, 2) 628 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 629 // situations (like an invalid implicit use), we don't want to infinite loop 630 // in the legalizer. 631 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 632 .alwaysLegal(); 633 634 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 635 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 636 {S32, S1}, {S64, S1}, {S16, S1}}) 637 .scalarize(0) 638 .clampScalar(0, S32, S64) 639 .widenScalarToNextPow2(1, 32); 640 641 // TODO: Split s1->s64 during regbankselect for VALU. 
642 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 643 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 644 .lowerFor({{S32, S64}}) 645 .lowerIf(typeIs(1, S1)) 646 .customFor({{S64, S64}}); 647 if (ST.has16BitInsts()) 648 IToFP.legalFor({{S16, S16}}); 649 IToFP.clampScalar(1, S32, S64) 650 .minScalar(0, S32) 651 .scalarize(0) 652 .widenScalarToNextPow2(1); 653 654 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 655 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 656 .customFor({{S64, S64}}) 657 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 658 if (ST.has16BitInsts()) 659 FPToI.legalFor({{S16, S16}}); 660 else 661 FPToI.minScalar(1, S32); 662 663 FPToI.minScalar(0, S32) 664 .scalarize(0) 665 .lower(); 666 667 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 668 .scalarize(0) 669 .lower(); 670 671 if (ST.has16BitInsts()) { 672 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 673 .legalFor({S16, S32, S64}) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 677 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 678 .legalFor({S32, S64}) 679 .clampScalar(0, S32, S64) 680 .scalarize(0); 681 } else { 682 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 683 .legalFor({S32}) 684 .customFor({S64}) 685 .clampScalar(0, S32, S64) 686 .scalarize(0); 687 } 688 689 // FIXME: Clamp offset operand. 690 getActionDefinitionsBuilder(G_PTR_ADD) 691 .legalIf(isPointer(0)) 692 .scalarize(0); 693 694 getActionDefinitionsBuilder(G_PTRMASK) 695 .legalIf(typeInSet(1, {S64, S32})) 696 .minScalar(1, S32) 697 .maxScalarIf(sizeIs(0, 32), 1, S32) 698 .maxScalarIf(sizeIs(0, 64), 1, S64) 699 .scalarize(0); 700 701 auto &CmpBuilder = 702 getActionDefinitionsBuilder(G_ICMP) 703 // The compare output type differs based on the register bank of the output, 704 // so make both s1 and s32 legal. 705 // 706 // Scalar compares producing output in scc will be promoted to s32, as that 707 // is the allocatable register type that will be needed for the copy from 708 // scc. This will be promoted during RegBankSelect, and we assume something 709 // before that won't try to use s32 result types. 710 // 711 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 712 // bank. 713 .legalForCartesianProduct( 714 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 715 .legalForCartesianProduct( 716 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 717 if (ST.has16BitInsts()) { 718 CmpBuilder.legalFor({{S1, S16}}); 719 } 720 721 CmpBuilder 722 .widenScalarToNextPow2(1) 723 .clampScalar(1, S32, S64) 724 .scalarize(0) 725 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 726 727 getActionDefinitionsBuilder(G_FCMP) 728 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 729 .widenScalarToNextPow2(1) 730 .clampScalar(1, S32, S64) 731 .scalarize(0); 732 733 // FIXME: fpow has a selection pattern that should move to custom lowering. 
734 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 735 if (ST.has16BitInsts()) 736 Exp2Ops.legalFor({S32, S16}); 737 else 738 Exp2Ops.legalFor({S32}); 739 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 740 Exp2Ops.scalarize(0); 741 742 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 743 if (ST.has16BitInsts()) 744 ExpOps.customFor({{S32}, {S16}}); 745 else 746 ExpOps.customFor({S32}); 747 ExpOps.clampScalar(0, MinScalarFPTy, S32) 748 .scalarize(0); 749 750 // The 64-bit versions produce 32-bit results, but only on the SALU. 751 getActionDefinitionsBuilder(G_CTPOP) 752 .legalFor({{S32, S32}, {S32, S64}}) 753 .clampScalar(0, S32, S32) 754 .clampScalar(1, S32, S64) 755 .scalarize(0) 756 .widenScalarToNextPow2(0, 32) 757 .widenScalarToNextPow2(1, 32); 758 759 // The hardware instructions return a different result on 0 than the generic 760 // instructions expect. The hardware produces -1, but these produce the 761 // bitwidth. 762 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 763 .scalarize(0) 764 .clampScalar(0, S32, S32) 765 .clampScalar(1, S32, S64) 766 .widenScalarToNextPow2(0, 32) 767 .widenScalarToNextPow2(1, 32) 768 .lower(); 769 770 // The 64-bit versions produce 32-bit results, but only on the SALU. 771 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 772 .legalFor({{S32, S32}, {S32, S64}}) 773 .clampScalar(0, S32, S32) 774 .clampScalar(1, S32, S64) 775 .scalarize(0) 776 .widenScalarToNextPow2(0, 32) 777 .widenScalarToNextPow2(1, 32); 778 779 getActionDefinitionsBuilder(G_BITREVERSE) 780 .legalFor({S32}) 781 .clampScalar(0, S32, S32) 782 .scalarize(0); 783 784 if (ST.has16BitInsts()) { 785 getActionDefinitionsBuilder(G_BSWAP) 786 .legalFor({S16, S32, V2S16}) 787 .clampMaxNumElements(0, S16, 2) 788 // FIXME: Fixing non-power-of-2 before clamp is workaround for 789 // narrowScalar limitation. 790 .widenScalarToNextPow2(0) 791 .clampScalar(0, S16, S32) 792 .scalarize(0); 793 794 if (ST.hasVOP3PInsts()) { 795 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 796 .legalFor({S32, S16, V2S16}) 797 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 798 .clampMaxNumElements(0, S16, 2) 799 .minScalar(0, S16) 800 .widenScalarToNextPow2(0) 801 .scalarize(0) 802 .lower(); 803 } else { 804 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 805 .legalFor({S32, S16}) 806 .widenScalarToNextPow2(0) 807 .minScalar(0, S16) 808 .scalarize(0) 809 .lower(); 810 } 811 } else { 812 // TODO: Should have same legality without v_perm_b32 813 getActionDefinitionsBuilder(G_BSWAP) 814 .legalFor({S32}) 815 .lowerIf(scalarNarrowerThan(0, 32)) 816 // FIXME: Fixing non-power-of-2 before clamp is workaround for 817 // narrowScalar limitation. 
818 .widenScalarToNextPow2(0) 819 .maxScalar(0, S32) 820 .scalarize(0) 821 .lower(); 822 823 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 824 .legalFor({S32}) 825 .minScalar(0, S32) 826 .widenScalarToNextPow2(0) 827 .scalarize(0) 828 .lower(); 829 } 830 831 getActionDefinitionsBuilder(G_INTTOPTR) 832 // List the common cases 833 .legalForCartesianProduct(AddrSpaces64, {S64}) 834 .legalForCartesianProduct(AddrSpaces32, {S32}) 835 .scalarize(0) 836 // Accept any address space as long as the size matches 837 .legalIf(sameSize(0, 1)) 838 .widenScalarIf(smallerThan(1, 0), 839 [](const LegalityQuery &Query) { 840 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 841 }) 842 .narrowScalarIf(largerThan(1, 0), 843 [](const LegalityQuery &Query) { 844 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 845 }); 846 847 getActionDefinitionsBuilder(G_PTRTOINT) 848 // List the common cases 849 .legalForCartesianProduct(AddrSpaces64, {S64}) 850 .legalForCartesianProduct(AddrSpaces32, {S32}) 851 .scalarize(0) 852 // Accept any address space as long as the size matches 853 .legalIf(sameSize(0, 1)) 854 .widenScalarIf(smallerThan(0, 1), 855 [](const LegalityQuery &Query) { 856 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 857 }) 858 .narrowScalarIf( 859 largerThan(0, 1), 860 [](const LegalityQuery &Query) { 861 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 862 }); 863 864 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 865 .scalarize(0) 866 .custom(); 867 868 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 869 bool IsLoad) -> bool { 870 const LLT DstTy = Query.Types[0]; 871 872 // Split vector extloads. 873 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 874 unsigned Align = Query.MMODescrs[0].AlignInBits; 875 876 if (MemSize < DstTy.getSizeInBits()) 877 MemSize = std::max(MemSize, Align); 878 879 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 880 return true; 881 882 const LLT PtrTy = Query.Types[1]; 883 unsigned AS = PtrTy.getAddressSpace(); 884 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 885 return true; 886 887 // Catch weird sized loads that don't evenly divide into the access sizes 888 // TODO: May be able to widen depending on alignment etc. 889 unsigned NumRegs = (MemSize + 31) / 32; 890 if (NumRegs == 3) { 891 if (!ST.hasDwordx3LoadStores()) 892 return true; 893 } else { 894 // If the alignment allows, these should have been widened. 895 if (!isPowerOf2_32(NumRegs)) 896 return true; 897 } 898 899 if (Align < MemSize) { 900 const SITargetLowering *TLI = ST.getTargetLowering(); 901 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 902 } 903 904 return false; 905 }; 906 907 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 908 unsigned Opc) -> bool { 909 unsigned Size = Query.Types[0].getSizeInBits(); 910 if (isPowerOf2_32(Size)) 911 return false; 912 913 if (Size == 96 && ST.hasDwordx3LoadStores()) 914 return false; 915 916 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 917 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 918 return false; 919 920 unsigned Align = Query.MMODescrs[0].AlignInBits; 921 unsigned RoundedSize = NextPowerOf2(Size); 922 return (Align >= RoundedSize); 923 }; 924 925 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 926 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 927 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 928 929 // TODO: Refine based on subtargets which support unaligned access or 128-bit 930 // LDS 931 // TODO: Unsupported flat for SI. 932 933 for (unsigned Op : {G_LOAD, G_STORE}) { 934 const bool IsStore = Op == G_STORE; 935 936 auto &Actions = getActionDefinitionsBuilder(Op); 937 // Explicitly list some common cases. 938 // TODO: Does this help compile time at all? 939 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 940 {V2S32, GlobalPtr, 64, GlobalAlign32}, 941 {V4S32, GlobalPtr, 128, GlobalAlign32}, 942 {S64, GlobalPtr, 64, GlobalAlign32}, 943 {V2S64, GlobalPtr, 128, GlobalAlign32}, 944 {V2S16, GlobalPtr, 32, GlobalAlign32}, 945 {S32, GlobalPtr, 8, GlobalAlign8}, 946 {S32, GlobalPtr, 16, GlobalAlign16}, 947 948 {S32, LocalPtr, 32, 32}, 949 {S64, LocalPtr, 64, 32}, 950 {V2S32, LocalPtr, 64, 32}, 951 {S32, LocalPtr, 8, 8}, 952 {S32, LocalPtr, 16, 16}, 953 {V2S16, LocalPtr, 32, 32}, 954 955 {S32, PrivatePtr, 32, 32}, 956 {S32, PrivatePtr, 8, 8}, 957 {S32, PrivatePtr, 16, 16}, 958 {V2S16, PrivatePtr, 32, 32}, 959 960 {S32, ConstantPtr, 32, GlobalAlign32}, 961 {V2S32, ConstantPtr, 64, GlobalAlign32}, 962 {V4S32, ConstantPtr, 128, GlobalAlign32}, 963 {S64, ConstantPtr, 64, GlobalAlign32}, 964 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 965 Actions.legalIf( 966 [=](const LegalityQuery &Query) -> bool { 967 return isLoadStoreLegal(ST, Query, Op); 968 }); 969 970 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 971 // 64-bits. 972 // 973 // TODO: Should generalize bitcast action into coerce, which will also cover 974 // inserting addrspacecasts. 975 Actions.customIf(typeIs(1, Constant32Ptr)); 976 977 // Turn any illegal element vectors into something easier to deal 978 // with. These will ultimately produce 32-bit scalar shifts to extract the 979 // parts anyway. 980 // 981 // For odd 16-bit element vectors, prefer to split those into pieces with 982 // 16-bit vector parts. 983 Actions.bitcastIf( 984 [=](const LegalityQuery &Query) -> bool { 985 const LLT Ty = Query.Types[0]; 986 const unsigned Size = Ty.getSizeInBits(); 987 988 if (Size != Query.MMODescrs[0].SizeInBits) 989 return Size <= 32 && Ty.isVector(); 990 991 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 992 return true; 993 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 994 !isRegisterVectorElementType(Ty.getElementType()); 995 }, bitcastToRegisterType(0)); 996 997 Actions 998 .customIf(typeIs(1, Constant32Ptr)) 999 // Widen suitably aligned loads by loading extra elements. 1000 .moreElementsIf([=](const LegalityQuery &Query) { 1001 const LLT Ty = Query.Types[0]; 1002 return Op == G_LOAD && Ty.isVector() && 1003 shouldWidenLoadResult(Query, Op); 1004 }, moreElementsToNextPow2(0)) 1005 .widenScalarIf([=](const LegalityQuery &Query) { 1006 const LLT Ty = Query.Types[0]; 1007 return Op == G_LOAD && !Ty.isVector() && 1008 shouldWidenLoadResult(Query, Op); 1009 }, widenScalarOrEltToNextPow2(0)) 1010 .narrowScalarIf( 1011 [=](const LegalityQuery &Query) -> bool { 1012 return !Query.Types[0].isVector() && 1013 needToSplitMemOp(Query, Op == G_LOAD); 1014 }, 1015 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1016 const LLT DstTy = Query.Types[0]; 1017 const LLT PtrTy = Query.Types[1]; 1018 1019 const unsigned DstSize = DstTy.getSizeInBits(); 1020 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1021 1022 // Split extloads. 
1023 if (DstSize > MemSize) 1024 return std::make_pair(0, LLT::scalar(MemSize)); 1025 1026 if (!isPowerOf2_32(DstSize)) { 1027 // We're probably decomposing an odd sized store. Try to split 1028 // to the widest type. TODO: Account for alignment. As-is it 1029 // should be OK, since the new parts will be further legalized. 1030 unsigned FloorSize = PowerOf2Floor(DstSize); 1031 return std::make_pair(0, LLT::scalar(FloorSize)); 1032 } 1033 1034 if (DstSize > 32 && (DstSize % 32 != 0)) { 1035 // FIXME: Need a way to specify non-extload of larger size if 1036 // suitably aligned. 1037 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1038 } 1039 1040 unsigned MaxSize = maxSizeForAddrSpace(ST, 1041 PtrTy.getAddressSpace(), 1042 Op == G_LOAD); 1043 if (MemSize > MaxSize) 1044 return std::make_pair(0, LLT::scalar(MaxSize)); 1045 1046 unsigned Align = Query.MMODescrs[0].AlignInBits; 1047 return std::make_pair(0, LLT::scalar(Align)); 1048 }) 1049 .fewerElementsIf( 1050 [=](const LegalityQuery &Query) -> bool { 1051 return Query.Types[0].isVector() && 1052 needToSplitMemOp(Query, Op == G_LOAD); 1053 }, 1054 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1055 const LLT DstTy = Query.Types[0]; 1056 const LLT PtrTy = Query.Types[1]; 1057 1058 LLT EltTy = DstTy.getElementType(); 1059 unsigned MaxSize = maxSizeForAddrSpace(ST, 1060 PtrTy.getAddressSpace(), 1061 Op == G_LOAD); 1062 1063 // FIXME: Handle widened to power of 2 results better. This ends 1064 // up scalarizing. 1065 // FIXME: 3 element stores scalarized on SI 1066 1067 // Split if it's too large for the address space. 1068 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1069 unsigned NumElts = DstTy.getNumElements(); 1070 unsigned EltSize = EltTy.getSizeInBits(); 1071 1072 if (MaxSize % EltSize == 0) { 1073 return std::make_pair( 1074 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1075 } 1076 1077 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1078 1079 // FIXME: Refine when odd breakdowns handled 1080 // The scalars will need to be re-legalized. 1081 if (NumPieces == 1 || NumPieces >= NumElts || 1082 NumElts % NumPieces != 0) 1083 return std::make_pair(0, EltTy); 1084 1085 return std::make_pair(0, 1086 LLT::vector(NumElts / NumPieces, EltTy)); 1087 } 1088 1089 // FIXME: We could probably handle weird extending loads better. 1090 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1091 if (DstTy.getSizeInBits() > MemSize) 1092 return std::make_pair(0, EltTy); 1093 1094 unsigned EltSize = EltTy.getSizeInBits(); 1095 unsigned DstSize = DstTy.getSizeInBits(); 1096 if (!isPowerOf2_32(DstSize)) { 1097 // We're probably decomposing an odd sized store. Try to split 1098 // to the widest type. TODO: Account for alignment. As-is it 1099 // should be OK, since the new parts will be further legalized. 1100 unsigned FloorSize = PowerOf2Floor(DstSize); 1101 return std::make_pair( 1102 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1103 } 1104 1105 // Need to split because of alignment. 1106 unsigned Align = Query.MMODescrs[0].AlignInBits; 1107 if (EltSize > Align && 1108 (EltSize / Align < DstTy.getNumElements())) { 1109 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1110 } 1111 1112 // May need relegalization for the scalars. 1113 return std::make_pair(0, EltTy); 1114 }) 1115 .minScalar(0, S32); 1116 1117 if (IsStore) 1118 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1119 1120 // TODO: Need a bitcast lower option? 
1121 Actions 1122 .widenScalarToNextPow2(0) 1123 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1124 } 1125 1126 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1127 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1128 {S32, GlobalPtr, 16, 2 * 8}, 1129 {S32, LocalPtr, 8, 8}, 1130 {S32, LocalPtr, 16, 16}, 1131 {S32, PrivatePtr, 8, 8}, 1132 {S32, PrivatePtr, 16, 16}, 1133 {S32, ConstantPtr, 8, 8}, 1134 {S32, ConstantPtr, 16, 2 * 8}}); 1135 if (ST.hasFlatAddressSpace()) { 1136 ExtLoads.legalForTypesWithMemDesc( 1137 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1138 } 1139 1140 ExtLoads.clampScalar(0, S32, S32) 1141 .widenScalarToNextPow2(0) 1142 .unsupportedIfMemSizeNotPow2() 1143 .lower(); 1144 1145 auto &Atomics = getActionDefinitionsBuilder( 1146 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1147 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1148 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1149 G_ATOMICRMW_UMIN}) 1150 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1151 {S64, GlobalPtr}, {S64, LocalPtr}}); 1152 if (ST.hasFlatAddressSpace()) { 1153 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1154 } 1155 1156 if (ST.hasLDSFPAtomics()) { 1157 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1158 .legalFor({{S32, LocalPtr}}); 1159 } 1160 1161 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1162 // demarshalling 1163 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1164 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1165 {S32, FlatPtr}, {S64, FlatPtr}}) 1166 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1167 {S32, RegionPtr}, {S64, RegionPtr}}); 1168 // TODO: Pointer types, any 32-bit or 64-bit vector 1169 1170 // Condition should be s32 for scalar, s1 for vector. 1171 getActionDefinitionsBuilder(G_SELECT) 1172 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1173 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1174 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1175 .clampScalar(0, S16, S64) 1176 .scalarize(1) 1177 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1178 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1179 .clampMaxNumElements(0, S32, 2) 1180 .clampMaxNumElements(0, LocalPtr, 2) 1181 .clampMaxNumElements(0, PrivatePtr, 2) 1182 .scalarize(0) 1183 .widenScalarToNextPow2(0) 1184 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1185 1186 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1187 // be more flexible with the shift amount type. 1188 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1189 .legalFor({{S32, S32}, {S64, S32}}); 1190 if (ST.has16BitInsts()) { 1191 if (ST.hasVOP3PInsts()) { 1192 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1193 .clampMaxNumElements(0, S16, 2); 1194 } else 1195 Shifts.legalFor({{S16, S16}}); 1196 1197 // TODO: Support 16-bit shift amounts for all types 1198 Shifts.widenScalarIf( 1199 [=](const LegalityQuery &Query) { 1200 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1201 // 32-bit amount. 
1202 const LLT ValTy = Query.Types[0]; 1203 const LLT AmountTy = Query.Types[1]; 1204 return ValTy.getSizeInBits() <= 16 && 1205 AmountTy.getSizeInBits() < 16; 1206 }, changeTo(1, S16)); 1207 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1208 Shifts.clampScalar(1, S32, S32); 1209 Shifts.clampScalar(0, S16, S64); 1210 Shifts.widenScalarToNextPow2(0, 16); 1211 } else { 1212 // Make sure we legalize the shift amount type first, as the general 1213 // expansion for the shifted type will produce much worse code if it hasn't 1214 // been truncated already. 1215 Shifts.clampScalar(1, S32, S32); 1216 Shifts.clampScalar(0, S32, S64); 1217 Shifts.widenScalarToNextPow2(0, 32); 1218 } 1219 Shifts.scalarize(0); 1220 1221 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1222 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1223 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1224 unsigned IdxTypeIdx = 2; 1225 1226 getActionDefinitionsBuilder(Op) 1227 .customIf([=](const LegalityQuery &Query) { 1228 const LLT EltTy = Query.Types[EltTypeIdx]; 1229 const LLT VecTy = Query.Types[VecTypeIdx]; 1230 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1231 return (EltTy.getSizeInBits() == 16 || 1232 EltTy.getSizeInBits() % 32 == 0) && 1233 VecTy.getSizeInBits() % 32 == 0 && 1234 VecTy.getSizeInBits() <= MaxRegisterSize && 1235 IdxTy.getSizeInBits() == 32; 1236 }) 1237 .clampScalar(EltTypeIdx, S32, S64) 1238 .clampScalar(VecTypeIdx, S32, S64) 1239 .clampScalar(IdxTypeIdx, S32, S32); 1240 } 1241 1242 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1243 .unsupportedIf([=](const LegalityQuery &Query) { 1244 const LLT &EltTy = Query.Types[1].getElementType(); 1245 return Query.Types[0] != EltTy; 1246 }); 1247 1248 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1249 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1250 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1251 1252 // FIXME: Doesn't handle extract of illegal sizes. 1253 getActionDefinitionsBuilder(Op) 1254 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1255 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };
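
    // Editorial note (illustrative, not part of the original source):
    // notValidElt flags vector types whose elements do not map cleanly onto
    // register pieces, e.g. a vector of s24 elements (not a power of 2) or of
    // s4 elements (narrower than 8 bits); <2 x s32> and <4 x s16> pass the
    // check.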
    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= MaxRegisterSize;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
  // available, and is selectively legal for s16, s32, v2s16.
1433 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) 1434 .scalarize(0) 1435 .clampScalar(0, S16, S32); 1436 1437 SextInReg 1438 .scalarize(0) 1439 .clampScalar(0, S32, S64) 1440 .lower(); 1441 1442 getActionDefinitionsBuilder(G_FSHR) 1443 .legalFor({{S32, S32}}) 1444 .scalarize(0) 1445 .lower(); 1446 1447 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1448 .legalFor({S64}); 1449 1450 getActionDefinitionsBuilder({ 1451 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1452 G_FCOPYSIGN, 1453 1454 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1455 G_READ_REGISTER, 1456 G_WRITE_REGISTER, 1457 1458 G_SADDO, G_SSUBO, 1459 1460 // TODO: Implement 1461 G_FMINIMUM, G_FMAXIMUM, 1462 G_FSHL 1463 }).lower(); 1464 1465 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1466 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1467 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1468 .unsupported(); 1469 1470 computeTables(); 1471 verify(*ST.getInstrInfo()); 1472 } 1473 1474 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1475 MachineInstr &MI) const { 1476 MachineIRBuilder &B = Helper.MIRBuilder; 1477 MachineRegisterInfo &MRI = *B.getMRI(); 1478 GISelChangeObserver &Observer = Helper.Observer; 1479 1480 switch (MI.getOpcode()) { 1481 case TargetOpcode::G_ADDRSPACE_CAST: 1482 return legalizeAddrSpaceCast(MI, MRI, B); 1483 case TargetOpcode::G_FRINT: 1484 return legalizeFrint(MI, MRI, B); 1485 case TargetOpcode::G_FCEIL: 1486 return legalizeFceil(MI, MRI, B); 1487 case TargetOpcode::G_INTRINSIC_TRUNC: 1488 return legalizeIntrinsicTrunc(MI, MRI, B); 1489 case TargetOpcode::G_SITOFP: 1490 return legalizeITOFP(MI, MRI, B, true); 1491 case TargetOpcode::G_UITOFP: 1492 return legalizeITOFP(MI, MRI, B, false); 1493 case TargetOpcode::G_FPTOSI: 1494 return legalizeFPTOI(MI, MRI, B, true); 1495 case TargetOpcode::G_FPTOUI: 1496 return legalizeFPTOI(MI, MRI, B, false); 1497 case TargetOpcode::G_FMINNUM: 1498 case TargetOpcode::G_FMAXNUM: 1499 case TargetOpcode::G_FMINNUM_IEEE: 1500 case TargetOpcode::G_FMAXNUM_IEEE: 1501 return legalizeMinNumMaxNum(Helper, MI); 1502 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1503 return legalizeExtractVectorElt(MI, MRI, B); 1504 case TargetOpcode::G_INSERT_VECTOR_ELT: 1505 return legalizeInsertVectorElt(MI, MRI, B); 1506 case TargetOpcode::G_SHUFFLE_VECTOR: 1507 return legalizeShuffleVector(MI, MRI, B); 1508 case TargetOpcode::G_FSIN: 1509 case TargetOpcode::G_FCOS: 1510 return legalizeSinCos(MI, MRI, B); 1511 case TargetOpcode::G_GLOBAL_VALUE: 1512 return legalizeGlobalValue(MI, MRI, B); 1513 case TargetOpcode::G_LOAD: 1514 return legalizeLoad(MI, MRI, B, Observer); 1515 case TargetOpcode::G_FMAD: 1516 return legalizeFMad(MI, MRI, B); 1517 case TargetOpcode::G_FDIV: 1518 return legalizeFDIV(MI, MRI, B); 1519 case TargetOpcode::G_UDIV: 1520 case TargetOpcode::G_UREM: 1521 return legalizeUDIV_UREM(MI, MRI, B); 1522 case TargetOpcode::G_SDIV: 1523 case TargetOpcode::G_SREM: 1524 return legalizeSDIV_SREM(MI, MRI, B); 1525 case TargetOpcode::G_ATOMIC_CMPXCHG: 1526 return legalizeAtomicCmpXChg(MI, MRI, B); 1527 case TargetOpcode::G_FLOG: 1528 return legalizeFlog(MI, B, numbers::ln2f); 1529 case TargetOpcode::G_FLOG10: 1530 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1531 case TargetOpcode::G_FEXP: 1532 return legalizeFExp(MI, B); 1533 case TargetOpcode::G_FPOW: 1534 return legalizeFPow(MI, B); 1535 case TargetOpcode::G_FFLOOR: 1536 return legalizeFFloor(MI, MRI, B); 1537 case TargetOpcode::G_BUILD_VECTOR: 1538 return 
legalizeBuildVector(MI, MRI, B); 1539 default: 1540 return false; 1541 } 1542 1543 llvm_unreachable("expected switch to return"); 1544 } 1545 1546 Register AMDGPULegalizerInfo::getSegmentAperture( 1547 unsigned AS, 1548 MachineRegisterInfo &MRI, 1549 MachineIRBuilder &B) const { 1550 MachineFunction &MF = B.getMF(); 1551 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1552 const LLT S32 = LLT::scalar(32); 1553 1554 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1555 1556 if (ST.hasApertureRegs()) { 1557 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1558 // getreg. 1559 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1560 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1561 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1562 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1563 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1564 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1565 unsigned Encoding = 1566 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1567 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1568 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1569 1570 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1571 1572 B.buildInstr(AMDGPU::S_GETREG_B32) 1573 .addDef(GetReg) 1574 .addImm(Encoding); 1575 MRI.setType(GetReg, S32); 1576 1577 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1578 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1579 } 1580 1581 Register QueuePtr = MRI.createGenericVirtualRegister( 1582 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1583 1584 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1585 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1586 return Register(); 1587 1588 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1589 // private_segment_aperture_base_hi. 1590 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1591 1592 // TODO: can we be smarter about machine pointer info? 1593 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1594 MachineMemOperand *MMO = MF.getMachineMemOperand( 1595 PtrInfo, 1596 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1597 MachineMemOperand::MOInvariant, 1598 4, commonAlignment(Align(64), StructOffset)); 1599 1600 Register LoadAddr; 1601 1602 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1603 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1604 } 1605 1606 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1607 MachineInstr &MI, MachineRegisterInfo &MRI, 1608 MachineIRBuilder &B) const { 1609 MachineFunction &MF = B.getMF(); 1610 1611 const LLT S32 = LLT::scalar(32); 1612 Register Dst = MI.getOperand(0).getReg(); 1613 Register Src = MI.getOperand(1).getReg(); 1614 1615 LLT DstTy = MRI.getType(Dst); 1616 LLT SrcTy = MRI.getType(Src); 1617 unsigned DestAS = DstTy.getAddressSpace(); 1618 unsigned SrcAS = SrcTy.getAddressSpace(); 1619 1620 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1621 // vector element. 1622 assert(!DstTy.isVector()); 1623 1624 const AMDGPUTargetMachine &TM 1625 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1626 1627 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1628 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1629 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1630 return true; 1631 } 1632 1633 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1634 // Truncate. 
1635 B.buildExtract(Dst, Src, 0); 1636 MI.eraseFromParent(); 1637 return true; 1638 } 1639 1640 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1641 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1642 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1643 1644 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1645 // another. Merge operands are required to be the same type, but creating an 1646 // extra ptrtoint would be kind of pointless. 1647 auto HighAddr = B.buildConstant( 1648 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1649 B.buildMerge(Dst, {Src, HighAddr}); 1650 MI.eraseFromParent(); 1651 return true; 1652 } 1653 1654 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1655 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1656 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1657 unsigned NullVal = TM.getNullPointerValue(DestAS); 1658 1659 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1660 auto FlatNull = B.buildConstant(SrcTy, 0); 1661 1662 // Extract low 32-bits of the pointer. 1663 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1664 1665 auto CmpRes = 1666 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1667 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1668 1669 MI.eraseFromParent(); 1670 return true; 1671 } 1672 1673 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1674 return false; 1675 1676 if (!ST.hasFlatAddressSpace()) 1677 return false; 1678 1679 auto SegmentNull = 1680 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1681 auto FlatNull = 1682 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1683 1684 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1685 if (!ApertureReg.isValid()) 1686 return false; 1687 1688 auto CmpRes = 1689 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1690 1691 // Coerce the type of the low half of the result so we can use merge_values. 1692 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1693 1694 // TODO: Should we allow mismatched types but matching sizes in merges to 1695 // avoid the ptrtoint? 1696 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1697 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1698 1699 MI.eraseFromParent(); 1700 return true; 1701 } 1702 1703 bool AMDGPULegalizerInfo::legalizeFrint( 1704 MachineInstr &MI, MachineRegisterInfo &MRI, 1705 MachineIRBuilder &B) const { 1706 Register Src = MI.getOperand(1).getReg(); 1707 LLT Ty = MRI.getType(Src); 1708 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1709 1710 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1711 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1712 1713 auto C1 = B.buildFConstant(Ty, C1Val); 1714 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1715 1716 // TODO: Should this propagate fast-math-flags? 
1717 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1718 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1719 1720 auto C2 = B.buildFConstant(Ty, C2Val); 1721 auto Fabs = B.buildFAbs(Ty, Src); 1722 1723 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1724 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1725 return true; 1726 } 1727 1728 bool AMDGPULegalizerInfo::legalizeFceil( 1729 MachineInstr &MI, MachineRegisterInfo &MRI, 1730 MachineIRBuilder &B) const { 1731 1732 const LLT S1 = LLT::scalar(1); 1733 const LLT S64 = LLT::scalar(64); 1734 1735 Register Src = MI.getOperand(1).getReg(); 1736 assert(MRI.getType(Src) == S64); 1737 1738 // result = trunc(src) 1739 // if (src > 0.0 && src != result) 1740 // result += 1.0 1741 1742 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1743 1744 const auto Zero = B.buildFConstant(S64, 0.0); 1745 const auto One = B.buildFConstant(S64, 1.0); 1746 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1747 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1748 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1749 auto Add = B.buildSelect(S64, And, One, Zero); 1750 1751 // TODO: Should this propagate fast-math-flags? 1752 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1753 return true; 1754 } 1755 1756 static MachineInstrBuilder extractF64Exponent(Register Hi, 1757 MachineIRBuilder &B) { 1758 const unsigned FractBits = 52; 1759 const unsigned ExpBits = 11; 1760 LLT S32 = LLT::scalar(32); 1761 1762 auto Const0 = B.buildConstant(S32, FractBits - 32); 1763 auto Const1 = B.buildConstant(S32, ExpBits); 1764 1765 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1766 .addUse(Hi) 1767 .addUse(Const0.getReg(0)) 1768 .addUse(Const1.getReg(0)); 1769 1770 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1771 } 1772 1773 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1774 MachineInstr &MI, MachineRegisterInfo &MRI, 1775 MachineIRBuilder &B) const { 1776 const LLT S1 = LLT::scalar(1); 1777 const LLT S32 = LLT::scalar(32); 1778 const LLT S64 = LLT::scalar(64); 1779 1780 Register Src = MI.getOperand(1).getReg(); 1781 assert(MRI.getType(Src) == S64); 1782 1783 // TODO: Should this use extract since the low half is unused? 1784 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1785 Register Hi = Unmerge.getReg(1); 1786 1787 // Extract the upper half, since this is where we will find the sign and 1788 // exponent. 1789 auto Exp = extractF64Exponent(Hi, B); 1790 1791 const unsigned FractBits = 52; 1792 1793 // Extract the sign bit. 1794 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1795 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1796 1797 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1798 1799 const auto Zero32 = B.buildConstant(S32, 0); 1800 1801 // Extend back to 64-bits. 
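  // Clarifying note: SignBit64 is the signed-zero result used when the
  // unbiased exponent is negative (|Src| < 1.0 truncates to +/-0.0).
  // Otherwise the fraction mask, shifted right by the exponent, covers
  // exactly the fractional bits of Src, and clearing them below yields the
  // truncated value.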
1802 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1803 1804 auto Shr = B.buildAShr(S64, FractMask, Exp); 1805 auto Not = B.buildNot(S64, Shr); 1806 auto Tmp0 = B.buildAnd(S64, Src, Not); 1807 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1808 1809 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1810 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1811 1812 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1813 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1814 MI.eraseFromParent(); 1815 return true; 1816 } 1817 1818 bool AMDGPULegalizerInfo::legalizeITOFP( 1819 MachineInstr &MI, MachineRegisterInfo &MRI, 1820 MachineIRBuilder &B, bool Signed) const { 1821 1822 Register Dst = MI.getOperand(0).getReg(); 1823 Register Src = MI.getOperand(1).getReg(); 1824 1825 const LLT S64 = LLT::scalar(64); 1826 const LLT S32 = LLT::scalar(32); 1827 1828 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1829 1830 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1831 1832 auto CvtHi = Signed ? 1833 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1834 B.buildUITOFP(S64, Unmerge.getReg(1)); 1835 1836 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1837 1838 auto ThirtyTwo = B.buildConstant(S32, 32); 1839 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1840 .addUse(CvtHi.getReg(0)) 1841 .addUse(ThirtyTwo.getReg(0)); 1842 1843 // TODO: Should this propagate fast-math-flags? 1844 B.buildFAdd(Dst, LdExp, CvtLo); 1845 MI.eraseFromParent(); 1846 return true; 1847 } 1848 1849 // TODO: Copied from DAG implementation. Verify logic and document how this 1850 // actually works. 1851 bool AMDGPULegalizerInfo::legalizeFPTOI( 1852 MachineInstr &MI, MachineRegisterInfo &MRI, 1853 MachineIRBuilder &B, bool Signed) const { 1854 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src = MI.getOperand(1).getReg(); 1857 1858 const LLT S64 = LLT::scalar(64); 1859 const LLT S32 = LLT::scalar(32); 1860 1861 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1862 1863 unsigned Flags = MI.getFlags(); 1864 1865 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1866 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1867 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1868 1869 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1870 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1871 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1872 1873 auto Hi = Signed ? 
1874 B.buildFPTOSI(S32, FloorMul) : 1875 B.buildFPTOUI(S32, FloorMul); 1876 auto Lo = B.buildFPTOUI(S32, Fma); 1877 1878 B.buildMerge(Dst, { Lo, Hi }); 1879 MI.eraseFromParent(); 1880 1881 return true; 1882 } 1883 1884 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1885 MachineInstr &MI) const { 1886 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1887 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1888 1889 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1890 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1891 1892 // With ieee_mode disabled, the instructions have the correct behavior 1893 // already for G_FMINNUM/G_FMAXNUM 1894 if (!MFI->getMode().IEEE) 1895 return !IsIEEEOp; 1896 1897 if (IsIEEEOp) 1898 return true; 1899 1900 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1901 } 1902 1903 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1904 MachineInstr &MI, MachineRegisterInfo &MRI, 1905 MachineIRBuilder &B) const { 1906 // TODO: Should move some of this into LegalizerHelper. 1907 1908 // TODO: Promote dynamic indexing of s16 to s32 1909 1910 // FIXME: Artifact combiner probably should have replaced the truncated 1911 // constant before this, so we shouldn't need 1912 // getConstantVRegValWithLookThrough. 1913 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1914 MI.getOperand(2).getReg(), MRI); 1915 if (!IdxVal) // Dynamic case will be selected to register indexing. 1916 return true; 1917 1918 Register Dst = MI.getOperand(0).getReg(); 1919 Register Vec = MI.getOperand(1).getReg(); 1920 1921 LLT VecTy = MRI.getType(Vec); 1922 LLT EltTy = VecTy.getElementType(); 1923 assert(EltTy == MRI.getType(Dst)); 1924 1925 if (IdxVal->Value < VecTy.getNumElements()) 1926 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1927 else 1928 B.buildUndef(Dst); 1929 1930 MI.eraseFromParent(); 1931 return true; 1932 } 1933 1934 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1935 MachineInstr &MI, MachineRegisterInfo &MRI, 1936 MachineIRBuilder &B) const { 1937 // TODO: Should move some of this into LegalizerHelper. 1938 1939 // TODO: Promote dynamic indexing of s16 to s32 1940 1941 // FIXME: Artifact combiner probably should have replaced the truncated 1942 // constant before this, so we shouldn't need 1943 // getConstantVRegValWithLookThrough. 1944 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1945 MI.getOperand(3).getReg(), MRI); 1946 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1947 return true; 1948 1949 Register Dst = MI.getOperand(0).getReg(); 1950 Register Vec = MI.getOperand(1).getReg(); 1951 Register Ins = MI.getOperand(2).getReg(); 1952 1953 LLT VecTy = MRI.getType(Vec); 1954 LLT EltTy = VecTy.getElementType(); 1955 assert(EltTy == MRI.getType(Ins)); 1956 1957 if (IdxVal->Value < VecTy.getNumElements()) 1958 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1959 else 1960 B.buildUndef(Dst); 1961 1962 MI.eraseFromParent(); 1963 return true; 1964 } 1965 1966 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1967 MachineInstr &MI, MachineRegisterInfo &MRI, 1968 MachineIRBuilder &B) const { 1969 const LLT V2S16 = LLT::vector(2, 16); 1970 1971 Register Dst = MI.getOperand(0).getReg(); 1972 Register Src0 = MI.getOperand(1).getReg(); 1973 LLT DstTy = MRI.getType(Dst); 1974 LLT SrcTy = MRI.getType(Src0); 1975 1976 if (SrcTy == V2S16 && DstTy == V2S16 && 1977 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1978 return true; 1979 1980 MachineIRBuilder HelperBuilder(MI); 1981 GISelObserverWrapper DummyObserver; 1982 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1983 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1984 } 1985 1986 bool AMDGPULegalizerInfo::legalizeSinCos( 1987 MachineInstr &MI, MachineRegisterInfo &MRI, 1988 MachineIRBuilder &B) const { 1989 1990 Register DstReg = MI.getOperand(0).getReg(); 1991 Register SrcReg = MI.getOperand(1).getReg(); 1992 LLT Ty = MRI.getType(DstReg); 1993 unsigned Flags = MI.getFlags(); 1994 1995 Register TrigVal; 1996 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1997 if (ST.hasTrigReducedRange()) { 1998 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1999 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2000 .addUse(MulVal.getReg(0)) 2001 .setMIFlags(Flags).getReg(0); 2002 } else 2003 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2004 2005 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2006 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2007 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2008 .addUse(TrigVal) 2009 .setMIFlags(Flags); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2015 MachineIRBuilder &B, 2016 const GlobalValue *GV, 2017 int64_t Offset, 2018 unsigned GAFlags) const { 2019 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2020 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2021 // to the following code sequence: 2022 // 2023 // For constant address space: 2024 // s_getpc_b64 s[0:1] 2025 // s_add_u32 s0, s0, $symbol 2026 // s_addc_u32 s1, s1, 0 2027 // 2028 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2029 // a fixup or relocation is emitted to replace $symbol with a literal 2030 // constant, which is a pc-relative offset from the encoding of the $symbol 2031 // operand to the global variable. 
2032 // 2033 // For global address space: 2034 // s_getpc_b64 s[0:1] 2035 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2036 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2037 // 2038 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2039 // fixups or relocations are emitted to replace $symbol@*@lo and 2040 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2041 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2042 // operand to the global variable. 2043 // 2044 // What we want here is an offset from the value returned by s_getpc 2045 // (which is the address of the s_add_u32 instruction) to the global 2046 // variable, but since the encoding of $symbol starts 4 bytes after the start 2047 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2048 // small. This requires us to add 4 to the global variable offset in order to 2049 // compute the correct address. 2050 2051 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2052 2053 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2054 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2055 2056 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2057 .addDef(PCReg); 2058 2059 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2060 if (GAFlags == SIInstrInfo::MO_NONE) 2061 MIB.addImm(0); 2062 else 2063 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2064 2065 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2066 2067 if (PtrTy.getSizeInBits() == 32) 2068 B.buildExtract(DstReg, PCReg, 0); 2069 return true; 2070 } 2071 2072 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2073 MachineInstr &MI, MachineRegisterInfo &MRI, 2074 MachineIRBuilder &B) const { 2075 Register DstReg = MI.getOperand(0).getReg(); 2076 LLT Ty = MRI.getType(DstReg); 2077 unsigned AS = Ty.getAddressSpace(); 2078 2079 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2080 MachineFunction &MF = B.getMF(); 2081 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2082 2083 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2084 if (!MFI->isEntryFunction()) { 2085 const Function &Fn = MF.getFunction(); 2086 DiagnosticInfoUnsupported BadLDSDecl( 2087 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2088 DS_Warning); 2089 Fn.getContext().diagnose(BadLDSDecl); 2090 2091 // We currently don't have a way to correctly allocate LDS objects that 2092 // aren't directly associated with a kernel. We do force inlining of 2093 // functions that use local objects. However, if these dead functions are 2094 // not eliminated, we don't want a compile time error. Just emit a warning 2095 // and a trap, since there should be no callable path here. 2096 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2097 B.buildUndef(DstReg); 2098 MI.eraseFromParent(); 2099 return true; 2100 } 2101 2102 // TODO: We could emit code to handle the initialization somewhere. 
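    // Clarifying note: an LDS global without a defined initializer is lowered
    // to its offset within the kernel's LDS allocation, either left in place
    // with an absolute-symbol flag or folded to the constant offset returned
    // by allocateLDSGlobal. Globals that do have initializers are diagnosed
    // below, since the initialization is not implemented (see the TODO above).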
2103     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2104       const SITargetLowering *TLI = ST.getTargetLowering();
2105       if (!TLI->shouldUseLDSConstAddress(GV)) {
2106         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2107         return true; // Leave in place.
2108       }
2109
2110       B.buildConstant(
2111           DstReg,
2112           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2113       MI.eraseFromParent();
2114       return true;
2115     }
2116
2117     const Function &Fn = MF.getFunction();
2118     DiagnosticInfoUnsupported BadInit(
2119         Fn, "unsupported initializer for address space", MI.getDebugLoc());
2120     Fn.getContext().diagnose(BadInit);
2121     return true;
2122   }
2123
2124   const SITargetLowering *TLI = ST.getTargetLowering();
2125
2126   if (TLI->shouldEmitFixup(GV)) {
2127     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2128     MI.eraseFromParent();
2129     return true;
2130   }
2131
2132   if (TLI->shouldEmitPCReloc(GV)) {
2133     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2134     MI.eraseFromParent();
2135     return true;
2136   }
2137
2138   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2139   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2140
2141   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2142       MachinePointerInfo::getGOT(MF),
2143       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2144           MachineMemOperand::MOInvariant,
2145       8 /*Size*/, Align(8));
2146
2147   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2148
2149   if (Ty.getSizeInBits() == 32) {
2150     // Truncate if this is a 32-bit constant address.
2151     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2152     B.buildExtract(DstReg, Load, 0);
2153   } else
2154     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2155
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159
2160 bool AMDGPULegalizerInfo::legalizeLoad(
2161   MachineInstr &MI, MachineRegisterInfo &MRI,
2162   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2163   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2164   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2165   Observer.changingInstr(MI);
2166   MI.getOperand(1).setReg(Cast.getReg(0));
2167   Observer.changedInstr(MI);
2168   return true;
2169 }
2170
2171 bool AMDGPULegalizerInfo::legalizeFMad(
2172   MachineInstr &MI, MachineRegisterInfo &MRI,
2173   MachineIRBuilder &B) const {
2174   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2175   assert(Ty.isScalar());
2176
2177   MachineFunction &MF = B.getMF();
2178   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2179
2180   // TODO: Always legal with future ftz flag.
2181   // FIXME: Do we need just output?
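  // Clarifying note: G_FMAD is kept as-is only when the corresponding
  // denormal mode flushes denormals; otherwise it is expanded into separate
  // multiply and add operations by lowerFMad() below.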
2182 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2183 return true; 2184 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2185 return true; 2186 2187 MachineIRBuilder HelperBuilder(MI); 2188 GISelObserverWrapper DummyObserver; 2189 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2190 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2191 } 2192 2193 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2194 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2195 Register DstReg = MI.getOperand(0).getReg(); 2196 Register PtrReg = MI.getOperand(1).getReg(); 2197 Register CmpVal = MI.getOperand(2).getReg(); 2198 Register NewVal = MI.getOperand(3).getReg(); 2199 2200 assert(SITargetLowering::isFlatGlobalAddrSpace( 2201 MRI.getType(PtrReg).getAddressSpace()) && 2202 "this should not have been custom lowered"); 2203 2204 LLT ValTy = MRI.getType(CmpVal); 2205 LLT VecTy = LLT::vector(2, ValTy); 2206 2207 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2208 2209 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2210 .addDef(DstReg) 2211 .addUse(PtrReg) 2212 .addUse(PackedVal) 2213 .setMemRefs(MI.memoperands()); 2214 2215 MI.eraseFromParent(); 2216 return true; 2217 } 2218 2219 bool AMDGPULegalizerInfo::legalizeFlog( 2220 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2221 Register Dst = MI.getOperand(0).getReg(); 2222 Register Src = MI.getOperand(1).getReg(); 2223 LLT Ty = B.getMRI()->getType(Dst); 2224 unsigned Flags = MI.getFlags(); 2225 2226 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2227 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2228 2229 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2230 MI.eraseFromParent(); 2231 return true; 2232 } 2233 2234 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2235 MachineIRBuilder &B) const { 2236 Register Dst = MI.getOperand(0).getReg(); 2237 Register Src = MI.getOperand(1).getReg(); 2238 unsigned Flags = MI.getFlags(); 2239 LLT Ty = B.getMRI()->getType(Dst); 2240 2241 auto K = B.buildFConstant(Ty, numbers::log2e); 2242 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2243 B.buildFExp2(Dst, Mul, Flags); 2244 MI.eraseFromParent(); 2245 return true; 2246 } 2247 2248 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2249 MachineIRBuilder &B) const { 2250 Register Dst = MI.getOperand(0).getReg(); 2251 Register Src0 = MI.getOperand(1).getReg(); 2252 Register Src1 = MI.getOperand(2).getReg(); 2253 unsigned Flags = MI.getFlags(); 2254 LLT Ty = B.getMRI()->getType(Dst); 2255 const LLT S16 = LLT::scalar(16); 2256 const LLT S32 = LLT::scalar(32); 2257 2258 if (Ty == S32) { 2259 auto Log = B.buildFLog2(S32, Src0, Flags); 2260 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2261 .addUse(Log.getReg(0)) 2262 .addUse(Src1) 2263 .setMIFlags(Flags); 2264 B.buildFExp2(Dst, Mul, Flags); 2265 } else if (Ty == S16) { 2266 // There's no f16 fmul_legacy, so we need to convert for it. 
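    // Clarifying note: this is the same pow(x, y) = exp2(y * log2(x))
    // expansion as the f32 path above, but the multiply is done in f32
    // through fmul_legacy and the result is truncated back to f16.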
2267 auto Log = B.buildFLog2(S16, Src0, Flags); 2268 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2269 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2270 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2271 .addUse(Ext0.getReg(0)) 2272 .addUse(Ext1.getReg(0)) 2273 .setMIFlags(Flags); 2274 2275 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2276 } else 2277 return false; 2278 2279 MI.eraseFromParent(); 2280 return true; 2281 } 2282 2283 // Find a source register, ignoring any possible source modifiers. 2284 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2285 Register ModSrc = OrigSrc; 2286 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2287 ModSrc = SrcFNeg->getOperand(1).getReg(); 2288 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2289 ModSrc = SrcFAbs->getOperand(1).getReg(); 2290 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2291 ModSrc = SrcFAbs->getOperand(1).getReg(); 2292 return ModSrc; 2293 } 2294 2295 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2296 MachineRegisterInfo &MRI, 2297 MachineIRBuilder &B) const { 2298 2299 const LLT S1 = LLT::scalar(1); 2300 const LLT S64 = LLT::scalar(64); 2301 Register Dst = MI.getOperand(0).getReg(); 2302 Register OrigSrc = MI.getOperand(1).getReg(); 2303 unsigned Flags = MI.getFlags(); 2304 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2305 "this should not have been custom lowered"); 2306 2307 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2308 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2309 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2310 // V_FRACT bug is: 2311 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2312 // 2313 // Convert floor(x) to (x - fract(x)) 2314 2315 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2316 .addUse(OrigSrc) 2317 .setMIFlags(Flags); 2318 2319 // Give source modifier matching some assistance before obscuring a foldable 2320 // pattern. 2321 2322 // TODO: We can avoid the neg on the fract? The input sign to fract 2323 // shouldn't matter? 2324 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2325 2326 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2327 2328 Register Min = MRI.createGenericVirtualRegister(S64); 2329 2330 // We don't need to concern ourselves with the snan handling difference, so 2331 // use the one which will directly select. 2332 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2333 if (MFI->getMode().IEEE) 2334 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2335 else 2336 B.buildFMinNum(Min, Fract, Const, Flags); 2337 2338 Register CorrectedFract = Min; 2339 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2340 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2341 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2342 } 2343 2344 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2345 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2346 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 // Turn an illegal packed v2s16 build vector into bit operations. 2352 // TODO: This should probably be a bitcast action in LegalizerHelper. 
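// The lowering below is simply:
//   %merge:_(s32)       = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>)   = G_BITCAST %merge:_(s32)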
2353 bool AMDGPULegalizerInfo::legalizeBuildVector( 2354 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2355 Register Dst = MI.getOperand(0).getReg(); 2356 const LLT S32 = LLT::scalar(32); 2357 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2358 2359 Register Src0 = MI.getOperand(1).getReg(); 2360 Register Src1 = MI.getOperand(2).getReg(); 2361 assert(MRI.getType(Src0) == LLT::scalar(16)); 2362 2363 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2364 B.buildBitcast(Dst, Merge); 2365 2366 MI.eraseFromParent(); 2367 return true; 2368 } 2369 2370 // Return the use branch instruction, otherwise null if the usage is invalid. 2371 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2372 MachineRegisterInfo &MRI, 2373 MachineInstr *&Br, 2374 MachineBasicBlock *&UncondBrTarget) { 2375 Register CondDef = MI.getOperand(0).getReg(); 2376 if (!MRI.hasOneNonDBGUse(CondDef)) 2377 return nullptr; 2378 2379 MachineBasicBlock *Parent = MI.getParent(); 2380 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2381 if (UseMI.getParent() != Parent || 2382 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2383 return nullptr; 2384 2385 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2386 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2387 if (Next == Parent->end()) { 2388 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2389 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2390 return nullptr; 2391 UncondBrTarget = &*NextMBB; 2392 } else { 2393 if (Next->getOpcode() != AMDGPU::G_BR) 2394 return nullptr; 2395 Br = &*Next; 2396 UncondBrTarget = Br->getOperand(0).getMBB(); 2397 } 2398 2399 return &UseMI; 2400 } 2401 2402 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2403 MachineRegisterInfo &MRI, 2404 Register LiveIn, 2405 Register PhyReg) const { 2406 assert(PhyReg.isPhysical() && "Physical register expected"); 2407 2408 // Insert the live-in copy, if required, by defining destination virtual 2409 // register. 2410 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
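  // Clarifying note: if the live-in virtual register has no definition yet,
  // emit the copy from the physical register once at the top of the entry
  // block, then restore the builder's previous insertion point so the caller
  // continues where it left off.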
2411   if (!MRI.getVRegDef(LiveIn)) {
2412     // FIXME: Should have scoped insert pt
2413     MachineBasicBlock &OrigInsBB = B.getMBB();
2414     auto OrigInsPt = B.getInsertPt();
2415
2416     MachineBasicBlock &EntryMBB = B.getMF().front();
2417     EntryMBB.addLiveIn(PhyReg);
2418     B.setInsertPt(EntryMBB, EntryMBB.begin());
2419     B.buildCopy(LiveIn, PhyReg);
2420
2421     B.setInsertPt(OrigInsBB, OrigInsPt);
2422   }
2423
2424   return LiveIn;
2425 }
2426
2427 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2428                                                 MachineRegisterInfo &MRI,
2429                                                 Register PhyReg, LLT Ty,
2430                                                 bool InsertLiveInCopy) const {
2431   assert(PhyReg.isPhysical() && "Physical register expected");
2432
2433   // Get or create the virtual live-in register.
2434   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2435   if (!LiveIn) {
2436     LiveIn = MRI.createGenericVirtualRegister(Ty);
2437     MRI.addLiveIn(PhyReg, LiveIn);
2438   }
2439
2440   // When the copy that is actually required is from the virtual register to
2441   // the physical register (and will be inserted later), the live-in copy
2442   // from the physical register to the virtual register is not needed here.
2443   if (!InsertLiveInCopy)
2444     return LiveIn;
2445
2446   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2447 }
2448
2449 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2450     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2451   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2452   const ArgDescriptor *Arg;
2453   const TargetRegisterClass *RC;
2454   LLT ArgTy;
2455   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2456   if (!Arg) {
2457     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2458     return nullptr;
2459   }
2460   return Arg;
2461 }
2462
2463 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2464                                          const ArgDescriptor *Arg) const {
2465   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2466     return false; // TODO: Handle these
2467
2468   Register SrcReg = Arg->getRegister();
2469   assert(SrcReg.isPhysical() && "Physical register expected");
2470   assert(DstReg.isVirtual() && "Virtual register expected");
2471
2472   MachineRegisterInfo &MRI = *B.getMRI();
2473
2474   LLT Ty = MRI.getType(DstReg);
2475   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2476
2477   if (Arg->isMasked()) {
2478     // TODO: Should we try to emit this once in the entry block?
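    // Illustrative example (hypothetical mask): if the input is packed into
    // bits [19:10] of the register, then Mask == 0x3FF << 10 and Shift == 10,
    // so the value is recovered as (LiveIn >> 10) & 0x3FF, which is what the
    // shift and AND below compute.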
2479 const LLT S32 = LLT::scalar(32); 2480 const unsigned Mask = Arg->getMask(); 2481 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2482 2483 Register AndMaskSrc = LiveIn; 2484 2485 if (Shift != 0) { 2486 auto ShiftAmt = B.buildConstant(S32, Shift); 2487 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2488 } 2489 2490 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2491 } else { 2492 B.buildCopy(DstReg, LiveIn); 2493 } 2494 2495 return true; 2496 } 2497 2498 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2499 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2500 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2501 2502 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2503 if (!Arg) 2504 return false; 2505 2506 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2507 return false; 2508 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2514 MachineRegisterInfo &MRI, 2515 MachineIRBuilder &B) const { 2516 Register Dst = MI.getOperand(0).getReg(); 2517 LLT DstTy = MRI.getType(Dst); 2518 LLT S16 = LLT::scalar(16); 2519 LLT S32 = LLT::scalar(32); 2520 LLT S64 = LLT::scalar(64); 2521 2522 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2523 return true; 2524 2525 if (DstTy == S16) 2526 return legalizeFDIV16(MI, MRI, B); 2527 if (DstTy == S32) 2528 return legalizeFDIV32(MI, MRI, B); 2529 if (DstTy == S64) 2530 return legalizeFDIV64(MI, MRI, B); 2531 2532 return false; 2533 } 2534 2535 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2536 Register DstReg, 2537 Register X, 2538 Register Y, 2539 bool IsDiv) const { 2540 const LLT S1 = LLT::scalar(1); 2541 const LLT S32 = LLT::scalar(32); 2542 2543 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2544 // algorithm used here. 2545 2546 // Initial estimate of inv(y). 2547 auto FloatY = B.buildUITOFP(S32, Y); 2548 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2549 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2550 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2551 auto Z = B.buildFPTOUI(S32, ScaledY); 2552 2553 // One round of UNR. 2554 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2555 auto NegYZ = B.buildMul(S32, NegY, Z); 2556 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2557 2558 // Quotient/remainder estimate. 2559 auto Q = B.buildUMulH(S32, X, Z); 2560 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2561 2562 // First quotient/remainder refinement. 2563 auto One = B.buildConstant(S32, 1); 2564 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2565 if (IsDiv) 2566 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2567 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2568 2569 // Second quotient/remainder refinement. 
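  // Clarifying note: each refinement conditionally adds 1 to the quotient and
  // subtracts Y from the remainder; per the expansion referenced above
  // (AMDGPUCodeGenPrepare::expandDivRem32), two such correction rounds are
  // sufficient after the single Newton-Raphson step.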
2570 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2571 if (IsDiv) 2572 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2573 else 2574 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2575 } 2576 2577 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2578 MachineRegisterInfo &MRI, 2579 MachineIRBuilder &B) const { 2580 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2581 Register DstReg = MI.getOperand(0).getReg(); 2582 Register Num = MI.getOperand(1).getReg(); 2583 Register Den = MI.getOperand(2).getReg(); 2584 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2585 MI.eraseFromParent(); 2586 return true; 2587 } 2588 2589 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2590 // 2591 // Return lo, hi of result 2592 // 2593 // %cvt.lo = G_UITOFP Val.lo 2594 // %cvt.hi = G_UITOFP Val.hi 2595 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2596 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2597 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2598 // %mul2 = G_FMUL %mul1, 2**(-32) 2599 // %trunc = G_INTRINSIC_TRUNC %mul2 2600 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2601 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2602 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2603 Register Val) { 2604 const LLT S32 = LLT::scalar(32); 2605 auto Unmerge = B.buildUnmerge(S32, Val); 2606 2607 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2608 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2609 2610 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2611 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2612 2613 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2614 auto Mul1 = 2615 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2616 2617 // 2**(-32) 2618 auto Mul2 = 2619 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2620 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2621 2622 // -(2**32) 2623 auto Mad2 = B.buildFMAD(S32, Trunc, 2624 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2625 2626 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2627 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2628 2629 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2630 } 2631 2632 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2633 Register DstReg, 2634 Register Numer, 2635 Register Denom, 2636 bool IsDiv) const { 2637 const LLT S32 = LLT::scalar(32); 2638 const LLT S64 = LLT::scalar(64); 2639 const LLT S1 = LLT::scalar(1); 2640 Register RcpLo, RcpHi; 2641 2642 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2643 2644 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2645 2646 auto Zero64 = B.buildConstant(S64, 0); 2647 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2648 2649 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2650 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2651 2652 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2653 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2654 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2655 2656 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2657 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2658 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2659 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2660 2661 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2662 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2663 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2664 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2665 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2666 2667 auto Zero32 = 
B.buildConstant(S32, 0); 2668 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2669 auto Add2_HiC = 2670 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2671 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2672 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2673 2674 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2675 Register NumerLo = UnmergeNumer.getReg(0); 2676 Register NumerHi = UnmergeNumer.getReg(1); 2677 2678 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2679 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2680 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2681 Register Mul3_Lo = UnmergeMul3.getReg(0); 2682 Register Mul3_Hi = UnmergeMul3.getReg(1); 2683 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2684 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2685 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2686 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2687 2688 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2689 Register DenomLo = UnmergeDenom.getReg(0); 2690 Register DenomHi = UnmergeDenom.getReg(1); 2691 2692 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2693 auto C1 = B.buildSExt(S32, CmpHi); 2694 2695 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2696 auto C2 = B.buildSExt(S32, CmpLo); 2697 2698 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2699 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2700 2701 // TODO: Here and below portions of the code can be enclosed into if/endif. 2702 // Currently control flow is unconditional and we have 4 selects after 2703 // potential endif to substitute PHIs. 2704 2705 // if C3 != 0 ... 2706 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2707 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2708 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2709 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2710 2711 auto One64 = B.buildConstant(S64, 1); 2712 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2713 2714 auto C4 = 2715 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2716 auto C5 = 2717 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2718 auto C6 = B.buildSelect( 2719 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2720 2721 // if (C6 != 0) 2722 auto Add4 = B.buildAdd(S64, Add3, One64); 2723 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2724 2725 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2726 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2727 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2728 2729 // endif C6 2730 // endif C3 2731 2732 if (IsDiv) { 2733 auto Sel1 = B.buildSelect( 2734 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2735 B.buildSelect(DstReg, 2736 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2737 } else { 2738 auto Sel2 = B.buildSelect( 2739 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2740 B.buildSelect(DstReg, 2741 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2742 } 2743 } 2744 2745 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2746 MachineRegisterInfo &MRI, 2747 MachineIRBuilder &B) const { 2748 const LLT S64 = LLT::scalar(64); 2749 const LLT S32 = LLT::scalar(32); 2750 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2751 Register DstReg = MI.getOperand(0).getReg(); 2752 Register Num 
= MI.getOperand(1).getReg(); 2753 Register Den = MI.getOperand(2).getReg(); 2754 LLT Ty = MRI.getType(DstReg); 2755 2756 if (Ty == S32) 2757 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2758 else if (Ty == S64) 2759 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2760 else 2761 return false; 2762 2763 MI.eraseFromParent(); 2764 return true; 2765 2766 } 2767 2768 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2769 MachineRegisterInfo &MRI, 2770 MachineIRBuilder &B) const { 2771 const LLT S64 = LLT::scalar(64); 2772 const LLT S32 = LLT::scalar(32); 2773 2774 Register DstReg = MI.getOperand(0).getReg(); 2775 const LLT Ty = MRI.getType(DstReg); 2776 if (Ty != S32 && Ty != S64) 2777 return false; 2778 2779 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2780 2781 Register LHS = MI.getOperand(1).getReg(); 2782 Register RHS = MI.getOperand(2).getReg(); 2783 2784 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2785 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2786 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2787 2788 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2789 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2790 2791 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2792 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2793 2794 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2795 if (Ty == S32) 2796 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2797 else 2798 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2799 2800 Register Sign; 2801 if (IsDiv) 2802 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2803 else 2804 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2805 2806 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2807 B.buildSub(DstReg, UDivRem, Sign); 2808 2809 MI.eraseFromParent(); 2810 return true; 2811 } 2812 2813 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2814 MachineRegisterInfo &MRI, 2815 MachineIRBuilder &B) const { 2816 Register Res = MI.getOperand(0).getReg(); 2817 Register LHS = MI.getOperand(1).getReg(); 2818 Register RHS = MI.getOperand(2).getReg(); 2819 2820 uint16_t Flags = MI.getFlags(); 2821 2822 LLT ResTy = MRI.getType(Res); 2823 LLT S32 = LLT::scalar(32); 2824 LLT S64 = LLT::scalar(64); 2825 2826 const MachineFunction &MF = B.getMF(); 2827 bool Unsafe = 2828 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2829 2830 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2831 return false; 2832 2833 if (!Unsafe && ResTy == S32 && 2834 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2835 return false; 2836 2837 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2838 // 1 / x -> RCP(x) 2839 if (CLHS->isExactlyValue(1.0)) { 2840 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2841 .addUse(RHS) 2842 .setMIFlags(Flags); 2843 2844 MI.eraseFromParent(); 2845 return true; 2846 } 2847 2848 // -1 / x -> RCP( FNEG(x) ) 2849 if (CLHS->isExactlyValue(-1.0)) { 2850 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2851 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2852 .addUse(FNeg.getReg(0)) 2853 .setMIFlags(Flags); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 } 2859 2860 // x / y -> x * (1.0 / y) 2861 if (Unsafe) { 2862 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2863 .addUse(RHS) 2864 .setMIFlags(Flags); 2865 B.buildFMul(Res, LHS, RCP, Flags); 2866 2867 MI.eraseFromParent(); 2868 return true; 2869 } 2870 2871 return false; 2872 } 2873 2874 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2875 MachineRegisterInfo &MRI, 2876 MachineIRBuilder &B) const { 2877 Register Res = MI.getOperand(0).getReg(); 2878 Register LHS = MI.getOperand(1).getReg(); 2879 Register RHS = MI.getOperand(2).getReg(); 2880 2881 uint16_t Flags = MI.getFlags(); 2882 2883 LLT S16 = LLT::scalar(16); 2884 LLT S32 = LLT::scalar(32); 2885 2886 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2887 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2888 2889 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2890 .addUse(RHSExt.getReg(0)) 2891 .setMIFlags(Flags); 2892 2893 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2894 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2895 2896 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2897 .addUse(RDst.getReg(0)) 2898 .addUse(RHS) 2899 .addUse(LHS) 2900 .setMIFlags(Flags); 2901 2902 MI.eraseFromParent(); 2903 return true; 2904 } 2905 2906 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2907 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2908 static void toggleSPDenormMode(bool Enable, 2909 MachineIRBuilder &B, 2910 const GCNSubtarget &ST, 2911 AMDGPU::SIModeRegisterDefaults Mode) { 2912 // Set SP denorm mode to this value. 2913 unsigned SPDenormMode = 2914 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2915 2916 if (ST.hasDenormModeInst()) { 2917 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2918 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2919 2920 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2921 B.buildInstr(AMDGPU::S_DENORM_MODE) 2922 .addImm(NewDenormModeValue); 2923 2924 } else { 2925 // Select FP32 bit field in mode register. 2926 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2927 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2928 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2929 2930 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2931 .addImm(SPDenormMode) 2932 .addImm(SPDenormModeBitField); 2933 } 2934 } 2935 2936 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2937 MachineRegisterInfo &MRI, 2938 MachineIRBuilder &B) const { 2939 Register Res = MI.getOperand(0).getReg(); 2940 Register LHS = MI.getOperand(1).getReg(); 2941 Register RHS = MI.getOperand(2).getReg(); 2942 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2943 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2944 2945 uint16_t Flags = MI.getFlags(); 2946 2947 LLT S32 = LLT::scalar(32); 2948 LLT S1 = LLT::scalar(1); 2949 2950 auto One = B.buildFConstant(S32, 1.0f); 2951 2952 auto DenominatorScaled = 2953 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2954 .addUse(LHS) 2955 .addUse(RHS) 2956 .addImm(0) 2957 .setMIFlags(Flags); 2958 auto NumeratorScaled = 2959 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2960 .addUse(LHS) 2961 .addUse(RHS) 2962 .addImm(1) 2963 .setMIFlags(Flags); 2964 2965 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2966 .addUse(DenominatorScaled.getReg(0)) 2967 .setMIFlags(Flags); 2968 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2969 2970 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2971 // aren't modeled as reading it. 
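  // Clarifying note: the fma chain below is run with FP32 denormals enabled.
  // If the function's default mode flushes them, the denorm mode is switched
  // on here and restored again before the final div_fmas.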
2972 if (!Mode.allFP32Denormals()) 2973 toggleSPDenormMode(true, B, ST, Mode); 2974 2975 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2976 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2977 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2978 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2979 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2980 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2981 2982 if (!Mode.allFP32Denormals()) 2983 toggleSPDenormMode(false, B, ST, Mode); 2984 2985 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2986 .addUse(Fma4.getReg(0)) 2987 .addUse(Fma1.getReg(0)) 2988 .addUse(Fma3.getReg(0)) 2989 .addUse(NumeratorScaled.getReg(1)) 2990 .setMIFlags(Flags); 2991 2992 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2993 .addUse(Fmas.getReg(0)) 2994 .addUse(RHS) 2995 .addUse(LHS) 2996 .setMIFlags(Flags); 2997 2998 MI.eraseFromParent(); 2999 return true; 3000 } 3001 3002 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3003 MachineRegisterInfo &MRI, 3004 MachineIRBuilder &B) const { 3005 Register Res = MI.getOperand(0).getReg(); 3006 Register LHS = MI.getOperand(1).getReg(); 3007 Register RHS = MI.getOperand(2).getReg(); 3008 3009 uint16_t Flags = MI.getFlags(); 3010 3011 LLT S64 = LLT::scalar(64); 3012 LLT S1 = LLT::scalar(1); 3013 3014 auto One = B.buildFConstant(S64, 1.0); 3015 3016 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3017 .addUse(LHS) 3018 .addUse(RHS) 3019 .addImm(0) 3020 .setMIFlags(Flags); 3021 3022 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3023 3024 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3025 .addUse(DivScale0.getReg(0)) 3026 .setMIFlags(Flags); 3027 3028 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3029 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3030 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3031 3032 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3033 .addUse(LHS) 3034 .addUse(RHS) 3035 .addImm(1) 3036 .setMIFlags(Flags); 3037 3038 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3039 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3040 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3041 3042 Register Scale; 3043 if (!ST.hasUsableDivScaleConditionOutput()) { 3044 // Workaround a hardware bug on SI where the condition output from div_scale 3045 // is not usable. 
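    // Clarifying note: the workaround recomputes the scale condition by hand.
    // The high 32 bits of the numerator and denominator are compared against
    // the high halves of the two div_scale results, and the two tests are
    // xor'ed to produce the flag passed to div_fmas.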
3046 3047 LLT S32 = LLT::scalar(32); 3048 3049 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3050 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3051 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3052 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3053 3054 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3055 Scale1Unmerge.getReg(1)); 3056 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3057 Scale0Unmerge.getReg(1)); 3058 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3059 } else { 3060 Scale = DivScale1.getReg(1); 3061 } 3062 3063 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3064 .addUse(Fma4.getReg(0)) 3065 .addUse(Fma3.getReg(0)) 3066 .addUse(Mul.getReg(0)) 3067 .addUse(Scale) 3068 .setMIFlags(Flags); 3069 3070 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3071 .addUse(Fmas.getReg(0)) 3072 .addUse(RHS) 3073 .addUse(LHS) 3074 .setMIFlags(Flags); 3075 3076 MI.eraseFromParent(); 3077 return true; 3078 } 3079 3080 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3081 MachineRegisterInfo &MRI, 3082 MachineIRBuilder &B) const { 3083 Register Res = MI.getOperand(0).getReg(); 3084 Register LHS = MI.getOperand(2).getReg(); 3085 Register RHS = MI.getOperand(3).getReg(); 3086 uint16_t Flags = MI.getFlags(); 3087 3088 LLT S32 = LLT::scalar(32); 3089 LLT S1 = LLT::scalar(1); 3090 3091 auto Abs = B.buildFAbs(S32, RHS, Flags); 3092 const APFloat C0Val(1.0f); 3093 3094 auto C0 = B.buildConstant(S32, 0x6f800000); 3095 auto C1 = B.buildConstant(S32, 0x2f800000); 3096 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3097 3098 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3099 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3100 3101 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3102 3103 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3104 .addUse(Mul0.getReg(0)) 3105 .setMIFlags(Flags); 3106 3107 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3108 3109 B.buildFMul(Res, Sel, Mul1, Flags); 3110 3111 MI.eraseFromParent(); 3112 return true; 3113 } 3114 3115 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3116 MachineRegisterInfo &MRI, 3117 MachineIRBuilder &B) const { 3118 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3119 uint64_t Offset = 3120 ST.getTargetLowering()->getImplicitParameterOffset( 3121 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3122 LLT DstTy = MRI.getType(DstReg); 3123 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3124 3125 const ArgDescriptor *Arg; 3126 const TargetRegisterClass *RC; 3127 LLT ArgTy; 3128 std::tie(Arg, RC, ArgTy) = 3129 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3130 if (!Arg) 3131 return false; 3132 3133 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3134 if (!loadInputValue(KernargPtrReg, B, Arg)) 3135 return false; 3136 3137 // FIXME: This should be nuw 3138 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3139 return true; 3140 } 3141 3142 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3143 MachineRegisterInfo &MRI, 3144 MachineIRBuilder &B) const { 3145 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3146 if (!MFI->isEntryFunction()) { 3147 return legalizePreloadedArgIntrin(MI, MRI, B, 3148 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3149 } 3150 3151 Register DstReg = MI.getOperand(0).getReg(); 3152 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3153 return false; 3154 3155 MI.eraseFromParent(); 3156 return true; 3157 } 3158 3159 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3160 MachineRegisterInfo &MRI, 3161 MachineIRBuilder &B, 3162 unsigned AddrSpace) const { 3163 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3164 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3165 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3166 MI.eraseFromParent(); 3167 return true; 3168 } 3169 3170 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3171 // offset (the offset that is included in bounds checking and swizzling, to be 3172 // split between the instruction's voffset and immoffset fields) and soffset 3173 // (the offset that is excluded from bounds checking and swizzling, to go in 3174 // the instruction's soffset field). This function takes the first kind of 3175 // offset and figures out how to split it between voffset and immoffset. 3176 std::tuple<Register, unsigned, unsigned> 3177 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3178 Register OrigOffset) const { 3179 const unsigned MaxImm = 4095; 3180 Register BaseReg; 3181 unsigned TotalConstOffset; 3182 MachineInstr *OffsetDef; 3183 const LLT S32 = LLT::scalar(32); 3184 3185 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3186 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3187 3188 unsigned ImmOffset = TotalConstOffset; 3189 3190 // If the immediate value is too big for the immoffset field, put the value 3191 // and -4096 into the immoffset field so that the value that is copied/added 3192 // for the voffset field is a multiple of 4096, and it stands more chance 3193 // of being CSEd with the copy/add for another similar load/store. 3194 // However, do not do that rounding down to a multiple of 4096 if that is a 3195 // negative number, as it appears to be illegal to have a negative offset 3196 // in the vgpr, even if adding the immediate offset makes it positive. 3197 unsigned Overflow = ImmOffset & ~MaxImm; 3198 ImmOffset -= Overflow; 3199 if ((int32_t)Overflow < 0) { 3200 Overflow += ImmOffset; 3201 ImmOffset = 0; 3202 } 3203 3204 if (Overflow != 0) { 3205 if (!BaseReg) { 3206 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3207 } else { 3208 auto OverflowVal = B.buildConstant(S32, Overflow); 3209 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3210 } 3211 } 3212 3213 if (!BaseReg) 3214 BaseReg = B.buildConstant(S32, 0).getReg(0); 3215 3216 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3217 } 3218 3219 /// Handle register layout difference for f16 images for some subtargets. 
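/// With unpacked D16 memory instructions each 16-bit element occupies its own
/// 32-bit register, so an <N x s16> source is rewritten here as <N x s32>
/// with every element any-extended.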
3220 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3221 MachineRegisterInfo &MRI, 3222 Register Reg) const { 3223 if (!ST.hasUnpackedD16VMem()) 3224 return Reg; 3225 3226 const LLT S16 = LLT::scalar(16); 3227 const LLT S32 = LLT::scalar(32); 3228 LLT StoreVT = MRI.getType(Reg); 3229 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3230 3231 auto Unmerge = B.buildUnmerge(S16, Reg); 3232 3233 SmallVector<Register, 4> WideRegs; 3234 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3235 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3236 3237 int NumElts = StoreVT.getNumElements(); 3238 3239 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3240 } 3241 3242 Register AMDGPULegalizerInfo::fixStoreSourceType( 3243 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3244 MachineRegisterInfo *MRI = B.getMRI(); 3245 LLT Ty = MRI->getType(VData); 3246 3247 const LLT S16 = LLT::scalar(16); 3248 3249 // Fixup illegal register types for i8 stores. 3250 if (Ty == LLT::scalar(8) || Ty == S16) { 3251 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3252 return AnyExt; 3253 } 3254 3255 if (Ty.isVector()) { 3256 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3257 if (IsFormat) 3258 return handleD16VData(B, *MRI, VData); 3259 } 3260 } 3261 3262 return VData; 3263 } 3264 3265 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3266 MachineRegisterInfo &MRI, 3267 MachineIRBuilder &B, 3268 bool IsTyped, 3269 bool IsFormat) const { 3270 Register VData = MI.getOperand(1).getReg(); 3271 LLT Ty = MRI.getType(VData); 3272 LLT EltTy = Ty.getScalarType(); 3273 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3274 const LLT S32 = LLT::scalar(32); 3275 3276 VData = fixStoreSourceType(B, VData, IsFormat); 3277 Register RSrc = MI.getOperand(2).getReg(); 3278 3279 MachineMemOperand *MMO = *MI.memoperands_begin(); 3280 const int MemSize = MMO->getSize(); 3281 3282 unsigned ImmOffset; 3283 unsigned TotalOffset; 3284 3285 // The typed intrinsics add an immediate after the registers. 3286 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3287 3288 // The struct intrinsic variants add one additional operand over raw. 3289 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3290 Register VIndex; 3291 int OpOffset = 0; 3292 if (HasVIndex) { 3293 VIndex = MI.getOperand(3).getReg(); 3294 OpOffset = 1; 3295 } 3296 3297 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3298 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3299 3300 unsigned Format = 0; 3301 if (IsTyped) { 3302 Format = MI.getOperand(5 + OpOffset).getImm(); 3303 ++OpOffset; 3304 } 3305 3306 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3307 3308 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3309 if (TotalOffset != 0) 3310 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3311 3312 unsigned Opc; 3313 if (IsTyped) { 3314 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3315 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3316 } else if (IsFormat) { 3317 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3318 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3319 } else { 3320 switch (MemSize) { 3321 case 1: 3322 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3323 break; 3324 case 2: 3325 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3326 break; 3327 default: 3328 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3329 break; 3330 } 3331 } 3332 3333 if (!VIndex) 3334 VIndex = B.buildConstant(S32, 0).getReg(0); 3335 3336 auto MIB = B.buildInstr(Opc) 3337 .addUse(VData) // vdata 3338 .addUse(RSrc) // rsrc 3339 .addUse(VIndex) // vindex 3340 .addUse(VOffset) // voffset 3341 .addUse(SOffset) // soffset 3342 .addImm(ImmOffset); // offset(imm) 3343 3344 if (IsTyped) 3345 MIB.addImm(Format); 3346 3347 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3348 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3349 .addMemOperand(MMO); 3350 3351 MI.eraseFromParent(); 3352 return true; 3353 } 3354 3355 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3356 MachineRegisterInfo &MRI, 3357 MachineIRBuilder &B, 3358 bool IsFormat, 3359 bool IsTyped) const { 3360 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3361 MachineMemOperand *MMO = *MI.memoperands_begin(); 3362 const int MemSize = MMO->getSize(); 3363 const LLT S32 = LLT::scalar(32); 3364 3365 Register Dst = MI.getOperand(0).getReg(); 3366 Register RSrc = MI.getOperand(2).getReg(); 3367 3368 // The typed intrinsics add an immediate after the registers. 3369 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3370 3371 // The struct intrinsic variants add one additional operand over raw. 3372 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3373 Register VIndex; 3374 int OpOffset = 0; 3375 if (HasVIndex) { 3376 VIndex = MI.getOperand(3).getReg(); 3377 OpOffset = 1; 3378 } 3379 3380 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3381 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3382 3383 unsigned Format = 0; 3384 if (IsTyped) { 3385 Format = MI.getOperand(5 + OpOffset).getImm(); 3386 ++OpOffset; 3387 } 3388 3389 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3390 unsigned ImmOffset; 3391 unsigned TotalOffset; 3392 3393 LLT Ty = MRI.getType(Dst); 3394 LLT EltTy = Ty.getScalarType(); 3395 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3396 const bool Unpacked = ST.hasUnpackedD16VMem(); 3397 3398 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3399 if (TotalOffset != 0) 3400 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3401 3402 unsigned Opc; 3403 3404 if (IsTyped) { 3405 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3406 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3407 } else if (IsFormat) { 3408 Opc = IsD16 ? 
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for an extending load; truncate it back down to
    // the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
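
// Lower the amdgcn.atomic.inc/dec intrinsics to their target pseudos, carrying
// the memory operands over from the intrinsic call.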
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
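
// Legalize a raw/struct buffer atomic intrinsic into the matching
// G_AMDGPU_BUFFER_ATOMIC_* pseudo, using the same rsrc/vindex/voffset/soffset
// operand layout as the buffer loads and stores above.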
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn the set of s16 typed address registers of \p MI into dword sized
/// <2 x s16> vectors, appending the packed registers to \p PackedAddrs.
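/// For example, a 2D (s, t) coordinate pair becomes a single <2 x s16>
/// build_vector; an unpaired trailing coordinate is padded with an undef s16.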
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int EndIdx,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < EndIdx; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
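/// For example, three s32 address components become one <3 x s32> vector
/// operand; five to seven components are padded with undef up to eight.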
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
      getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // An error flag is expected since TFE is on and dmask is 0. Force dmask to
  // be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
            LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
            static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }
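
  // With a16 every pair of 16-bit address components is packed into a
  // <2 x s16> register; with g16 only the gradient components are packed and
  // the remaining 32-bit address components are passed through unchanged.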
  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // Target must support the feature and gradients need to be 16 bit too
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
          IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
                                  PackEndIdx, NumGradients);

      if (!IsA16) {
        // Add uncompressed address
        for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }
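
  // Everything below handles loads: the machine instruction only defines the
  // components enabled by the dmask (plus an extra dword when TFE is set), so
  // the result has to be widened or repacked into the IR-declared type.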
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load into s32 elements. We would only
    // need one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
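
// Legalize amdgcn.s.buffer.load by rewriting it into the
// G_AMDGPU_S_BUFFER_LOAD pseudo, attaching an invariant memory operand and
// widening non-power-of-2 sized results.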
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}
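
// Top-level dispatch for target intrinsic legalization: control-flow
// intrinsics become SI_IF/SI_ELSE/SI_LOOP pseudos, argument intrinsics become
// copies of preloaded input registers, and memory intrinsics are lowered to
// the buffer and image pseudos handled above.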
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
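  // Argument intrinsics are lowered to copies from the corresponding
  // preloaded input registers.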
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
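  // All of the buffer atomic RMW intrinsics share a common operand layout and
  // are funneled through legalizeBufferAtomic.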
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}