//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx,
LLT::scalarOrVector(NewNumElts, EltTy)); 97 }; 98 } 99 100 // Increase the number of vector elements to reach the next multiple of 32-bit 101 // type. 102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 106 const LLT EltTy = Ty.getElementType(); 107 const int Size = Ty.getSizeInBits(); 108 const int EltSize = EltTy.getSizeInBits(); 109 const int NextMul32 = (Size + 31) / 32; 110 111 assert(EltSize < 32); 112 113 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 114 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 115 }; 116 } 117 118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 unsigned Size = Ty.getSizeInBits(); 122 123 LLT CoercedTy; 124 if (Size <= 32) { 125 // <2 x s8> -> s16 126 // <4 x s8> -> s32 127 CoercedTy = LLT::scalar(Size); 128 } else 129 CoercedTy = LLT::scalarOrVector(Size / 32, 32); 130 131 return std::make_pair(TypeIdx, CoercedTy); 132 }; 133 } 134 135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 136 return [=](const LegalityQuery &Query) { 137 const LLT QueryTy = Query.Types[TypeIdx]; 138 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 139 }; 140 } 141 142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 143 return [=](const LegalityQuery &Query) { 144 const LLT QueryTy = Query.Types[TypeIdx]; 145 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 146 }; 147 } 148 149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 150 return [=](const LegalityQuery &Query) { 151 const LLT QueryTy = Query.Types[TypeIdx]; 152 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 153 }; 154 } 155 156 static bool isRegisterSize(unsigned Size) { 157 return Size % 32 == 0 && Size <= MaxRegisterSize; 158 } 159 160 static bool isRegisterVectorElementType(LLT EltTy) { 161 const int EltSize = EltTy.getSizeInBits(); 162 return EltSize == 16 || EltSize % 32 == 0; 163 } 164 165 static bool isRegisterVectorType(LLT Ty) { 166 const int EltSize = Ty.getElementType().getSizeInBits(); 167 return EltSize == 32 || EltSize == 64 || 168 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 169 EltSize == 128 || EltSize == 256; 170 } 171 172 static bool isRegisterType(LLT Ty) { 173 if (!isRegisterSize(Ty.getSizeInBits())) 174 return false; 175 176 if (Ty.isVector()) 177 return isRegisterVectorType(Ty); 178 179 return true; 180 } 181 182 // Any combination of 32 or 64-bit elements up the maximum register size, and 183 // multiples of v2s16. 
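// For example, s32, s64, v2s16, v4s16, v2s32 and v16s32 all count as register
// types here, while s48 and v3s8 (24 bits total) do not.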
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
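    // (e.g. a 256-bit global load is still reported as a legal size here; as
    // noted above, RegBankSelect can split such accesses later if needed)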
282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 
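  // GetAddrSpacePtr just wraps LLT::pointer() with the pointer width the
  // target machine reports for that address space, e.g. GlobalPtr is a
  // 64-bit pointer while LocalPtr and PrivatePtr are 32-bit pointers.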
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 426 .legalFor({S32, S16, V2S16}) 427 .clampScalar(0, S16, S32) 428 .clampMaxNumElements(0, S16, 2) 429 .scalarize(0) 430 .widenScalarToNextPow2(0, 32); 431 } else if (ST.has16BitInsts()) { 432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 433 .legalFor({S32, S16}) 434 .clampScalar(0, S16, S32) 435 .scalarize(0) 436 .widenScalarToNextPow2(0, 32); 437 } else { 438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 439 .legalFor({S32}) 440 .clampScalar(0, S32, S32) 441 .scalarize(0); 442 } 443 444 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 445 .customFor({S32, S64}) 446 .clampScalar(0, S32, S64) 447 .widenScalarToNextPow2(0, 32) 448 .scalarize(0); 449 450 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 451 .legalFor({S32}) 452 .clampScalar(0, S32, S32) 453 .scalarize(0); 454 455 // Report legal for any types we can handle anywhere. For the cases only legal 456 // on the SALU, RegBankSelect will be able to re-legalize. 457 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 458 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 459 .clampScalar(0, S32, S64) 460 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 461 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 462 .widenScalarToNextPow2(0) 463 .scalarize(0); 464 465 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 466 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 467 .legalFor({{S32, S1}, {S32, S32}}) 468 .minScalar(0, S32) 469 // TODO: .scalarize(0) 470 .lower(); 471 472 getActionDefinitionsBuilder(G_BITCAST) 473 // Don't worry about the size constraint. 
474 .legalIf(all(isRegisterType(0), isRegisterType(1))) 475 .lower(); 476 477 478 getActionDefinitionsBuilder(G_CONSTANT) 479 .legalFor({S1, S32, S64, S16, GlobalPtr, 480 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 481 .clampScalar(0, S32, S64) 482 .widenScalarToNextPow2(0) 483 .legalIf(isPointer(0)); 484 485 getActionDefinitionsBuilder(G_FCONSTANT) 486 .legalFor({S32, S64, S16}) 487 .clampScalar(0, S16, S64); 488 489 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 490 .legalIf(isRegisterType(0)) 491 // s1 and s16 are special cases because they have legal operations on 492 // them, but don't really occupy registers in the normal way. 493 .legalFor({S1, S16}) 494 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 495 .clampScalarOrElt(0, S32, MaxScalar) 496 .widenScalarToNextPow2(0, 32) 497 .clampMaxNumElements(0, S32, 16); 498 499 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 500 501 // If the amount is divergent, we have to do a wave reduction to get the 502 // maximum value, so this is expanded during RegBankSelect. 503 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 504 .legalFor({{PrivatePtr, S32}}); 505 506 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 507 .unsupportedFor({PrivatePtr}) 508 .custom(); 509 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 510 511 auto &FPOpActions = getActionDefinitionsBuilder( 512 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 513 .legalFor({S32, S64}); 514 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 515 .customFor({S32, S64}); 516 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 517 .customFor({S32, S64}); 518 519 if (ST.has16BitInsts()) { 520 if (ST.hasVOP3PInsts()) 521 FPOpActions.legalFor({S16, V2S16}); 522 else 523 FPOpActions.legalFor({S16}); 524 525 TrigActions.customFor({S16}); 526 FDIVActions.customFor({S16}); 527 } 528 529 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 530 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 531 532 if (ST.hasVOP3PInsts()) { 533 MinNumMaxNum.customFor(FPTypesPK16) 534 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 535 .clampMaxNumElements(0, S16, 2) 536 .clampScalar(0, S16, S64) 537 .scalarize(0); 538 } else if (ST.has16BitInsts()) { 539 MinNumMaxNum.customFor(FPTypes16) 540 .clampScalar(0, S16, S64) 541 .scalarize(0); 542 } else { 543 MinNumMaxNum.customFor(FPTypesBase) 544 .clampScalar(0, S32, S64) 545 .scalarize(0); 546 } 547 548 if (ST.hasVOP3PInsts()) 549 FPOpActions.clampMaxNumElements(0, S16, 2); 550 551 FPOpActions 552 .scalarize(0) 553 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 554 555 TrigActions 556 .scalarize(0) 557 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 558 559 FDIVActions 560 .scalarize(0) 561 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 562 563 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 564 .legalFor(FPTypesPK16) 565 .clampMaxNumElements(0, S16, 2) 566 .scalarize(0) 567 .clampScalar(0, S16, S64); 568 569 if (ST.has16BitInsts()) { 570 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 571 .legalFor({S32, S64, S16}) 572 .scalarize(0) 573 .clampScalar(0, S16, S64); 574 } else { 575 getActionDefinitionsBuilder(G_FSQRT) 576 .legalFor({S32, S64}) 577 .scalarize(0) 578 .clampScalar(0, S32, S64); 579 580 if (ST.hasFractBug()) { 581 getActionDefinitionsBuilder(G_FFLOOR) 582 .customFor({S64}) 583 .legalFor({S32, S64}) 584 .scalarize(0) 585 .clampScalar(0, S32, S64); 586 } else { 587 getActionDefinitionsBuilder(G_FFLOOR) 588 .legalFor({S32, S64}) 589 .scalarize(0) 590 .clampScalar(0, S32, S64); 591 } 592 } 593 594 getActionDefinitionsBuilder(G_FPTRUNC) 595 .legalFor({{S32, S64}, {S16, S32}}) 596 .scalarize(0) 597 .lower(); 598 599 getActionDefinitionsBuilder(G_FPEXT) 600 .legalFor({{S64, S32}, {S32, S16}}) 601 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 602 .scalarize(0); 603 604 getActionDefinitionsBuilder(G_FSUB) 605 // Use actual fsub instruction 606 .legalFor({S32}) 607 // Must use fadd + fneg 608 .lowerFor({S64, S16, V2S16}) 609 .scalarize(0) 610 .clampScalar(0, S32, S64); 611 612 // Whether this is legal depends on the floating point mode for the function. 613 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 614 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 615 FMad.customFor({S32, S16}); 616 else if (ST.hasMadMacF32Insts()) 617 FMad.customFor({S32}); 618 else if (ST.hasMadF16()) 619 FMad.customFor({S16}); 620 FMad.scalarize(0) 621 .lower(); 622 623 // TODO: Do we need to clamp maximum bitwidth? 624 getActionDefinitionsBuilder(G_TRUNC) 625 .legalIf(isScalar(0)) 626 .legalFor({{V2S16, V2S32}}) 627 .clampMaxNumElements(0, S16, 2) 628 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 629 // situations (like an invalid implicit use), we don't want to infinite loop 630 // in the legalizer. 631 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 632 .alwaysLegal(); 633 634 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 635 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 636 {S32, S1}, {S64, S1}, {S16, S1}}) 637 .scalarize(0) 638 .clampScalar(0, S32, S64) 639 .widenScalarToNextPow2(1, 32); 640 641 // TODO: Split s1->s64 during regbankselect for VALU. 
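  // The s64 -> s64 cases below are custom lowered in legalizeITOFP(): the two
  // 32-bit halves of the source are converted separately and recombined with
  // ldexp + fadd.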
642 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 643 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 644 .lowerFor({{S32, S64}}) 645 .lowerIf(typeIs(1, S1)) 646 .customFor({{S64, S64}}); 647 if (ST.has16BitInsts()) 648 IToFP.legalFor({{S16, S16}}); 649 IToFP.clampScalar(1, S32, S64) 650 .minScalar(0, S32) 651 .scalarize(0) 652 .widenScalarToNextPow2(1); 653 654 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 655 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 656 .customFor({{S64, S64}}) 657 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 658 if (ST.has16BitInsts()) 659 FPToI.legalFor({{S16, S16}}); 660 else 661 FPToI.minScalar(1, S32); 662 663 FPToI.minScalar(0, S32) 664 .scalarize(0) 665 .lower(); 666 667 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 668 .scalarize(0) 669 .lower(); 670 671 if (ST.has16BitInsts()) { 672 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 673 .legalFor({S16, S32, S64}) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 677 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 678 .legalFor({S32, S64}) 679 .clampScalar(0, S32, S64) 680 .scalarize(0); 681 } else { 682 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 683 .legalFor({S32}) 684 .customFor({S64}) 685 .clampScalar(0, S32, S64) 686 .scalarize(0); 687 } 688 689 // FIXME: Clamp offset operand. 690 getActionDefinitionsBuilder(G_PTR_ADD) 691 .legalIf(isPointer(0)) 692 .scalarize(0); 693 694 getActionDefinitionsBuilder(G_PTRMASK) 695 .legalIf(typeInSet(1, {S64, S32})) 696 .minScalar(1, S32) 697 .maxScalarIf(sizeIs(0, 32), 1, S32) 698 .maxScalarIf(sizeIs(0, 64), 1, S64) 699 .scalarize(0); 700 701 auto &CmpBuilder = 702 getActionDefinitionsBuilder(G_ICMP) 703 // The compare output type differs based on the register bank of the output, 704 // so make both s1 and s32 legal. 705 // 706 // Scalar compares producing output in scc will be promoted to s32, as that 707 // is the allocatable register type that will be needed for the copy from 708 // scc. This will be promoted during RegBankSelect, and we assume something 709 // before that won't try to use s32 result types. 710 // 711 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 712 // bank. 713 .legalForCartesianProduct( 714 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 715 .legalForCartesianProduct( 716 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 717 if (ST.has16BitInsts()) { 718 CmpBuilder.legalFor({{S1, S16}}); 719 } 720 721 CmpBuilder 722 .widenScalarToNextPow2(1) 723 .clampScalar(1, S32, S64) 724 .scalarize(0) 725 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 726 727 getActionDefinitionsBuilder(G_FCMP) 728 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 729 .widenScalarToNextPow2(1) 730 .clampScalar(1, S32, S64) 731 .scalarize(0); 732 733 // FIXME: fpow has a selection pattern that should move to custom lowering. 
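  // G_FEXP, G_FLOG, G_FLOG10 and G_FPOW are custom lowered in terms of the
  // natively supported exp2/log2 operations (see legalizeFlog, legalizeFExp
  // and legalizeFPow below).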
734 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 735 if (ST.has16BitInsts()) 736 Exp2Ops.legalFor({S32, S16}); 737 else 738 Exp2Ops.legalFor({S32}); 739 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 740 Exp2Ops.scalarize(0); 741 742 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 743 if (ST.has16BitInsts()) 744 ExpOps.customFor({{S32}, {S16}}); 745 else 746 ExpOps.customFor({S32}); 747 ExpOps.clampScalar(0, MinScalarFPTy, S32) 748 .scalarize(0); 749 750 getActionDefinitionsBuilder(G_FPOWI) 751 .clampScalar(0, MinScalarFPTy, S32) 752 .lower(); 753 754 // The 64-bit versions produce 32-bit results, but only on the SALU. 755 getActionDefinitionsBuilder(G_CTPOP) 756 .legalFor({{S32, S32}, {S32, S64}}) 757 .clampScalar(0, S32, S32) 758 .clampScalar(1, S32, S64) 759 .scalarize(0) 760 .widenScalarToNextPow2(0, 32) 761 .widenScalarToNextPow2(1, 32); 762 763 // The hardware instructions return a different result on 0 than the generic 764 // instructions expect. The hardware produces -1, but these produce the 765 // bitwidth. 766 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 767 .scalarize(0) 768 .clampScalar(0, S32, S32) 769 .clampScalar(1, S32, S64) 770 .widenScalarToNextPow2(0, 32) 771 .widenScalarToNextPow2(1, 32) 772 .lower(); 773 774 // The 64-bit versions produce 32-bit results, but only on the SALU. 775 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 776 .legalFor({{S32, S32}, {S32, S64}}) 777 .clampScalar(0, S32, S32) 778 .clampScalar(1, S32, S64) 779 .scalarize(0) 780 .widenScalarToNextPow2(0, 32) 781 .widenScalarToNextPow2(1, 32); 782 783 getActionDefinitionsBuilder(G_BITREVERSE) 784 .legalFor({S32}) 785 .clampScalar(0, S32, S32) 786 .scalarize(0); 787 788 if (ST.has16BitInsts()) { 789 getActionDefinitionsBuilder(G_BSWAP) 790 .legalFor({S16, S32, V2S16}) 791 .clampMaxNumElements(0, S16, 2) 792 // FIXME: Fixing non-power-of-2 before clamp is workaround for 793 // narrowScalar limitation. 794 .widenScalarToNextPow2(0) 795 .clampScalar(0, S16, S32) 796 .scalarize(0); 797 798 if (ST.hasVOP3PInsts()) { 799 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 800 .legalFor({S32, S16, V2S16}) 801 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 802 .clampMaxNumElements(0, S16, 2) 803 .minScalar(0, S16) 804 .widenScalarToNextPow2(0) 805 .scalarize(0) 806 .lower(); 807 } else { 808 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 809 .legalFor({S32, S16}) 810 .widenScalarToNextPow2(0) 811 .minScalar(0, S16) 812 .scalarize(0) 813 .lower(); 814 } 815 } else { 816 // TODO: Should have same legality without v_perm_b32 817 getActionDefinitionsBuilder(G_BSWAP) 818 .legalFor({S32}) 819 .lowerIf(scalarNarrowerThan(0, 32)) 820 // FIXME: Fixing non-power-of-2 before clamp is workaround for 821 // narrowScalar limitation. 
822 .widenScalarToNextPow2(0) 823 .maxScalar(0, S32) 824 .scalarize(0) 825 .lower(); 826 827 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 828 .legalFor({S32}) 829 .minScalar(0, S32) 830 .widenScalarToNextPow2(0) 831 .scalarize(0) 832 .lower(); 833 } 834 835 getActionDefinitionsBuilder(G_INTTOPTR) 836 // List the common cases 837 .legalForCartesianProduct(AddrSpaces64, {S64}) 838 .legalForCartesianProduct(AddrSpaces32, {S32}) 839 .scalarize(0) 840 // Accept any address space as long as the size matches 841 .legalIf(sameSize(0, 1)) 842 .widenScalarIf(smallerThan(1, 0), 843 [](const LegalityQuery &Query) { 844 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 845 }) 846 .narrowScalarIf(largerThan(1, 0), 847 [](const LegalityQuery &Query) { 848 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 849 }); 850 851 getActionDefinitionsBuilder(G_PTRTOINT) 852 // List the common cases 853 .legalForCartesianProduct(AddrSpaces64, {S64}) 854 .legalForCartesianProduct(AddrSpaces32, {S32}) 855 .scalarize(0) 856 // Accept any address space as long as the size matches 857 .legalIf(sameSize(0, 1)) 858 .widenScalarIf(smallerThan(0, 1), 859 [](const LegalityQuery &Query) { 860 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 861 }) 862 .narrowScalarIf( 863 largerThan(0, 1), 864 [](const LegalityQuery &Query) { 865 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 866 }); 867 868 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 869 .scalarize(0) 870 .custom(); 871 872 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 873 bool IsLoad) -> bool { 874 const LLT DstTy = Query.Types[0]; 875 876 // Split vector extloads. 877 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 878 unsigned Align = Query.MMODescrs[0].AlignInBits; 879 880 if (MemSize < DstTy.getSizeInBits()) 881 MemSize = std::max(MemSize, Align); 882 883 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 884 return true; 885 886 const LLT PtrTy = Query.Types[1]; 887 unsigned AS = PtrTy.getAddressSpace(); 888 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 889 return true; 890 891 // Catch weird sized loads that don't evenly divide into the access sizes 892 // TODO: May be able to widen depending on alignment etc. 893 unsigned NumRegs = (MemSize + 31) / 32; 894 if (NumRegs == 3) { 895 if (!ST.hasDwordx3LoadStores()) 896 return true; 897 } else { 898 // If the alignment allows, these should have been widened. 899 if (!isPowerOf2_32(NumRegs)) 900 return true; 901 } 902 903 if (Align < MemSize) { 904 const SITargetLowering *TLI = ST.getTargetLowering(); 905 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 906 } 907 908 return false; 909 }; 910 911 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 912 unsigned Opc) -> bool { 913 unsigned Size = Query.Types[0].getSizeInBits(); 914 if (isPowerOf2_32(Size)) 915 return false; 916 917 if (Size == 96 && ST.hasDwordx3LoadStores()) 918 return false; 919 920 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 921 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 922 return false; 923 924 unsigned Align = Query.MMODescrs[0].AlignInBits; 925 unsigned RoundedSize = NextPowerOf2(Size); 926 return (Align >= RoundedSize); 927 }; 928 929 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 930 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 931 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 932 933 // TODO: Refine based on subtargets which support unaligned access or 128-bit 934 // LDS 935 // TODO: Unsupported flat for SI. 936 937 for (unsigned Op : {G_LOAD, G_STORE}) { 938 const bool IsStore = Op == G_STORE; 939 940 auto &Actions = getActionDefinitionsBuilder(Op); 941 // Explicitly list some common cases. 942 // TODO: Does this help compile time at all? 943 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 944 {V2S32, GlobalPtr, 64, GlobalAlign32}, 945 {V4S32, GlobalPtr, 128, GlobalAlign32}, 946 {S64, GlobalPtr, 64, GlobalAlign32}, 947 {V2S64, GlobalPtr, 128, GlobalAlign32}, 948 {V2S16, GlobalPtr, 32, GlobalAlign32}, 949 {S32, GlobalPtr, 8, GlobalAlign8}, 950 {S32, GlobalPtr, 16, GlobalAlign16}, 951 952 {S32, LocalPtr, 32, 32}, 953 {S64, LocalPtr, 64, 32}, 954 {V2S32, LocalPtr, 64, 32}, 955 {S32, LocalPtr, 8, 8}, 956 {S32, LocalPtr, 16, 16}, 957 {V2S16, LocalPtr, 32, 32}, 958 959 {S32, PrivatePtr, 32, 32}, 960 {S32, PrivatePtr, 8, 8}, 961 {S32, PrivatePtr, 16, 16}, 962 {V2S16, PrivatePtr, 32, 32}, 963 964 {S32, ConstantPtr, 32, GlobalAlign32}, 965 {V2S32, ConstantPtr, 64, GlobalAlign32}, 966 {V4S32, ConstantPtr, 128, GlobalAlign32}, 967 {S64, ConstantPtr, 64, GlobalAlign32}, 968 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 969 Actions.legalIf( 970 [=](const LegalityQuery &Query) -> bool { 971 return isLoadStoreLegal(ST, Query, Op); 972 }); 973 974 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 975 // 64-bits. 976 // 977 // TODO: Should generalize bitcast action into coerce, which will also cover 978 // inserting addrspacecasts. 979 Actions.customIf(typeIs(1, Constant32Ptr)); 980 981 // Turn any illegal element vectors into something easier to deal 982 // with. These will ultimately produce 32-bit scalar shifts to extract the 983 // parts anyway. 984 // 985 // For odd 16-bit element vectors, prefer to split those into pieces with 986 // 16-bit vector parts. 987 Actions.bitcastIf( 988 [=](const LegalityQuery &Query) -> bool { 989 const LLT Ty = Query.Types[0]; 990 const unsigned Size = Ty.getSizeInBits(); 991 992 if (Size != Query.MMODescrs[0].SizeInBits) 993 return Size <= 32 && Ty.isVector(); 994 995 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 996 return true; 997 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 998 !isRegisterVectorElementType(Ty.getElementType()); 999 }, bitcastToRegisterType(0)); 1000 1001 Actions 1002 .customIf(typeIs(1, Constant32Ptr)) 1003 // Widen suitably aligned loads by loading extra elements. 1004 .moreElementsIf([=](const LegalityQuery &Query) { 1005 const LLT Ty = Query.Types[0]; 1006 return Op == G_LOAD && Ty.isVector() && 1007 shouldWidenLoadResult(Query, Op); 1008 }, moreElementsToNextPow2(0)) 1009 .widenScalarIf([=](const LegalityQuery &Query) { 1010 const LLT Ty = Query.Types[0]; 1011 return Op == G_LOAD && !Ty.isVector() && 1012 shouldWidenLoadResult(Query, Op); 1013 }, widenScalarOrEltToNextPow2(0)) 1014 .narrowScalarIf( 1015 [=](const LegalityQuery &Query) -> bool { 1016 return !Query.Types[0].isVector() && 1017 needToSplitMemOp(Query, Op == G_LOAD); 1018 }, 1019 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1020 const LLT DstTy = Query.Types[0]; 1021 const LLT PtrTy = Query.Types[1]; 1022 1023 const unsigned DstSize = DstTy.getSizeInBits(); 1024 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1025 1026 // Split extloads. 
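            // e.g. a wide (s64) result fed by a 32-bit access is effectively
            // narrowed back to an s32 load; the extension is then legalized
            // separately.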
1027 if (DstSize > MemSize) 1028 return std::make_pair(0, LLT::scalar(MemSize)); 1029 1030 if (!isPowerOf2_32(DstSize)) { 1031 // We're probably decomposing an odd sized store. Try to split 1032 // to the widest type. TODO: Account for alignment. As-is it 1033 // should be OK, since the new parts will be further legalized. 1034 unsigned FloorSize = PowerOf2Floor(DstSize); 1035 return std::make_pair(0, LLT::scalar(FloorSize)); 1036 } 1037 1038 if (DstSize > 32 && (DstSize % 32 != 0)) { 1039 // FIXME: Need a way to specify non-extload of larger size if 1040 // suitably aligned. 1041 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1042 } 1043 1044 unsigned MaxSize = maxSizeForAddrSpace(ST, 1045 PtrTy.getAddressSpace(), 1046 Op == G_LOAD); 1047 if (MemSize > MaxSize) 1048 return std::make_pair(0, LLT::scalar(MaxSize)); 1049 1050 unsigned Align = Query.MMODescrs[0].AlignInBits; 1051 return std::make_pair(0, LLT::scalar(Align)); 1052 }) 1053 .fewerElementsIf( 1054 [=](const LegalityQuery &Query) -> bool { 1055 return Query.Types[0].isVector() && 1056 needToSplitMemOp(Query, Op == G_LOAD); 1057 }, 1058 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1059 const LLT DstTy = Query.Types[0]; 1060 const LLT PtrTy = Query.Types[1]; 1061 1062 LLT EltTy = DstTy.getElementType(); 1063 unsigned MaxSize = maxSizeForAddrSpace(ST, 1064 PtrTy.getAddressSpace(), 1065 Op == G_LOAD); 1066 1067 // FIXME: Handle widened to power of 2 results better. This ends 1068 // up scalarizing. 1069 // FIXME: 3 element stores scalarized on SI 1070 1071 // Split if it's too large for the address space. 1072 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1073 unsigned NumElts = DstTy.getNumElements(); 1074 unsigned EltSize = EltTy.getSizeInBits(); 1075 1076 if (MaxSize % EltSize == 0) { 1077 return std::make_pair( 1078 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1079 } 1080 1081 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1082 1083 // FIXME: Refine when odd breakdowns handled 1084 // The scalars will need to be re-legalized. 1085 if (NumPieces == 1 || NumPieces >= NumElts || 1086 NumElts % NumPieces != 0) 1087 return std::make_pair(0, EltTy); 1088 1089 return std::make_pair(0, 1090 LLT::vector(NumElts / NumPieces, EltTy)); 1091 } 1092 1093 // FIXME: We could probably handle weird extending loads better. 1094 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1095 if (DstTy.getSizeInBits() > MemSize) 1096 return std::make_pair(0, EltTy); 1097 1098 unsigned EltSize = EltTy.getSizeInBits(); 1099 unsigned DstSize = DstTy.getSizeInBits(); 1100 if (!isPowerOf2_32(DstSize)) { 1101 // We're probably decomposing an odd sized store. Try to split 1102 // to the widest type. TODO: Account for alignment. As-is it 1103 // should be OK, since the new parts will be further legalized. 1104 unsigned FloorSize = PowerOf2Floor(DstSize); 1105 return std::make_pair( 1106 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1107 } 1108 1109 // Need to split because of alignment. 1110 unsigned Align = Query.MMODescrs[0].AlignInBits; 1111 if (EltSize > Align && 1112 (EltSize / Align < DstTy.getNumElements())) { 1113 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1114 } 1115 1116 // May need relegalization for the scalars. 1117 return std::make_pair(0, EltTy); 1118 }) 1119 .minScalar(0, S32); 1120 1121 if (IsStore) 1122 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1123 1124 // TODO: Need a bitcast lower option? 
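    // With the rules above a <3 x s32> private store, for example, ends up
    // broken into individual 32-bit accesses, while a <4 x s32> global store
    // remains a single legal access.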
1125 Actions 1126 .widenScalarToNextPow2(0) 1127 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1128 } 1129 1130 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1131 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1132 {S32, GlobalPtr, 16, 2 * 8}, 1133 {S32, LocalPtr, 8, 8}, 1134 {S32, LocalPtr, 16, 16}, 1135 {S32, PrivatePtr, 8, 8}, 1136 {S32, PrivatePtr, 16, 16}, 1137 {S32, ConstantPtr, 8, 8}, 1138 {S32, ConstantPtr, 16, 2 * 8}}); 1139 if (ST.hasFlatAddressSpace()) { 1140 ExtLoads.legalForTypesWithMemDesc( 1141 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1142 } 1143 1144 ExtLoads.clampScalar(0, S32, S32) 1145 .widenScalarToNextPow2(0) 1146 .unsupportedIfMemSizeNotPow2() 1147 .lower(); 1148 1149 auto &Atomics = getActionDefinitionsBuilder( 1150 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1151 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1152 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1153 G_ATOMICRMW_UMIN}) 1154 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1155 {S64, GlobalPtr}, {S64, LocalPtr}}); 1156 if (ST.hasFlatAddressSpace()) { 1157 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1158 } 1159 1160 if (ST.hasLDSFPAtomics()) { 1161 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1162 .legalFor({{S32, LocalPtr}}); 1163 } 1164 1165 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1166 // demarshalling 1167 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1168 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1169 {S32, FlatPtr}, {S64, FlatPtr}}) 1170 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1171 {S32, RegionPtr}, {S64, RegionPtr}}); 1172 // TODO: Pointer types, any 32-bit or 64-bit vector 1173 1174 // Condition should be s32 for scalar, s1 for vector. 1175 getActionDefinitionsBuilder(G_SELECT) 1176 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1177 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1178 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1179 .clampScalar(0, S16, S64) 1180 .scalarize(1) 1181 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1182 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1183 .clampMaxNumElements(0, S32, 2) 1184 .clampMaxNumElements(0, LocalPtr, 2) 1185 .clampMaxNumElements(0, PrivatePtr, 2) 1186 .scalarize(0) 1187 .widenScalarToNextPow2(0) 1188 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1189 1190 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1191 // be more flexible with the shift amount type. 1192 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1193 .legalFor({{S32, S32}, {S64, S32}}); 1194 if (ST.has16BitInsts()) { 1195 if (ST.hasVOP3PInsts()) { 1196 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1197 .clampMaxNumElements(0, S16, 2); 1198 } else 1199 Shifts.legalFor({{S16, S16}}); 1200 1201 // TODO: Support 16-bit shift amounts for all types 1202 Shifts.widenScalarIf( 1203 [=](const LegalityQuery &Query) { 1204 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1205 // 32-bit amount. 
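        // e.g. (s16 = G_SHL s16, s8) gets an s16 amount, while 32-bit and
        // wider shifts get an s32 amount via the clamp below.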
1206 const LLT ValTy = Query.Types[0]; 1207 const LLT AmountTy = Query.Types[1]; 1208 return ValTy.getSizeInBits() <= 16 && 1209 AmountTy.getSizeInBits() < 16; 1210 }, changeTo(1, S16)); 1211 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1212 Shifts.clampScalar(1, S32, S32); 1213 Shifts.clampScalar(0, S16, S64); 1214 Shifts.widenScalarToNextPow2(0, 16); 1215 } else { 1216 // Make sure we legalize the shift amount type first, as the general 1217 // expansion for the shifted type will produce much worse code if it hasn't 1218 // been truncated already. 1219 Shifts.clampScalar(1, S32, S32); 1220 Shifts.clampScalar(0, S32, S64); 1221 Shifts.widenScalarToNextPow2(0, 32); 1222 } 1223 Shifts.scalarize(0); 1224 1225 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1226 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1227 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1228 unsigned IdxTypeIdx = 2; 1229 1230 getActionDefinitionsBuilder(Op) 1231 .customIf([=](const LegalityQuery &Query) { 1232 const LLT EltTy = Query.Types[EltTypeIdx]; 1233 const LLT VecTy = Query.Types[VecTypeIdx]; 1234 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1235 return (EltTy.getSizeInBits() == 16 || 1236 EltTy.getSizeInBits() % 32 == 0) && 1237 VecTy.getSizeInBits() % 32 == 0 && 1238 VecTy.getSizeInBits() <= MaxRegisterSize && 1239 IdxTy.getSizeInBits() == 32; 1240 }) 1241 .clampScalar(EltTypeIdx, S32, S64) 1242 .clampScalar(VecTypeIdx, S32, S64) 1243 .clampScalar(IdxTypeIdx, S32, S32); 1244 } 1245 1246 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1247 .unsupportedIf([=](const LegalityQuery &Query) { 1248 const LLT &EltTy = Query.Types[1].getElementType(); 1249 return Query.Types[0] != EltTy; 1250 }); 1251 1252 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1253 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1254 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1255 1256 // FIXME: Doesn't handle extract of illegal sizes. 1257 getActionDefinitionsBuilder(Op) 1258 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1259 // FIXME: Multiples of 16 should not be legal. 
1260 .legalIf([=](const LegalityQuery &Query) { 1261 const LLT BigTy = Query.Types[BigTyIdx]; 1262 const LLT LitTy = Query.Types[LitTyIdx]; 1263 return (BigTy.getSizeInBits() % 32 == 0) && 1264 (LitTy.getSizeInBits() % 16 == 0); 1265 }) 1266 .widenScalarIf( 1267 [=](const LegalityQuery &Query) { 1268 const LLT BigTy = Query.Types[BigTyIdx]; 1269 return (BigTy.getScalarSizeInBits() < 16); 1270 }, 1271 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1272 .widenScalarIf( 1273 [=](const LegalityQuery &Query) { 1274 const LLT LitTy = Query.Types[LitTyIdx]; 1275 return (LitTy.getScalarSizeInBits() < 16); 1276 }, 1277 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1278 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1279 .widenScalarToNextPow2(BigTyIdx, 32); 1280 1281 } 1282 1283 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1284 .legalForCartesianProduct(AllS32Vectors, {S32}) 1285 .legalForCartesianProduct(AllS64Vectors, {S64}) 1286 .clampNumElements(0, V16S32, V32S32) 1287 .clampNumElements(0, V2S64, V16S64) 1288 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1289 1290 if (ST.hasScalarPackInsts()) { 1291 BuildVector 1292 // FIXME: Should probably widen s1 vectors straight to s32 1293 .minScalarOrElt(0, S16) 1294 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1295 .minScalar(1, S32); 1296 1297 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1298 .legalFor({V2S16, S32}) 1299 .lower(); 1300 BuildVector.minScalarOrElt(0, S32); 1301 } else { 1302 BuildVector.customFor({V2S16, S16}); 1303 BuildVector.minScalarOrElt(0, S32); 1304 1305 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1306 .customFor({V2S16, S32}) 1307 .lower(); 1308 } 1309 1310 BuildVector.legalIf(isRegisterType(0)); 1311 1312 // FIXME: Clamp maximum size 1313 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1314 .legalIf(isRegisterType(0)); 1315 1316 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1317 // pre-legalize. 1318 if (ST.hasVOP3PInsts()) { 1319 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1320 .customFor({V2S16, V2S16}) 1321 .lower(); 1322 } else 1323 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1324 1325 // Merge/Unmerge 1326 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1327 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1328 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1329 1330 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1331 const LLT Ty = Query.Types[TypeIdx]; 1332 if (Ty.isVector()) { 1333 const LLT &EltTy = Ty.getElementType(); 1334 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1335 return true; 1336 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1337 return true; 1338 } 1339 return false; 1340 }; 1341 1342 auto &Builder = getActionDefinitionsBuilder(Op) 1343 .lowerFor({{S16, V2S16}}) 1344 .lowerIf([=](const LegalityQuery &Query) { 1345 const LLT BigTy = Query.Types[BigTyIdx]; 1346 return BigTy.getSizeInBits() == 32; 1347 }) 1348 // Try to widen to s16 first for small types. 1349 // TODO: Only do this on targets with legal s16 shifts 1350 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1351 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1352 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1353 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1354 elementTypeIs(1, S16)), 1355 changeTo(1, V2S16)) 1356 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not 1357 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1358 // valid. 1359 .clampScalar(LitTyIdx, S32, S512) 1360 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1361 // Break up vectors with weird elements into scalars 1362 .fewerElementsIf( 1363 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1364 scalarize(0)) 1365 .fewerElementsIf( 1366 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1367 scalarize(1)) 1368 .clampScalar(BigTyIdx, S32, MaxScalar); 1369 1370 if (Op == G_MERGE_VALUES) { 1371 Builder.widenScalarIf( 1372 // TODO: Use 16-bit shifts if legal for 8-bit values? 1373 [=](const LegalityQuery &Query) { 1374 const LLT Ty = Query.Types[LitTyIdx]; 1375 return Ty.getSizeInBits() < 32; 1376 }, 1377 changeTo(LitTyIdx, S32)); 1378 } 1379 1380 Builder.widenScalarIf( 1381 [=](const LegalityQuery &Query) { 1382 const LLT Ty = Query.Types[BigTyIdx]; 1383 return !isPowerOf2_32(Ty.getSizeInBits()) && 1384 Ty.getSizeInBits() % 16 != 0; 1385 }, 1386 [=](const LegalityQuery &Query) { 1387 // Pick the next power of 2, or a multiple of 64 over 128. 1388 // Whichever is smaller. 1389 const LLT &Ty = Query.Types[BigTyIdx]; 1390 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1391 if (NewSizeInBits >= 256) { 1392 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1393 if (RoundedTo < NewSizeInBits) 1394 NewSizeInBits = RoundedTo; 1395 } 1396 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1397 }) 1398 .legalIf([=](const LegalityQuery &Query) { 1399 const LLT &BigTy = Query.Types[BigTyIdx]; 1400 const LLT &LitTy = Query.Types[LitTyIdx]; 1401 1402 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1403 return false; 1404 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1405 return false; 1406 1407 return BigTy.getSizeInBits() % 16 == 0 && 1408 LitTy.getSizeInBits() % 16 == 0 && 1409 BigTy.getSizeInBits() <= MaxRegisterSize; 1410 }) 1411 // Any vectors left are the wrong size. Scalarize them. 1412 .scalarize(0) 1413 .scalarize(1); 1414 } 1415 1416 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1417 // RegBankSelect. 1418 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1419 .legalFor({{S32}, {S64}}); 1420 1421 if (ST.hasVOP3PInsts()) { 1422 SextInReg.lowerFor({{V2S16}}) 1423 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1424 // get more vector shift opportunities, since we'll get those when 1425 // expanded. 1426 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1427 } else if (ST.has16BitInsts()) { 1428 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1429 } else { 1430 // Prefer to promote to s32 before lowering if we don't have 16-bit 1431 // shifts. This avoid a lot of intermediate truncate and extend operations. 1432 SextInReg.lowerFor({{S32}, {S64}}); 1433 } 1434 1435 // FIXME: Placeholder rule. Really depends on whether the clamp modifier is 1436 // available, and is selectively legal for s16, s32, v2s16. 
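  // Until then, the saturating operations are simply scalarized and clamped
  // to the s16..s32 range here.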
1437 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) 1438 .scalarize(0) 1439 .clampScalar(0, S16, S32); 1440 1441 SextInReg 1442 .scalarize(0) 1443 .clampScalar(0, S32, S64) 1444 .lower(); 1445 1446 getActionDefinitionsBuilder(G_FSHR) 1447 .legalFor({{S32, S32}}) 1448 .scalarize(0) 1449 .lower(); 1450 1451 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1452 .legalFor({S64}); 1453 1454 getActionDefinitionsBuilder({ 1455 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1456 G_FCOPYSIGN, 1457 1458 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1459 G_READ_REGISTER, 1460 G_WRITE_REGISTER, 1461 1462 G_SADDO, G_SSUBO, 1463 1464 // TODO: Implement 1465 G_FMINIMUM, G_FMAXIMUM, 1466 G_FSHL 1467 }).lower(); 1468 1469 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1470 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1471 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1472 .unsupported(); 1473 1474 computeTables(); 1475 verify(*ST.getInstrInfo()); 1476 } 1477 1478 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1479 MachineInstr &MI) const { 1480 MachineIRBuilder &B = Helper.MIRBuilder; 1481 MachineRegisterInfo &MRI = *B.getMRI(); 1482 GISelChangeObserver &Observer = Helper.Observer; 1483 1484 switch (MI.getOpcode()) { 1485 case TargetOpcode::G_ADDRSPACE_CAST: 1486 return legalizeAddrSpaceCast(MI, MRI, B); 1487 case TargetOpcode::G_FRINT: 1488 return legalizeFrint(MI, MRI, B); 1489 case TargetOpcode::G_FCEIL: 1490 return legalizeFceil(MI, MRI, B); 1491 case TargetOpcode::G_INTRINSIC_TRUNC: 1492 return legalizeIntrinsicTrunc(MI, MRI, B); 1493 case TargetOpcode::G_SITOFP: 1494 return legalizeITOFP(MI, MRI, B, true); 1495 case TargetOpcode::G_UITOFP: 1496 return legalizeITOFP(MI, MRI, B, false); 1497 case TargetOpcode::G_FPTOSI: 1498 return legalizeFPTOI(MI, MRI, B, true); 1499 case TargetOpcode::G_FPTOUI: 1500 return legalizeFPTOI(MI, MRI, B, false); 1501 case TargetOpcode::G_FMINNUM: 1502 case TargetOpcode::G_FMAXNUM: 1503 case TargetOpcode::G_FMINNUM_IEEE: 1504 case TargetOpcode::G_FMAXNUM_IEEE: 1505 return legalizeMinNumMaxNum(Helper, MI); 1506 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1507 return legalizeExtractVectorElt(MI, MRI, B); 1508 case TargetOpcode::G_INSERT_VECTOR_ELT: 1509 return legalizeInsertVectorElt(MI, MRI, B); 1510 case TargetOpcode::G_SHUFFLE_VECTOR: 1511 return legalizeShuffleVector(MI, MRI, B); 1512 case TargetOpcode::G_FSIN: 1513 case TargetOpcode::G_FCOS: 1514 return legalizeSinCos(MI, MRI, B); 1515 case TargetOpcode::G_GLOBAL_VALUE: 1516 return legalizeGlobalValue(MI, MRI, B); 1517 case TargetOpcode::G_LOAD: 1518 return legalizeLoad(MI, MRI, B, Observer); 1519 case TargetOpcode::G_FMAD: 1520 return legalizeFMad(MI, MRI, B); 1521 case TargetOpcode::G_FDIV: 1522 return legalizeFDIV(MI, MRI, B); 1523 case TargetOpcode::G_UDIV: 1524 case TargetOpcode::G_UREM: 1525 return legalizeUDIV_UREM(MI, MRI, B); 1526 case TargetOpcode::G_SDIV: 1527 case TargetOpcode::G_SREM: 1528 return legalizeSDIV_SREM(MI, MRI, B); 1529 case TargetOpcode::G_ATOMIC_CMPXCHG: 1530 return legalizeAtomicCmpXChg(MI, MRI, B); 1531 case TargetOpcode::G_FLOG: 1532 return legalizeFlog(MI, B, numbers::ln2f); 1533 case TargetOpcode::G_FLOG10: 1534 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1535 case TargetOpcode::G_FEXP: 1536 return legalizeFExp(MI, B); 1537 case TargetOpcode::G_FPOW: 1538 return legalizeFPow(MI, B); 1539 case TargetOpcode::G_FFLOOR: 1540 return legalizeFFloor(MI, MRI, B); 1541 case TargetOpcode::G_BUILD_VECTOR: 1542 return 
legalizeBuildVector(MI, MRI, B); 1543 default: 1544 return false; 1545 } 1546 1547 llvm_unreachable("expected switch to return"); 1548 } 1549 1550 Register AMDGPULegalizerInfo::getSegmentAperture( 1551 unsigned AS, 1552 MachineRegisterInfo &MRI, 1553 MachineIRBuilder &B) const { 1554 MachineFunction &MF = B.getMF(); 1555 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1556 const LLT S32 = LLT::scalar(32); 1557 1558 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1559 1560 if (ST.hasApertureRegs()) { 1561 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1562 // getreg. 1563 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1564 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1565 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1566 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1567 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1568 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1569 unsigned Encoding = 1570 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1571 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1572 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1573 1574 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1575 1576 B.buildInstr(AMDGPU::S_GETREG_B32) 1577 .addDef(GetReg) 1578 .addImm(Encoding); 1579 MRI.setType(GetReg, S32); 1580 1581 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1582 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1583 } 1584 1585 Register QueuePtr = MRI.createGenericVirtualRegister( 1586 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1587 1588 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1589 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1590 return Register(); 1591 1592 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1593 // private_segment_aperture_base_hi. 1594 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1595 1596 // TODO: can we be smarter about machine pointer info? 1597 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1598 MachineMemOperand *MMO = MF.getMachineMemOperand( 1599 PtrInfo, 1600 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1601 MachineMemOperand::MOInvariant, 1602 4, commonAlignment(Align(64), StructOffset)); 1603 1604 Register LoadAddr; 1605 1606 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1607 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1608 } 1609 1610 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1611 MachineInstr &MI, MachineRegisterInfo &MRI, 1612 MachineIRBuilder &B) const { 1613 MachineFunction &MF = B.getMF(); 1614 1615 const LLT S32 = LLT::scalar(32); 1616 Register Dst = MI.getOperand(0).getReg(); 1617 Register Src = MI.getOperand(1).getReg(); 1618 1619 LLT DstTy = MRI.getType(Dst); 1620 LLT SrcTy = MRI.getType(Src); 1621 unsigned DestAS = DstTy.getAddressSpace(); 1622 unsigned SrcAS = SrcTy.getAddressSpace(); 1623 1624 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1625 // vector element. 1626 assert(!DstTy.isVector()); 1627 1628 const AMDGPUTargetMachine &TM 1629 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1630 1631 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1632 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1633 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1634 return true; 1635 } 1636 1637 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1638 // Truncate. 
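    // A cast to the 32-bit constant address space just keeps the low 32 bits
    // of the 64-bit source pointer.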
1639 B.buildExtract(Dst, Src, 0); 1640 MI.eraseFromParent(); 1641 return true; 1642 } 1643 1644 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1645 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1646 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1647 1648 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1649 // another. Merge operands are required to be the same type, but creating an 1650 // extra ptrtoint would be kind of pointless. 1651 auto HighAddr = B.buildConstant( 1652 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1653 B.buildMerge(Dst, {Src, HighAddr}); 1654 MI.eraseFromParent(); 1655 return true; 1656 } 1657 1658 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1659 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1660 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1661 unsigned NullVal = TM.getNullPointerValue(DestAS); 1662 1663 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1664 auto FlatNull = B.buildConstant(SrcTy, 0); 1665 1666 // Extract low 32-bits of the pointer. 1667 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1668 1669 auto CmpRes = 1670 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1671 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1672 1673 MI.eraseFromParent(); 1674 return true; 1675 } 1676 1677 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1678 return false; 1679 1680 if (!ST.hasFlatAddressSpace()) 1681 return false; 1682 1683 auto SegmentNull = 1684 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1685 auto FlatNull = 1686 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1687 1688 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1689 if (!ApertureReg.isValid()) 1690 return false; 1691 1692 auto CmpRes = 1693 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1694 1695 // Coerce the type of the low half of the result so we can use merge_values. 1696 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1697 1698 // TODO: Should we allow mismatched types but matching sizes in merges to 1699 // avoid the ptrtoint? 1700 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1701 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1702 1703 MI.eraseFromParent(); 1704 return true; 1705 } 1706 1707 bool AMDGPULegalizerInfo::legalizeFrint( 1708 MachineInstr &MI, MachineRegisterInfo &MRI, 1709 MachineIRBuilder &B) const { 1710 Register Src = MI.getOperand(1).getReg(); 1711 LLT Ty = MRI.getType(Src); 1712 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1713 1714 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1715 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1716 1717 auto C1 = B.buildFConstant(Ty, C1Val); 1718 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1719 1720 // TODO: Should this propagate fast-math-flags? 
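  // Adding and then subtracting 2^52 (with the sign of the source) rounds the
  // value to an integer in double precision; inputs with magnitude above C2
  // (just below 2^52) are already integral and are returned unchanged by the
  // select below.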
1721 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1722 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1723 1724 auto C2 = B.buildFConstant(Ty, C2Val); 1725 auto Fabs = B.buildFAbs(Ty, Src); 1726 1727 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1728 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1729 MI.eraseFromParent(); 1730 return true; 1731 } 1732 1733 bool AMDGPULegalizerInfo::legalizeFceil( 1734 MachineInstr &MI, MachineRegisterInfo &MRI, 1735 MachineIRBuilder &B) const { 1736 1737 const LLT S1 = LLT::scalar(1); 1738 const LLT S64 = LLT::scalar(64); 1739 1740 Register Src = MI.getOperand(1).getReg(); 1741 assert(MRI.getType(Src) == S64); 1742 1743 // result = trunc(src) 1744 // if (src > 0.0 && src != result) 1745 // result += 1.0 1746 1747 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1748 1749 const auto Zero = B.buildFConstant(S64, 0.0); 1750 const auto One = B.buildFConstant(S64, 1.0); 1751 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1752 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1753 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1754 auto Add = B.buildSelect(S64, And, One, Zero); 1755 1756 // TODO: Should this propagate fast-math-flags? 1757 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1758 return true; 1759 } 1760 1761 static MachineInstrBuilder extractF64Exponent(Register Hi, 1762 MachineIRBuilder &B) { 1763 const unsigned FractBits = 52; 1764 const unsigned ExpBits = 11; 1765 LLT S32 = LLT::scalar(32); 1766 1767 auto Const0 = B.buildConstant(S32, FractBits - 32); 1768 auto Const1 = B.buildConstant(S32, ExpBits); 1769 1770 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1771 .addUse(Hi) 1772 .addUse(Const0.getReg(0)) 1773 .addUse(Const1.getReg(0)); 1774 1775 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1776 } 1777 1778 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1779 MachineInstr &MI, MachineRegisterInfo &MRI, 1780 MachineIRBuilder &B) const { 1781 const LLT S1 = LLT::scalar(1); 1782 const LLT S32 = LLT::scalar(32); 1783 const LLT S64 = LLT::scalar(64); 1784 1785 Register Src = MI.getOperand(1).getReg(); 1786 assert(MRI.getType(Src) == S64); 1787 1788 // TODO: Should this use extract since the low half is unused? 1789 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1790 Register Hi = Unmerge.getReg(1); 1791 1792 // Extract the upper half, since this is where we will find the sign and 1793 // exponent. 1794 auto Exp = extractF64Exponent(Hi, B); 1795 1796 const unsigned FractBits = 52; 1797 1798 // Extract the sign bit. 1799 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1800 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1801 1802 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1803 1804 const auto Zero32 = B.buildConstant(S32, 0); 1805 1806 // Extend back to 64-bits. 
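  // (The 32-bit sign bit gets a zero low half so it can stand in as a 64-bit
  // value below.)
  //
  // From here truncation toward zero is just masking: FractMask shifted right
  // by the unbiased exponent has set bits exactly in the fraction positions
  // that must be cleared. As an illustration, with Exp == 2 the And below
  // clears the low 50 fraction bits and keeps the top two, which at that
  // exponent carry integer weight. A negative exponent selects the sign-only
  // value (+/-0.0), and an exponent above 51 passes the source through
  // unchanged since it is already an integer.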
1807 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1808 1809 auto Shr = B.buildAShr(S64, FractMask, Exp); 1810 auto Not = B.buildNot(S64, Shr); 1811 auto Tmp0 = B.buildAnd(S64, Src, Not); 1812 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1813 1814 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1815 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1816 1817 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1818 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1819 MI.eraseFromParent(); 1820 return true; 1821 } 1822 1823 bool AMDGPULegalizerInfo::legalizeITOFP( 1824 MachineInstr &MI, MachineRegisterInfo &MRI, 1825 MachineIRBuilder &B, bool Signed) const { 1826 1827 Register Dst = MI.getOperand(0).getReg(); 1828 Register Src = MI.getOperand(1).getReg(); 1829 1830 const LLT S64 = LLT::scalar(64); 1831 const LLT S32 = LLT::scalar(32); 1832 1833 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1834 1835 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1836 1837 auto CvtHi = Signed ? 1838 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1839 B.buildUITOFP(S64, Unmerge.getReg(1)); 1840 1841 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1842 1843 auto ThirtyTwo = B.buildConstant(S32, 32); 1844 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1845 .addUse(CvtHi.getReg(0)) 1846 .addUse(ThirtyTwo.getReg(0)); 1847 1848 // TODO: Should this propagate fast-math-flags? 1849 B.buildFAdd(Dst, LdExp, CvtLo); 1850 MI.eraseFromParent(); 1851 return true; 1852 } 1853 1854 // TODO: Copied from DAG implementation. Verify logic and document how this 1855 // actually works. 1856 bool AMDGPULegalizerInfo::legalizeFPTOI( 1857 MachineInstr &MI, MachineRegisterInfo &MRI, 1858 MachineIRBuilder &B, bool Signed) const { 1859 1860 Register Dst = MI.getOperand(0).getReg(); 1861 Register Src = MI.getOperand(1).getReg(); 1862 1863 const LLT S64 = LLT::scalar(64); 1864 const LLT S32 = LLT::scalar(32); 1865 1866 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1867 1868 unsigned Flags = MI.getFlags(); 1869 1870 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1871 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1872 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1873 1874 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1875 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1876 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1877 1878 auto Hi = Signed ? 
1879 B.buildFPTOSI(S32, FloorMul) : 1880 B.buildFPTOUI(S32, FloorMul); 1881 auto Lo = B.buildFPTOUI(S32, Fma); 1882 1883 B.buildMerge(Dst, { Lo, Hi }); 1884 MI.eraseFromParent(); 1885 1886 return true; 1887 } 1888 1889 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1890 MachineInstr &MI) const { 1891 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1892 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1893 1894 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1895 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1896 1897 // With ieee_mode disabled, the instructions have the correct behavior 1898 // already for G_FMINNUM/G_FMAXNUM 1899 if (!MFI->getMode().IEEE) 1900 return !IsIEEEOp; 1901 1902 if (IsIEEEOp) 1903 return true; 1904 1905 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1906 } 1907 1908 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1909 MachineInstr &MI, MachineRegisterInfo &MRI, 1910 MachineIRBuilder &B) const { 1911 // TODO: Should move some of this into LegalizerHelper. 1912 1913 // TODO: Promote dynamic indexing of s16 to s32 1914 1915 // FIXME: Artifact combiner probably should have replaced the truncated 1916 // constant before this, so we shouldn't need 1917 // getConstantVRegValWithLookThrough. 1918 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1919 MI.getOperand(2).getReg(), MRI); 1920 if (!IdxVal) // Dynamic case will be selected to register indexing. 1921 return true; 1922 1923 Register Dst = MI.getOperand(0).getReg(); 1924 Register Vec = MI.getOperand(1).getReg(); 1925 1926 LLT VecTy = MRI.getType(Vec); 1927 LLT EltTy = VecTy.getElementType(); 1928 assert(EltTy == MRI.getType(Dst)); 1929 1930 if (IdxVal->Value < VecTy.getNumElements()) 1931 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1932 else 1933 B.buildUndef(Dst); 1934 1935 MI.eraseFromParent(); 1936 return true; 1937 } 1938 1939 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1940 MachineInstr &MI, MachineRegisterInfo &MRI, 1941 MachineIRBuilder &B) const { 1942 // TODO: Should move some of this into LegalizerHelper. 1943 1944 // TODO: Promote dynamic indexing of s16 to s32 1945 1946 // FIXME: Artifact combiner probably should have replaced the truncated 1947 // constant before this, so we shouldn't need 1948 // getConstantVRegValWithLookThrough. 1949 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1950 MI.getOperand(3).getReg(), MRI); 1951 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1952 return true; 1953 1954 Register Dst = MI.getOperand(0).getReg(); 1955 Register Vec = MI.getOperand(1).getReg(); 1956 Register Ins = MI.getOperand(2).getReg(); 1957 1958 LLT VecTy = MRI.getType(Vec); 1959 LLT EltTy = VecTy.getElementType(); 1960 assert(EltTy == MRI.getType(Ins)); 1961 1962 if (IdxVal->Value < VecTy.getNumElements()) 1963 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1964 else 1965 B.buildUndef(Dst); 1966 1967 MI.eraseFromParent(); 1968 return true; 1969 } 1970 1971 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1972 MachineInstr &MI, MachineRegisterInfo &MRI, 1973 MachineIRBuilder &B) const { 1974 const LLT V2S16 = LLT::vector(2, 16); 1975 1976 Register Dst = MI.getOperand(0).getReg(); 1977 Register Src0 = MI.getOperand(1).getReg(); 1978 LLT DstTy = MRI.getType(Dst); 1979 LLT SrcTy = MRI.getType(Src0); 1980 1981 if (SrcTy == V2S16 && DstTy == V2S16 && 1982 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1983 return true; 1984 1985 MachineIRBuilder HelperBuilder(MI); 1986 GISelObserverWrapper DummyObserver; 1987 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1988 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1989 } 1990 1991 bool AMDGPULegalizerInfo::legalizeSinCos( 1992 MachineInstr &MI, MachineRegisterInfo &MRI, 1993 MachineIRBuilder &B) const { 1994 1995 Register DstReg = MI.getOperand(0).getReg(); 1996 Register SrcReg = MI.getOperand(1).getReg(); 1997 LLT Ty = MRI.getType(DstReg); 1998 unsigned Flags = MI.getFlags(); 1999 2000 Register TrigVal; 2001 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2002 if (ST.hasTrigReducedRange()) { 2003 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2004 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2005 .addUse(MulVal.getReg(0)) 2006 .setMIFlags(Flags).getReg(0); 2007 } else 2008 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2009 2010 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2011 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2012 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2013 .addUse(TrigVal) 2014 .setMIFlags(Flags); 2015 MI.eraseFromParent(); 2016 return true; 2017 } 2018 2019 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2020 MachineIRBuilder &B, 2021 const GlobalValue *GV, 2022 int64_t Offset, 2023 unsigned GAFlags) const { 2024 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2025 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2026 // to the following code sequence: 2027 // 2028 // For constant address space: 2029 // s_getpc_b64 s[0:1] 2030 // s_add_u32 s0, s0, $symbol 2031 // s_addc_u32 s1, s1, 0 2032 // 2033 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2034 // a fixup or relocation is emitted to replace $symbol with a literal 2035 // constant, which is a pc-relative offset from the encoding of the $symbol 2036 // operand to the global variable. 
2037 // 2038 // For global address space: 2039 // s_getpc_b64 s[0:1] 2040 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2041 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2042 // 2043 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2044 // fixups or relocations are emitted to replace $symbol@*@lo and 2045 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2046 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2047 // operand to the global variable. 2048 // 2049 // What we want here is an offset from the value returned by s_getpc 2050 // (which is the address of the s_add_u32 instruction) to the global 2051 // variable, but since the encoding of $symbol starts 4 bytes after the start 2052 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2053 // small. This requires us to add 4 to the global variable offset in order to 2054 // compute the correct address. 2055 2056 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2057 2058 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2059 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2060 2061 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2062 .addDef(PCReg); 2063 2064 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2065 if (GAFlags == SIInstrInfo::MO_NONE) 2066 MIB.addImm(0); 2067 else 2068 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2069 2070 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2071 2072 if (PtrTy.getSizeInBits() == 32) 2073 B.buildExtract(DstReg, PCReg, 0); 2074 return true; 2075 } 2076 2077 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2078 MachineInstr &MI, MachineRegisterInfo &MRI, 2079 MachineIRBuilder &B) const { 2080 Register DstReg = MI.getOperand(0).getReg(); 2081 LLT Ty = MRI.getType(DstReg); 2082 unsigned AS = Ty.getAddressSpace(); 2083 2084 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2085 MachineFunction &MF = B.getMF(); 2086 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2087 2088 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2089 if (!MFI->isEntryFunction()) { 2090 const Function &Fn = MF.getFunction(); 2091 DiagnosticInfoUnsupported BadLDSDecl( 2092 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2093 DS_Warning); 2094 Fn.getContext().diagnose(BadLDSDecl); 2095 2096 // We currently don't have a way to correctly allocate LDS objects that 2097 // aren't directly associated with a kernel. We do force inlining of 2098 // functions that use local objects. However, if these dead functions are 2099 // not eliminated, we don't want a compile time error. Just emit a warning 2100 // and a trap, since there should be no callable path here. 2101 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2102 B.buildUndef(DstReg); 2103 MI.eraseFromParent(); 2104 return true; 2105 } 2106 2107 // TODO: We could emit code to handle the initialization somewhere. 
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
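  // The mad/mac instructions G_FMAD would select to are not denormal-safe, so
  // the operation is only kept when the mode flushes denormals for that type
  // anyway; otherwise it is expanded through lowerFMad() below. As a rough
  // illustration (not an extra check performed here): with
  // "denormal-fp-math-f32"="preserve-sign" an f32 G_FMAD stays as-is, while
  // with "ieee" it is lowered to separate fmul and fadd.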
2187 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2188 return true; 2189 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2190 return true; 2191 2192 MachineIRBuilder HelperBuilder(MI); 2193 GISelObserverWrapper DummyObserver; 2194 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2195 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2196 } 2197 2198 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2199 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2200 Register DstReg = MI.getOperand(0).getReg(); 2201 Register PtrReg = MI.getOperand(1).getReg(); 2202 Register CmpVal = MI.getOperand(2).getReg(); 2203 Register NewVal = MI.getOperand(3).getReg(); 2204 2205 assert(SITargetLowering::isFlatGlobalAddrSpace( 2206 MRI.getType(PtrReg).getAddressSpace()) && 2207 "this should not have been custom lowered"); 2208 2209 LLT ValTy = MRI.getType(CmpVal); 2210 LLT VecTy = LLT::vector(2, ValTy); 2211 2212 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2213 2214 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2215 .addDef(DstReg) 2216 .addUse(PtrReg) 2217 .addUse(PackedVal) 2218 .setMemRefs(MI.memoperands()); 2219 2220 MI.eraseFromParent(); 2221 return true; 2222 } 2223 2224 bool AMDGPULegalizerInfo::legalizeFlog( 2225 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2226 Register Dst = MI.getOperand(0).getReg(); 2227 Register Src = MI.getOperand(1).getReg(); 2228 LLT Ty = B.getMRI()->getType(Dst); 2229 unsigned Flags = MI.getFlags(); 2230 2231 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2232 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2233 2234 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2235 MI.eraseFromParent(); 2236 return true; 2237 } 2238 2239 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2240 MachineIRBuilder &B) const { 2241 Register Dst = MI.getOperand(0).getReg(); 2242 Register Src = MI.getOperand(1).getReg(); 2243 unsigned Flags = MI.getFlags(); 2244 LLT Ty = B.getMRI()->getType(Dst); 2245 2246 auto K = B.buildFConstant(Ty, numbers::log2e); 2247 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2248 B.buildFExp2(Dst, Mul, Flags); 2249 MI.eraseFromParent(); 2250 return true; 2251 } 2252 2253 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2254 MachineIRBuilder &B) const { 2255 Register Dst = MI.getOperand(0).getReg(); 2256 Register Src0 = MI.getOperand(1).getReg(); 2257 Register Src1 = MI.getOperand(2).getReg(); 2258 unsigned Flags = MI.getFlags(); 2259 LLT Ty = B.getMRI()->getType(Dst); 2260 const LLT S16 = LLT::scalar(16); 2261 const LLT S32 = LLT::scalar(32); 2262 2263 if (Ty == S32) { 2264 auto Log = B.buildFLog2(S32, Src0, Flags); 2265 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2266 .addUse(Log.getReg(0)) 2267 .addUse(Src1) 2268 .setMIFlags(Flags); 2269 B.buildFExp2(Dst, Mul, Flags); 2270 } else if (Ty == S16) { 2271 // There's no f16 fmul_legacy, so we need to convert for it. 
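    // Both sizes use the identity pow(x, y) = exp2(y * log2(x)). The multiply
    // uses fmul_legacy so that y * log2(x) evaluates to 0 rather than NaN
    // when one factor is 0 and the other is infinite (e.g. pow(1.0, +inf),
    // where log2(x) == 0), keeping the final exp2 at the expected 1.0. For
    // f16 the operands are extended to f32 around the legacy multiply and the
    // result is truncated back afterwards.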
2272 auto Log = B.buildFLog2(S16, Src0, Flags); 2273 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2274 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2275 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2276 .addUse(Ext0.getReg(0)) 2277 .addUse(Ext1.getReg(0)) 2278 .setMIFlags(Flags); 2279 2280 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2281 } else 2282 return false; 2283 2284 MI.eraseFromParent(); 2285 return true; 2286 } 2287 2288 // Find a source register, ignoring any possible source modifiers. 2289 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2290 Register ModSrc = OrigSrc; 2291 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2292 ModSrc = SrcFNeg->getOperand(1).getReg(); 2293 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2294 ModSrc = SrcFAbs->getOperand(1).getReg(); 2295 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2296 ModSrc = SrcFAbs->getOperand(1).getReg(); 2297 return ModSrc; 2298 } 2299 2300 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2301 MachineRegisterInfo &MRI, 2302 MachineIRBuilder &B) const { 2303 2304 const LLT S1 = LLT::scalar(1); 2305 const LLT S64 = LLT::scalar(64); 2306 Register Dst = MI.getOperand(0).getReg(); 2307 Register OrigSrc = MI.getOperand(1).getReg(); 2308 unsigned Flags = MI.getFlags(); 2309 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2310 "this should not have been custom lowered"); 2311 2312 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2313 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2314 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2315 // V_FRACT bug is: 2316 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2317 // 2318 // Convert floor(x) to (x - fract(x)) 2319 2320 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2321 .addUse(OrigSrc) 2322 .setMIFlags(Flags); 2323 2324 // Give source modifier matching some assistance before obscuring a foldable 2325 // pattern. 2326 2327 // TODO: We can avoid the neg on the fract? The input sign to fract 2328 // shouldn't matter? 2329 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2330 2331 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2332 2333 Register Min = MRI.createGenericVirtualRegister(S64); 2334 2335 // We don't need to concern ourselves with the snan handling difference, so 2336 // use the one which will directly select. 2337 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2338 if (MFI->getMode().IEEE) 2339 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2340 else 2341 B.buildFMinNum(Min, Fract, Const, Flags); 2342 2343 Register CorrectedFract = Min; 2344 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2345 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2346 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2347 } 2348 2349 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2350 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2351 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 // Turn an illegal packed v2s16 build vector into bit operations. 2357 // TODO: This should probably be a bitcast action in LegalizerHelper. 
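// The emitted sequence is simply (illustrative MIR; register names invented):
//   %packed:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %packed:_(s32)
// i.e. the two halves are packed into an s32 and reinterpreted as the vector,
// matching how a v2s16 value occupies a single 32-bit register.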
2358 bool AMDGPULegalizerInfo::legalizeBuildVector( 2359 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2360 Register Dst = MI.getOperand(0).getReg(); 2361 const LLT S32 = LLT::scalar(32); 2362 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2363 2364 Register Src0 = MI.getOperand(1).getReg(); 2365 Register Src1 = MI.getOperand(2).getReg(); 2366 assert(MRI.getType(Src0) == LLT::scalar(16)); 2367 2368 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2369 B.buildBitcast(Dst, Merge); 2370 2371 MI.eraseFromParent(); 2372 return true; 2373 } 2374 2375 // Return the use branch instruction, otherwise null if the usage is invalid. 2376 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2377 MachineRegisterInfo &MRI, 2378 MachineInstr *&Br, 2379 MachineBasicBlock *&UncondBrTarget) { 2380 Register CondDef = MI.getOperand(0).getReg(); 2381 if (!MRI.hasOneNonDBGUse(CondDef)) 2382 return nullptr; 2383 2384 MachineBasicBlock *Parent = MI.getParent(); 2385 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2386 if (UseMI.getParent() != Parent || 2387 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2388 return nullptr; 2389 2390 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2391 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2392 if (Next == Parent->end()) { 2393 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2394 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2395 return nullptr; 2396 UncondBrTarget = &*NextMBB; 2397 } else { 2398 if (Next->getOpcode() != AMDGPU::G_BR) 2399 return nullptr; 2400 Br = &*Next; 2401 UncondBrTarget = Br->getOperand(0).getMBB(); 2402 } 2403 2404 return &UseMI; 2405 } 2406 2407 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2408 MachineRegisterInfo &MRI, 2409 Register LiveIn, 2410 Register PhyReg) const { 2411 assert(PhyReg.isPhysical() && "Physical register expected"); 2412 2413 // Insert the live-in copy, if required, by defining destination virtual 2414 // register. 2415 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  LLT ArgTy;
  std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
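    // A masked descriptor packs several small inputs into one 32-bit
    // register; the mask selects the field and its trailing-zero count gives
    // the shift. For a hypothetical mask of 0xFFC00 (bits [19:10]) this
    // expands to roughly:
    //   %shifted:_(s32) = G_LSHR %livein, 10
    //   %dst:_(s32) = G_AND %shifted, 0x3FF
    // which yields the 10-bit field zero-extended into the destination.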
2484 const LLT S32 = LLT::scalar(32); 2485 const unsigned Mask = Arg->getMask(); 2486 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2487 2488 Register AndMaskSrc = LiveIn; 2489 2490 if (Shift != 0) { 2491 auto ShiftAmt = B.buildConstant(S32, Shift); 2492 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2493 } 2494 2495 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2496 } else { 2497 B.buildCopy(DstReg, LiveIn); 2498 } 2499 2500 return true; 2501 } 2502 2503 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2504 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2505 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2506 2507 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2508 if (!Arg) 2509 return false; 2510 2511 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2512 return false; 2513 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2519 MachineRegisterInfo &MRI, 2520 MachineIRBuilder &B) const { 2521 Register Dst = MI.getOperand(0).getReg(); 2522 LLT DstTy = MRI.getType(Dst); 2523 LLT S16 = LLT::scalar(16); 2524 LLT S32 = LLT::scalar(32); 2525 LLT S64 = LLT::scalar(64); 2526 2527 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2528 return true; 2529 2530 if (DstTy == S16) 2531 return legalizeFDIV16(MI, MRI, B); 2532 if (DstTy == S32) 2533 return legalizeFDIV32(MI, MRI, B); 2534 if (DstTy == S64) 2535 return legalizeFDIV64(MI, MRI, B); 2536 2537 return false; 2538 } 2539 2540 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2541 Register DstReg, 2542 Register X, 2543 Register Y, 2544 bool IsDiv) const { 2545 const LLT S1 = LLT::scalar(1); 2546 const LLT S32 = LLT::scalar(32); 2547 2548 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2549 // algorithm used here. 2550 2551 // Initial estimate of inv(y). 2552 auto FloatY = B.buildUITOFP(S32, Y); 2553 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2554 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2555 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2556 auto Z = B.buildFPTOUI(S32, ScaledY); 2557 2558 // One round of UNR. 2559 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2560 auto NegYZ = B.buildMul(S32, NegY, Z); 2561 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2562 2563 // Quotient/remainder estimate. 2564 auto Q = B.buildUMulH(S32, X, Z); 2565 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2566 2567 // First quotient/remainder refinement. 2568 auto One = B.buildConstant(S32, 1); 2569 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2570 if (IsDiv) 2571 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2572 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2573 2574 // Second quotient/remainder refinement. 
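  // The single Newton-Raphson round above can leave the reciprocal estimate
  // slightly low, so the quotient estimate Q may undershoot the exact
  // quotient by a small amount (at most a couple of units in this sequence).
  // Each refinement round therefore conditionally bumps Q by one (or
  // subtracts Y from R) while R >= Y; two rounds suffice, per the derivation
  // referenced above in AMDGPUCodeGenPrepare::expandDivRem32.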
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 =
B.buildConstant(S32, 0); 2673 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2674 auto Add2_HiC = 2675 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2676 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2677 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2678 2679 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2680 Register NumerLo = UnmergeNumer.getReg(0); 2681 Register NumerHi = UnmergeNumer.getReg(1); 2682 2683 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2684 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2685 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2686 Register Mul3_Lo = UnmergeMul3.getReg(0); 2687 Register Mul3_Hi = UnmergeMul3.getReg(1); 2688 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2689 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2690 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2691 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2692 2693 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2694 Register DenomLo = UnmergeDenom.getReg(0); 2695 Register DenomHi = UnmergeDenom.getReg(1); 2696 2697 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2698 auto C1 = B.buildSExt(S32, CmpHi); 2699 2700 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2701 auto C2 = B.buildSExt(S32, CmpLo); 2702 2703 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2704 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2705 2706 // TODO: Here and below portions of the code can be enclosed into if/endif. 2707 // Currently control flow is unconditional and we have 4 selects after 2708 // potential endif to substitute PHIs. 2709 2710 // if C3 != 0 ... 2711 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2712 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2713 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2714 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2715 2716 auto One64 = B.buildConstant(S64, 1); 2717 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2718 2719 auto C4 = 2720 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2721 auto C5 = 2722 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2723 auto C6 = B.buildSelect( 2724 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2725 2726 // if (C6 != 0) 2727 auto Add4 = B.buildAdd(S64, Add3, One64); 2728 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2729 2730 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2731 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2732 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2733 2734 // endif C6 2735 // endif C3 2736 2737 if (IsDiv) { 2738 auto Sel1 = B.buildSelect( 2739 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2740 B.buildSelect(DstReg, 2741 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2742 } else { 2743 auto Sel2 = B.buildSelect( 2744 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2745 B.buildSelect(DstReg, 2746 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2747 } 2748 } 2749 2750 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2751 MachineRegisterInfo &MRI, 2752 MachineIRBuilder &B) const { 2753 const LLT S64 = LLT::scalar(64); 2754 const LLT S32 = LLT::scalar(32); 2755 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2756 Register DstReg = MI.getOperand(0).getReg(); 2757 Register Num 
= MI.getOperand(1).getReg(); 2758 Register Den = MI.getOperand(2).getReg(); 2759 LLT Ty = MRI.getType(DstReg); 2760 2761 if (Ty == S32) 2762 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2763 else if (Ty == S64) 2764 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2765 else 2766 return false; 2767 2768 MI.eraseFromParent(); 2769 return true; 2770 2771 } 2772 2773 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2774 MachineRegisterInfo &MRI, 2775 MachineIRBuilder &B) const { 2776 const LLT S64 = LLT::scalar(64); 2777 const LLT S32 = LLT::scalar(32); 2778 2779 Register DstReg = MI.getOperand(0).getReg(); 2780 const LLT Ty = MRI.getType(DstReg); 2781 if (Ty != S32 && Ty != S64) 2782 return false; 2783 2784 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2785 2786 Register LHS = MI.getOperand(1).getReg(); 2787 Register RHS = MI.getOperand(2).getReg(); 2788 2789 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2790 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2791 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2792 2793 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2794 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2795 2796 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2797 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2798 2799 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2800 if (Ty == S32) 2801 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2802 else 2803 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2804 2805 Register Sign; 2806 if (IsDiv) 2807 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2808 else 2809 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2810 2811 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2812 B.buildSub(DstReg, UDivRem, Sign); 2813 2814 MI.eraseFromParent(); 2815 return true; 2816 } 2817 2818 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2819 MachineRegisterInfo &MRI, 2820 MachineIRBuilder &B) const { 2821 Register Res = MI.getOperand(0).getReg(); 2822 Register LHS = MI.getOperand(1).getReg(); 2823 Register RHS = MI.getOperand(2).getReg(); 2824 2825 uint16_t Flags = MI.getFlags(); 2826 2827 LLT ResTy = MRI.getType(Res); 2828 LLT S32 = LLT::scalar(32); 2829 LLT S64 = LLT::scalar(64); 2830 2831 const MachineFunction &MF = B.getMF(); 2832 bool Unsafe = 2833 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2834 2835 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2836 return false; 2837 2838 if (!Unsafe && ResTy == S32 && 2839 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2840 return false; 2841 2842 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2843 // 1 / x -> RCP(x) 2844 if (CLHS->isExactlyValue(1.0)) { 2845 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2846 .addUse(RHS) 2847 .setMIFlags(Flags); 2848 2849 MI.eraseFromParent(); 2850 return true; 2851 } 2852 2853 // -1 / x -> RCP( FNEG(x) ) 2854 if (CLHS->isExactlyValue(-1.0)) { 2855 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2856 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2857 .addUse(FNeg.getReg(0)) 2858 .setMIFlags(Flags); 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 } 2863 } 2864 2865 // x / y -> x * (1.0 / y) 2866 if (Unsafe) { 2867 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2868 .addUse(RHS) 2869 .setMIFlags(Flags); 2870 B.buildFMul(Res, LHS, RCP, Flags); 2871 2872 MI.eraseFromParent(); 2873 return true; 2874 } 2875 2876 return false; 2877 } 2878 2879 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2880 MachineRegisterInfo &MRI, 2881 MachineIRBuilder &B) const { 2882 Register Res = MI.getOperand(0).getReg(); 2883 Register LHS = MI.getOperand(1).getReg(); 2884 Register RHS = MI.getOperand(2).getReg(); 2885 2886 uint16_t Flags = MI.getFlags(); 2887 2888 LLT S16 = LLT::scalar(16); 2889 LLT S32 = LLT::scalar(32); 2890 2891 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2892 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2893 2894 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2895 .addUse(RHSExt.getReg(0)) 2896 .setMIFlags(Flags); 2897 2898 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2899 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2900 2901 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2902 .addUse(RDst.getReg(0)) 2903 .addUse(RHS) 2904 .addUse(LHS) 2905 .setMIFlags(Flags); 2906 2907 MI.eraseFromParent(); 2908 return true; 2909 } 2910 2911 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2912 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2913 static void toggleSPDenormMode(bool Enable, 2914 MachineIRBuilder &B, 2915 const GCNSubtarget &ST, 2916 AMDGPU::SIModeRegisterDefaults Mode) { 2917 // Set SP denorm mode to this value. 2918 unsigned SPDenormMode = 2919 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2920 2921 if (ST.hasDenormModeInst()) { 2922 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2923 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2924 2925 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2926 B.buildInstr(AMDGPU::S_DENORM_MODE) 2927 .addImm(NewDenormModeValue); 2928 2929 } else { 2930 // Select FP32 bit field in mode register. 2931 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2932 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2933 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2934 2935 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2936 .addImm(SPDenormMode) 2937 .addImm(SPDenormModeBitField); 2938 } 2939 } 2940 2941 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2942 MachineRegisterInfo &MRI, 2943 MachineIRBuilder &B) const { 2944 Register Res = MI.getOperand(0).getReg(); 2945 Register LHS = MI.getOperand(1).getReg(); 2946 Register RHS = MI.getOperand(2).getReg(); 2947 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2948 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2949 2950 uint16_t Flags = MI.getFlags(); 2951 2952 LLT S32 = LLT::scalar(32); 2953 LLT S1 = LLT::scalar(1); 2954 2955 auto One = B.buildFConstant(S32, 1.0f); 2956 2957 auto DenominatorScaled = 2958 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2959 .addUse(LHS) 2960 .addUse(RHS) 2961 .addImm(0) 2962 .setMIFlags(Flags); 2963 auto NumeratorScaled = 2964 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2965 .addUse(LHS) 2966 .addUse(RHS) 2967 .addImm(1) 2968 .setMIFlags(Flags); 2969 2970 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2971 .addUse(DenominatorScaled.getReg(0)) 2972 .setMIFlags(Flags); 2973 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2974 2975 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2976 // aren't modeled as reading it. 
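  // The block below is the usual scaled Newton-Raphson division sequence:
  // div_scale pre-scales the operands to keep intermediates in range, the
  // first two FMAs refine the rcp approximation of 1/denominator, the
  // remaining FMAs refine the quotient estimate, and div_fmas/div_fixup undo
  // the scaling and patch up the special cases (NaNs, infinities, zero
  // denominators). The intermediate FMAs can produce denormal values even for
  // normal inputs, which is why FP32 denormal support is temporarily enabled
  // around them when the function's mode would otherwise flush denormals.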
2977 if (!Mode.allFP32Denormals()) 2978 toggleSPDenormMode(true, B, ST, Mode); 2979 2980 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2981 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2982 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2983 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2984 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2985 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2986 2987 if (!Mode.allFP32Denormals()) 2988 toggleSPDenormMode(false, B, ST, Mode); 2989 2990 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2991 .addUse(Fma4.getReg(0)) 2992 .addUse(Fma1.getReg(0)) 2993 .addUse(Fma3.getReg(0)) 2994 .addUse(NumeratorScaled.getReg(1)) 2995 .setMIFlags(Flags); 2996 2997 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2998 .addUse(Fmas.getReg(0)) 2999 .addUse(RHS) 3000 .addUse(LHS) 3001 .setMIFlags(Flags); 3002 3003 MI.eraseFromParent(); 3004 return true; 3005 } 3006 3007 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3008 MachineRegisterInfo &MRI, 3009 MachineIRBuilder &B) const { 3010 Register Res = MI.getOperand(0).getReg(); 3011 Register LHS = MI.getOperand(1).getReg(); 3012 Register RHS = MI.getOperand(2).getReg(); 3013 3014 uint16_t Flags = MI.getFlags(); 3015 3016 LLT S64 = LLT::scalar(64); 3017 LLT S1 = LLT::scalar(1); 3018 3019 auto One = B.buildFConstant(S64, 1.0); 3020 3021 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3022 .addUse(LHS) 3023 .addUse(RHS) 3024 .addImm(0) 3025 .setMIFlags(Flags); 3026 3027 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3028 3029 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3030 .addUse(DivScale0.getReg(0)) 3031 .setMIFlags(Flags); 3032 3033 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3034 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3035 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3036 3037 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3038 .addUse(LHS) 3039 .addUse(RHS) 3040 .addImm(1) 3041 .setMIFlags(Flags); 3042 3043 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3044 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3045 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3046 3047 Register Scale; 3048 if (!ST.hasUsableDivScaleConditionOutput()) { 3049 // Workaround a hardware bug on SI where the condition output from div_scale 3050 // is not usable. 
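    // Instead of reading the broken condition bit, recompute it from the
    // data: if div_scale actually rescaled an operand, its result no longer
    // matches that operand, which is detectable by comparing the high 32
    // bits. The xor of the two comparisons below reconstructs the flag that
    // div_fmas expects (mirroring the equivalent workaround in the DAG
    // lowering).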
3051 3052 LLT S32 = LLT::scalar(32); 3053 3054 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3055 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3056 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3057 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3058 3059 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3060 Scale1Unmerge.getReg(1)); 3061 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3062 Scale0Unmerge.getReg(1)); 3063 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3064 } else { 3065 Scale = DivScale1.getReg(1); 3066 } 3067 3068 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3069 .addUse(Fma4.getReg(0)) 3070 .addUse(Fma3.getReg(0)) 3071 .addUse(Mul.getReg(0)) 3072 .addUse(Scale) 3073 .setMIFlags(Flags); 3074 3075 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3076 .addUse(Fmas.getReg(0)) 3077 .addUse(RHS) 3078 .addUse(LHS) 3079 .setMIFlags(Flags); 3080 3081 MI.eraseFromParent(); 3082 return true; 3083 } 3084 3085 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3086 MachineRegisterInfo &MRI, 3087 MachineIRBuilder &B) const { 3088 Register Res = MI.getOperand(0).getReg(); 3089 Register LHS = MI.getOperand(2).getReg(); 3090 Register RHS = MI.getOperand(3).getReg(); 3091 uint16_t Flags = MI.getFlags(); 3092 3093 LLT S32 = LLT::scalar(32); 3094 LLT S1 = LLT::scalar(1); 3095 3096 auto Abs = B.buildFAbs(S32, RHS, Flags); 3097 const APFloat C0Val(1.0f); 3098 3099 auto C0 = B.buildConstant(S32, 0x6f800000); 3100 auto C1 = B.buildConstant(S32, 0x2f800000); 3101 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3102 3103 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3104 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3105 3106 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3107 3108 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3109 .addUse(Mul0.getReg(0)) 3110 .setMIFlags(Flags); 3111 3112 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3113 3114 B.buildFMul(Res, Sel, Mul1, Flags); 3115 3116 MI.eraseFromParent(); 3117 return true; 3118 } 3119 3120 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3121 MachineRegisterInfo &MRI, 3122 MachineIRBuilder &B) const { 3123 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3124 uint64_t Offset = 3125 ST.getTargetLowering()->getImplicitParameterOffset( 3126 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3127 LLT DstTy = MRI.getType(DstReg); 3128 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3129 3130 const ArgDescriptor *Arg; 3131 const TargetRegisterClass *RC; 3132 LLT ArgTy; 3133 std::tie(Arg, RC, ArgTy) = 3134 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3135 if (!Arg) 3136 return false; 3137 3138 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3139 if (!loadInputValue(KernargPtrReg, B, Arg)) 3140 return false; 3141 3142 // FIXME: This should be nuw 3143 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3144 return true; 3145 } 3146 3147 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3148 MachineRegisterInfo &MRI, 3149 MachineIRBuilder &B) const { 3150 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3151 if (!MFI->isEntryFunction()) { 3152 return legalizePreloadedArgIntrin(MI, MRI, B, 3153 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3154 } 3155 3156 Register DstReg = MI.getOperand(0).getReg(); 3157 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3158 return false; 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3165 MachineRegisterInfo &MRI, 3166 MachineIRBuilder &B, 3167 unsigned AddrSpace) const { 3168 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3169 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3170 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3171 MI.eraseFromParent(); 3172 return true; 3173 } 3174 3175 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3176 // offset (the offset that is included in bounds checking and swizzling, to be 3177 // split between the instruction's voffset and immoffset fields) and soffset 3178 // (the offset that is excluded from bounds checking and swizzling, to go in 3179 // the instruction's soffset field). This function takes the first kind of 3180 // offset and figures out how to split it between voffset and immoffset. 3181 std::tuple<Register, unsigned, unsigned> 3182 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3183 Register OrigOffset) const { 3184 const unsigned MaxImm = 4095; 3185 Register BaseReg; 3186 unsigned TotalConstOffset; 3187 MachineInstr *OffsetDef; 3188 const LLT S32 = LLT::scalar(32); 3189 3190 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3191 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3192 3193 unsigned ImmOffset = TotalConstOffset; 3194 3195 // If the immediate value is too big for the immoffset field, put the value 3196 // and -4096 into the immoffset field so that the value that is copied/added 3197 // for the voffset field is a multiple of 4096, and it stands more chance 3198 // of being CSEd with the copy/add for another similar load/store. 3199 // However, do not do that rounding down to a multiple of 4096 if that is a 3200 // negative number, as it appears to be illegal to have a negative offset 3201 // in the vgpr, even if adding the immediate offset makes it positive. 3202 unsigned Overflow = ImmOffset & ~MaxImm; 3203 ImmOffset -= Overflow; 3204 if ((int32_t)Overflow < 0) { 3205 Overflow += ImmOffset; 3206 ImmOffset = 0; 3207 } 3208 3209 if (Overflow != 0) { 3210 if (!BaseReg) { 3211 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3212 } else { 3213 auto OverflowVal = B.buildConstant(S32, Overflow); 3214 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3215 } 3216 } 3217 3218 if (!BaseReg) 3219 BaseReg = B.buildConstant(S32, 0).getReg(0); 3220 3221 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3222 } 3223 3224 /// Handle register layout difference for f16 images for some subtargets. 
3225 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3226 MachineRegisterInfo &MRI, 3227 Register Reg) const { 3228 if (!ST.hasUnpackedD16VMem()) 3229 return Reg; 3230 3231 const LLT S16 = LLT::scalar(16); 3232 const LLT S32 = LLT::scalar(32); 3233 LLT StoreVT = MRI.getType(Reg); 3234 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3235 3236 auto Unmerge = B.buildUnmerge(S16, Reg); 3237 3238 SmallVector<Register, 4> WideRegs; 3239 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3240 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3241 3242 int NumElts = StoreVT.getNumElements(); 3243 3244 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3245 } 3246 3247 Register AMDGPULegalizerInfo::fixStoreSourceType( 3248 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3249 MachineRegisterInfo *MRI = B.getMRI(); 3250 LLT Ty = MRI->getType(VData); 3251 3252 const LLT S16 = LLT::scalar(16); 3253 3254 // Fixup illegal register types for i8 stores. 3255 if (Ty == LLT::scalar(8) || Ty == S16) { 3256 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3257 return AnyExt; 3258 } 3259 3260 if (Ty.isVector()) { 3261 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3262 if (IsFormat) 3263 return handleD16VData(B, *MRI, VData); 3264 } 3265 } 3266 3267 return VData; 3268 } 3269 3270 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3271 MachineRegisterInfo &MRI, 3272 MachineIRBuilder &B, 3273 bool IsTyped, 3274 bool IsFormat) const { 3275 Register VData = MI.getOperand(1).getReg(); 3276 LLT Ty = MRI.getType(VData); 3277 LLT EltTy = Ty.getScalarType(); 3278 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3279 const LLT S32 = LLT::scalar(32); 3280 3281 VData = fixStoreSourceType(B, VData, IsFormat); 3282 Register RSrc = MI.getOperand(2).getReg(); 3283 3284 MachineMemOperand *MMO = *MI.memoperands_begin(); 3285 const int MemSize = MMO->getSize(); 3286 3287 unsigned ImmOffset; 3288 unsigned TotalOffset; 3289 3290 // The typed intrinsics add an immediate after the registers. 3291 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3292 3293 // The struct intrinsic variants add one additional operand over raw. 3294 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3295 Register VIndex; 3296 int OpOffset = 0; 3297 if (HasVIndex) { 3298 VIndex = MI.getOperand(3).getReg(); 3299 OpOffset = 1; 3300 } 3301 3302 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3303 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3304 3305 unsigned Format = 0; 3306 if (IsTyped) { 3307 Format = MI.getOperand(5 + OpOffset).getImm(); 3308 ++OpOffset; 3309 } 3310 3311 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3312 3313 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3314 if (TotalOffset != 0) 3315 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3316 3317 unsigned Opc; 3318 if (IsTyped) { 3319 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3320 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3321 } else if (IsFormat) { 3322 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3323 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3324 } else { 3325 switch (MemSize) { 3326 case 1: 3327 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3328 break; 3329 case 2: 3330 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3331 break; 3332 default: 3333 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3334 break; 3335 } 3336 } 3337 3338 if (!VIndex) 3339 VIndex = B.buildConstant(S32, 0).getReg(0); 3340 3341 auto MIB = B.buildInstr(Opc) 3342 .addUse(VData) // vdata 3343 .addUse(RSrc) // rsrc 3344 .addUse(VIndex) // vindex 3345 .addUse(VOffset) // voffset 3346 .addUse(SOffset) // soffset 3347 .addImm(ImmOffset); // offset(imm) 3348 3349 if (IsTyped) 3350 MIB.addImm(Format); 3351 3352 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3353 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3354 .addMemOperand(MMO); 3355 3356 MI.eraseFromParent(); 3357 return true; 3358 } 3359 3360 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3361 MachineRegisterInfo &MRI, 3362 MachineIRBuilder &B, 3363 bool IsFormat, 3364 bool IsTyped) const { 3365 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3366 MachineMemOperand *MMO = *MI.memoperands_begin(); 3367 const int MemSize = MMO->getSize(); 3368 const LLT S32 = LLT::scalar(32); 3369 3370 Register Dst = MI.getOperand(0).getReg(); 3371 Register RSrc = MI.getOperand(2).getReg(); 3372 3373 // The typed intrinsics add an immediate after the registers. 3374 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3375 3376 // The struct intrinsic variants add one additional operand over raw. 3377 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3378 Register VIndex; 3379 int OpOffset = 0; 3380 if (HasVIndex) { 3381 VIndex = MI.getOperand(3).getReg(); 3382 OpOffset = 1; 3383 } 3384 3385 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3386 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3387 3388 unsigned Format = 0; 3389 if (IsTyped) { 3390 Format = MI.getOperand(5 + OpOffset).getImm(); 3391 ++OpOffset; 3392 } 3393 3394 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3395 unsigned ImmOffset; 3396 unsigned TotalOffset; 3397 3398 LLT Ty = MRI.getType(Dst); 3399 LLT EltTy = Ty.getScalarType(); 3400 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3401 const bool Unpacked = ST.hasUnpackedD16VMem(); 3402 3403 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3404 if (TotalOffset != 0) 3405 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3406 3407 unsigned Opc; 3408 3409 if (IsTyped) { 3410 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3411 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3412 } else if (IsFormat) { 3413 Opc = IsD16 ? 
  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back to the
    // requested type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

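// Lower llvm.amdgcn.atomic.inc/dec to the corresponding target atomic
// pseudos, keeping the original memory operands.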
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
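  // Operand order consumed below (raw form): dst, intrinsic ID, vdata,
  // [cmp,] rsrc, voffset, soffset, aux; the struct form carries an extra
  // vindex operand after rsrc.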
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Repack the address operands of \p MI in the range [\p AddrIdx, \p EndIdx)
/// into dword sized <2 x s16> registers, appending them to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int EndIdx,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < EndIdx; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
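    // For example, five s32 address components are padded with three undefs
    // and built into an 8-element vector.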
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) =
      getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16-bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

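  // Every handled image intrinsic is rewritten to one of the two generic
  // image pseudos: stores have no results, while loads and atomics define at
  // least one register.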
  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // TFE is enabled but dmask is 0, so an error flag result is still expected.
  // Force dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // The target must support the feature and the gradients need to be
      // 16-bit too.
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

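    // Pack the 16-bit address components into <2 x s16> registers. With A16
    // every address component is packed; with only G16 just the gradients are
    // packed and the remaining 32-bit components are appended unpacked.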
    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
        IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
                                  PackEndIdx, NumGradients);

      if (!IsA16) {
        // Add the uncompressed addresses.
        for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers
    // contiguously without introducing moves, then using the non-sequential
    // address encoding is always preferable, since it saves VALU instructions
    // and is usually a wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after
    // register allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

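  // Determine the data type actually loaded from memory; the dmask selects
  // which components are returned, so this may be narrower than the IR result
  // type.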
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it
  // turns out this needs to be converted to a vector load during
  // RegBankSelect.
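  // Round the result up to the next power-of-two size: add elements for
  // vectors, otherwise widen the scalar.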
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // On the non-HSA path, or if the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert the
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On the non-HSA path, or if the trap handler is disabled, report a warning
  // instead.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND of the control flow intrinsic result with the
  // exec-manipulating branch pseudos.
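  // verifyCFIntrinsic locates that branch so that SI_IF, SI_ELSE, and SI_LOOP
  // can be emitted in its place below.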
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}