//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
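/// For example, <5 x s16> also matches and is normally widened by one element
/// to <6 x s16>, while <4 x s16> and <3 x s32> are left alone.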
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
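    // (That is, reaching this branch implies the clamp bit is available even
    // though hasIntClamp() is not re-checked here.)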
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
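  // (The custom action is expected to keep G_FMAD only when the relevant
  // denormals are flushed, since the mad/mac instructions do not handle
  // denormals; otherwise it is lowered to fmul + fadd.)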
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);


  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1737 case TargetOpcode::G_UDIV: 1738 case TargetOpcode::G_UREM: 1739 case TargetOpcode::G_UDIVREM: 1740 return legalizeUnsignedDIV_REM(MI, MRI, B); 1741 case TargetOpcode::G_SDIV: 1742 case TargetOpcode::G_SREM: 1743 case TargetOpcode::G_SDIVREM: 1744 return legalizeSignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_ATOMIC_CMPXCHG: 1746 return legalizeAtomicCmpXChg(MI, MRI, B); 1747 case TargetOpcode::G_FLOG: 1748 return legalizeFlog(MI, B, numbers::ln2f); 1749 case TargetOpcode::G_FLOG10: 1750 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1751 case TargetOpcode::G_FEXP: 1752 return legalizeFExp(MI, B); 1753 case TargetOpcode::G_FPOW: 1754 return legalizeFPow(MI, B); 1755 case TargetOpcode::G_FFLOOR: 1756 return legalizeFFloor(MI, MRI, B); 1757 case TargetOpcode::G_BUILD_VECTOR: 1758 return legalizeBuildVector(MI, MRI, B); 1759 case TargetOpcode::G_CTLZ: 1760 case TargetOpcode::G_CTTZ: 1761 return legalizeCTLZ_CTTZ(MI, MRI, B); 1762 default: 1763 return false; 1764 } 1765 1766 llvm_unreachable("expected switch to return"); 1767 } 1768 1769 Register AMDGPULegalizerInfo::getSegmentAperture( 1770 unsigned AS, 1771 MachineRegisterInfo &MRI, 1772 MachineIRBuilder &B) const { 1773 MachineFunction &MF = B.getMF(); 1774 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1775 const LLT S32 = LLT::scalar(32); 1776 1777 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1778 1779 if (ST.hasApertureRegs()) { 1780 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1781 // getreg. 1782 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1783 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1784 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1785 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1786 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1788 unsigned Encoding = 1789 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1790 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1791 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1792 1793 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1794 1795 B.buildInstr(AMDGPU::S_GETREG_B32) 1796 .addDef(GetReg) 1797 .addImm(Encoding); 1798 MRI.setType(GetReg, S32); 1799 1800 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1801 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1802 } 1803 1804 Register QueuePtr = MRI.createGenericVirtualRegister( 1805 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1806 1807 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1808 return Register(); 1809 1810 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1811 // private_segment_aperture_base_hi. 1812 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1813 1814 // TODO: can we be smarter about machine pointer info? 1815 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1816 MachineMemOperand *MMO = MF.getMachineMemOperand( 1817 PtrInfo, 1818 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1819 MachineMemOperand::MOInvariant, 1820 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1821 1822 Register LoadAddr; 1823 1824 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1825 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1826 } 1827 1828 /// Return true if the value is a known valid address, such that a null check is 1829 /// not necessary. 
/// not necessary.
1830 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1831 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1832 MachineInstr *Def = MRI.getVRegDef(Val); 1833 switch (Def->getOpcode()) { 1834 case AMDGPU::G_FRAME_INDEX: 1835 case AMDGPU::G_GLOBAL_VALUE: 1836 case AMDGPU::G_BLOCK_ADDR: 1837 return true; 1838 case AMDGPU::G_CONSTANT: { 1839 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1840 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1841 } 1842 default: 1843 return false; 1844 } 1845 1846 return false; 1847 } 1848 1849 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B) const { 1852 MachineFunction &MF = B.getMF(); 1853 1854 const LLT S32 = LLT::scalar(32); 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src = MI.getOperand(1).getReg(); 1857 1858 LLT DstTy = MRI.getType(Dst); 1859 LLT SrcTy = MRI.getType(Src); 1860 unsigned DestAS = DstTy.getAddressSpace(); 1861 unsigned SrcAS = SrcTy.getAddressSpace(); 1862 1863 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1864 // vector element. 1865 assert(!DstTy.isVector()); 1866 1867 const AMDGPUTargetMachine &TM 1868 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1869 1870 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1871 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1872 return true; 1873 } 1874 1875 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1876 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1877 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1878 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1879 // Extract low 32-bits of the pointer. 1880 B.buildExtract(Dst, Src, 0); 1881 MI.eraseFromParent(); 1882 return true; 1883 } 1884 1885 unsigned NullVal = TM.getNullPointerValue(DestAS); 1886 1887 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1888 auto FlatNull = B.buildConstant(SrcTy, 0); 1889 1890 // Extract low 32-bits of the pointer. 1891 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1892 1893 auto CmpRes = 1894 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1895 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1896 1897 MI.eraseFromParent(); 1898 return true; 1899 } 1900 1901 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1902 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1903 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1904 if (!ST.hasFlatAddressSpace()) 1905 return false; 1906 1907 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1908 if (!ApertureReg.isValid()) 1909 return false; 1910 1911 // Coerce the type of the low half of the result so we can use merge_values. 1912 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1913 1914 // TODO: Should we allow mismatched types but matching sizes in merges to 1915 // avoid the ptrtoint? 1916 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1917 1918 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1919 B.buildCopy(Dst, BuildPtr); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1925 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1926 1927 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1928 SegmentNull.getReg(0)); 1929 1930 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1931 1932 MI.eraseFromParent(); 1933 return true; 1934 } 1935 1936 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1937 SrcTy.getSizeInBits() == 64) { 1938 // Truncate. 
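// Only the low 32 bits of the pointer are meaningful in the 32-bit constant
// address space; the reverse cast below reattaches the known high bits from
// the function info.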
1939 B.buildExtract(Dst, Src, 0); 1940 MI.eraseFromParent(); 1941 return true; 1942 } 1943 1944 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1945 DstTy.getSizeInBits() == 64) { 1946 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1947 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1948 1949 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1950 // another. Merge operands are required to be the same type, but creating an 1951 // extra ptrtoint would be kind of pointless. 1952 auto HighAddr = B.buildConstant( 1953 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1954 B.buildMerge(Dst, {Src, HighAddr}); 1955 MI.eraseFromParent(); 1956 return true; 1957 } 1958 1959 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1960 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 1961 1962 LLVMContext &Ctx = MF.getFunction().getContext(); 1963 Ctx.diagnose(InvalidAddrSpaceCast); 1964 B.buildUndef(Dst); 1965 MI.eraseFromParent(); 1966 return true; 1967 } 1968 1969 bool AMDGPULegalizerInfo::legalizeFrint( 1970 MachineInstr &MI, MachineRegisterInfo &MRI, 1971 MachineIRBuilder &B) const { 1972 Register Src = MI.getOperand(1).getReg(); 1973 LLT Ty = MRI.getType(Src); 1974 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1975 1976 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1977 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1978 1979 auto C1 = B.buildFConstant(Ty, C1Val); 1980 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1981 1982 // TODO: Should this propagate fast-math-flags? 1983 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1984 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1985 1986 auto C2 = B.buildFConstant(Ty, C2Val); 1987 auto Fabs = B.buildFAbs(Ty, Src); 1988 1989 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1990 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1991 MI.eraseFromParent(); 1992 return true; 1993 } 1994 1995 bool AMDGPULegalizerInfo::legalizeFceil( 1996 MachineInstr &MI, MachineRegisterInfo &MRI, 1997 MachineIRBuilder &B) const { 1998 1999 const LLT S1 = LLT::scalar(1); 2000 const LLT S64 = LLT::scalar(64); 2001 2002 Register Src = MI.getOperand(1).getReg(); 2003 assert(MRI.getType(Src) == S64); 2004 2005 // result = trunc(src) 2006 // if (src > 0.0 && src != result) 2007 // result += 1.0 2008 2009 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2010 2011 const auto Zero = B.buildFConstant(S64, 0.0); 2012 const auto One = B.buildFConstant(S64, 1.0); 2013 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2014 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2015 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2016 auto Add = B.buildSelect(S64, And, One, Zero); 2017 2018 // TODO: Should this propagate fast-math-flags? 
2019 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2020 MI.eraseFromParent(); 2021 return true; 2022 } 2023 2024 bool AMDGPULegalizerInfo::legalizeFrem( 2025 MachineInstr &MI, MachineRegisterInfo &MRI, 2026 MachineIRBuilder &B) const { 2027 Register DstReg = MI.getOperand(0).getReg(); 2028 Register Src0Reg = MI.getOperand(1).getReg(); 2029 Register Src1Reg = MI.getOperand(2).getReg(); 2030 auto Flags = MI.getFlags(); 2031 LLT Ty = MRI.getType(DstReg); 2032 2033 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2034 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2035 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2036 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2037 MI.eraseFromParent(); 2038 return true; 2039 } 2040 2041 static MachineInstrBuilder extractF64Exponent(Register Hi, 2042 MachineIRBuilder &B) { 2043 const unsigned FractBits = 52; 2044 const unsigned ExpBits = 11; 2045 LLT S32 = LLT::scalar(32); 2046 2047 auto Const0 = B.buildConstant(S32, FractBits - 32); 2048 auto Const1 = B.buildConstant(S32, ExpBits); 2049 2050 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2051 .addUse(Hi) 2052 .addUse(Const0.getReg(0)) 2053 .addUse(Const1.getReg(0)); 2054 2055 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2056 } 2057 2058 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2059 MachineInstr &MI, MachineRegisterInfo &MRI, 2060 MachineIRBuilder &B) const { 2061 const LLT S1 = LLT::scalar(1); 2062 const LLT S32 = LLT::scalar(32); 2063 const LLT S64 = LLT::scalar(64); 2064 2065 Register Src = MI.getOperand(1).getReg(); 2066 assert(MRI.getType(Src) == S64); 2067 2068 // TODO: Should this use extract since the low half is unused? 2069 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2070 Register Hi = Unmerge.getReg(1); 2071 2072 // Extract the upper half, since this is where we will find the sign and 2073 // exponent. 2074 auto Exp = extractF64Exponent(Hi, B); 2075 2076 const unsigned FractBits = 52; 2077 2078 // Extract the sign bit. 2079 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2080 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2081 2082 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2083 2084 const auto Zero32 = B.buildConstant(S32, 0); 2085 2086 // Extend back to 64-bits. 2087 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2088 2089 auto Shr = B.buildAShr(S64, FractMask, Exp); 2090 auto Not = B.buildNot(S64, Shr); 2091 auto Tmp0 = B.buildAnd(S64, Src, Not); 2092 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2093 2094 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2095 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2096 2097 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2098 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2099 MI.eraseFromParent(); 2100 return true; 2101 } 2102 2103 bool AMDGPULegalizerInfo::legalizeITOFP( 2104 MachineInstr &MI, MachineRegisterInfo &MRI, 2105 MachineIRBuilder &B, bool Signed) const { 2106 2107 Register Dst = MI.getOperand(0).getReg(); 2108 Register Src = MI.getOperand(1).getReg(); 2109 2110 const LLT S64 = LLT::scalar(64); 2111 const LLT S32 = LLT::scalar(32); 2112 2113 assert(MRI.getType(Src) == S64); 2114 2115 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2116 auto ThirtyTwo = B.buildConstant(S32, 32); 2117 2118 if (MRI.getType(Dst) == S64) { 2119 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2120 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2121 2122 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2123 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2124 .addUse(CvtHi.getReg(0)) 2125 .addUse(ThirtyTwo.getReg(0)); 2126 2127 // TODO: Should this propagate fast-math-flags? 2128 B.buildFAdd(Dst, LdExp, CvtLo); 2129 MI.eraseFromParent(); 2130 return true; 2131 } 2132 2133 assert(MRI.getType(Dst) == S32); 2134 2135 auto One = B.buildConstant(S32, 1); 2136 2137 MachineInstrBuilder ShAmt; 2138 if (Signed) { 2139 auto ThirtyOne = B.buildConstant(S32, 31); 2140 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2141 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2142 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2143 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2144 /*HasSideEffects=*/false) 2145 .addUse(Unmerge.getReg(1)); 2146 auto LS2 = B.buildSub(S32, LS, One); 2147 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2148 } else 2149 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2150 auto Norm = B.buildShl(S64, Src, ShAmt); 2151 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2152 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2153 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2154 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2155 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2156 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2157 /*HasSideEffects=*/false) 2158 .addUse(FVal.getReg(0)) 2159 .addUse(Scale.getReg(0)); 2160 MI.eraseFromParent(); 2161 return true; 2162 } 2163 2164 // TODO: Copied from DAG implementation. Verify logic and document how this 2165 // actually works. 2166 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2167 MachineRegisterInfo &MRI, 2168 MachineIRBuilder &B, 2169 bool Signed) const { 2170 2171 Register Dst = MI.getOperand(0).getReg(); 2172 Register Src = MI.getOperand(1).getReg(); 2173 2174 const LLT S64 = LLT::scalar(64); 2175 const LLT S32 = LLT::scalar(32); 2176 2177 const LLT SrcLT = MRI.getType(Src); 2178 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2179 2180 unsigned Flags = MI.getFlags(); 2181 2182 // The basic idea of converting a floating point number into a pair of 32-bit 2183 // integers is illustrated as follows: 2184 // 2185 // tf := trunc(val); 2186 // hif := floor(tf * 2^-32); 2187 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2188 // hi := fptoi(hif); 2189 // lo := fptoi(lof); 2190 // 2191 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2192 MachineInstrBuilder Sign; 2193 if (Signed && SrcLT == S32) { 2194 // However, a 32-bit floating point number has only 23 bits mantissa and 2195 // it's not enough to hold all the significant bits of `lof` if val is 2196 // negative. To avoid the loss of precision, We need to take the absolute 2197 // value after truncating and flip the result back based on the original 2198 // signedness. 
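// The sign computed below is an all-ones mask for a negative input and zero
// otherwise; it is widened to 64 bits later and used to conditionally negate
// the merged result via xor/sub.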
2199 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2200 Trunc = B.buildFAbs(S32, Trunc, Flags); 2201 } 2202 MachineInstrBuilder K0, K1; 2203 if (SrcLT == S64) { 2204 K0 = B.buildFConstant(S64, 2205 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2206 K1 = B.buildFConstant(S64, 2207 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2208 } else { 2209 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2210 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2211 } 2212 2213 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2214 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2215 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2216 2217 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2218 : B.buildFPTOUI(S32, FloorMul); 2219 auto Lo = B.buildFPTOUI(S32, Fma); 2220 2221 if (Signed && SrcLT == S32) { 2222 // Flip the result based on the signedness, which is either all 0s or 1s. 2223 Sign = B.buildMerge(S64, {Sign, Sign}); 2224 // r := xor({lo, hi}, sign) - sign; 2225 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2226 } else 2227 B.buildMerge(Dst, {Lo, Hi}); 2228 MI.eraseFromParent(); 2229 2230 return true; 2231 } 2232 2233 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2234 MachineInstr &MI) const { 2235 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2236 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2237 2238 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2239 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2240 2241 // With ieee_mode disabled, the instructions have the correct behavior 2242 // already for G_FMINNUM/G_FMAXNUM 2243 if (!MFI->getMode().IEEE) 2244 return !IsIEEEOp; 2245 2246 if (IsIEEEOp) 2247 return true; 2248 2249 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2250 } 2251 2252 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2253 MachineInstr &MI, MachineRegisterInfo &MRI, 2254 MachineIRBuilder &B) const { 2255 // TODO: Should move some of this into LegalizerHelper. 2256 2257 // TODO: Promote dynamic indexing of s16 to s32 2258 2259 // FIXME: Artifact combiner probably should have replaced the truncated 2260 // constant before this, so we shouldn't need 2261 // getIConstantVRegValWithLookThrough. 2262 Optional<ValueAndVReg> MaybeIdxVal = 2263 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2264 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2265 return true; 2266 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2267 2268 Register Dst = MI.getOperand(0).getReg(); 2269 Register Vec = MI.getOperand(1).getReg(); 2270 2271 LLT VecTy = MRI.getType(Vec); 2272 LLT EltTy = VecTy.getElementType(); 2273 assert(EltTy == MRI.getType(Dst)); 2274 2275 if (IdxVal < VecTy.getNumElements()) { 2276 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2277 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2278 } else { 2279 B.buildUndef(Dst); 2280 } 2281 2282 MI.eraseFromParent(); 2283 return true; 2284 } 2285 2286 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2287 MachineInstr &MI, MachineRegisterInfo &MRI, 2288 MachineIRBuilder &B) const { 2289 // TODO: Should move some of this into LegalizerHelper. 2290 2291 // TODO: Promote dynamic indexing of s16 to s32 2292 2293 // FIXME: Artifact combiner probably should have replaced the truncated 2294 // constant before this, so we shouldn't need 2295 // getIConstantVRegValWithLookThrough. 
// getIConstantVRegValWithLookThrough.
2296 Optional<ValueAndVReg> MaybeIdxVal = 2297 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2298 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2299 return true; 2300 2301 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2302 Register Dst = MI.getOperand(0).getReg(); 2303 Register Vec = MI.getOperand(1).getReg(); 2304 Register Ins = MI.getOperand(2).getReg(); 2305 2306 LLT VecTy = MRI.getType(Vec); 2307 LLT EltTy = VecTy.getElementType(); 2308 assert(EltTy == MRI.getType(Ins)); 2309 (void)Ins; 2310 2311 unsigned NumElts = VecTy.getNumElements(); 2312 if (IdxVal < NumElts) { 2313 SmallVector<Register, 8> SrcRegs; 2314 for (unsigned i = 0; i < NumElts; ++i) 2315 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2316 B.buildUnmerge(SrcRegs, Vec); 2317 2318 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2319 B.buildMerge(Dst, SrcRegs); 2320 } else { 2321 B.buildUndef(Dst); 2322 } 2323 2324 MI.eraseFromParent(); 2325 return true; 2326 } 2327 2328 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2329 MachineInstr &MI, MachineRegisterInfo &MRI, 2330 MachineIRBuilder &B) const { 2331 const LLT V2S16 = LLT::fixed_vector(2, 16); 2332 2333 Register Dst = MI.getOperand(0).getReg(); 2334 Register Src0 = MI.getOperand(1).getReg(); 2335 LLT DstTy = MRI.getType(Dst); 2336 LLT SrcTy = MRI.getType(Src0); 2337 2338 if (SrcTy == V2S16 && DstTy == V2S16 && 2339 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2340 return true; 2341 2342 MachineIRBuilder HelperBuilder(MI); 2343 GISelObserverWrapper DummyObserver; 2344 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2345 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2346 } 2347 2348 bool AMDGPULegalizerInfo::legalizeSinCos( 2349 MachineInstr &MI, MachineRegisterInfo &MRI, 2350 MachineIRBuilder &B) const { 2351 2352 Register DstReg = MI.getOperand(0).getReg(); 2353 Register SrcReg = MI.getOperand(1).getReg(); 2354 LLT Ty = MRI.getType(DstReg); 2355 unsigned Flags = MI.getFlags(); 2356 2357 Register TrigVal; 2358 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2359 if (ST.hasTrigReducedRange()) { 2360 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2361 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2362 .addUse(MulVal.getReg(0)) 2363 .setMIFlags(Flags).getReg(0); 2364 } else 2365 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2366 2367 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2368 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2369 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2370 .addUse(TrigVal) 2371 .setMIFlags(Flags); 2372 MI.eraseFromParent(); 2373 return true; 2374 } 2375 2376 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2377 MachineIRBuilder &B, 2378 const GlobalValue *GV, 2379 int64_t Offset, 2380 unsigned GAFlags) const { 2381 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2382 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2383 // to the following code sequence: 2384 // 2385 // For constant address space: 2386 // s_getpc_b64 s[0:1] 2387 // s_add_u32 s0, s0, $symbol 2388 // s_addc_u32 s1, s1, 0 2389 // 2390 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2391 // a fixup or relocation is emitted to replace $symbol with a literal 2392 // constant, which is a pc-relative offset from the encoding of the $symbol 2393 // operand to the global variable. 2394 // 2395 // For global address space: 2396 // s_getpc_b64 s[0:1] 2397 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2398 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2399 // 2400 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2401 // fixups or relocations are emitted to replace $symbol@*@lo and 2402 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2403 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2404 // operand to the global variable. 2405 // 2406 // What we want here is an offset from the value returned by s_getpc 2407 // (which is the address of the s_add_u32 instruction) to the global 2408 // variable, but since the encoding of $symbol starts 4 bytes after the start 2409 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2410 // small. This requires us to add 4 to the global variable offset in order to 2411 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2412 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2413 // instruction. 2414 2415 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2416 2417 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2418 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2419 2420 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2421 .addDef(PCReg); 2422 2423 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2424 if (GAFlags == SIInstrInfo::MO_NONE) 2425 MIB.addImm(0); 2426 else 2427 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2428 2429 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2430 2431 if (PtrTy.getSizeInBits() == 32) 2432 B.buildExtract(DstReg, PCReg, 0); 2433 return true; 2434 } 2435 2436 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2437 MachineInstr &MI, MachineRegisterInfo &MRI, 2438 MachineIRBuilder &B) const { 2439 Register DstReg = MI.getOperand(0).getReg(); 2440 LLT Ty = MRI.getType(DstReg); 2441 unsigned AS = Ty.getAddressSpace(); 2442 2443 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2444 MachineFunction &MF = B.getMF(); 2445 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2446 2447 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2448 if (!MFI->isModuleEntryFunction() && 2449 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2450 const Function &Fn = MF.getFunction(); 2451 DiagnosticInfoUnsupported BadLDSDecl( 2452 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2453 DS_Warning); 2454 Fn.getContext().diagnose(BadLDSDecl); 2455 2456 // We currently don't have a way to correctly allocate LDS objects that 2457 // aren't directly associated with a kernel. We do force inlining of 2458 // functions that use local objects. However, if these dead functions are 2459 // not eliminated, we don't want a compile time error. Just emit a warning 2460 // and a trap, since there should be no callable path here. 2461 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2462 B.buildUndef(DstReg); 2463 MI.eraseFromParent(); 2464 return true; 2465 } 2466 2467 // TODO: We could emit code to handle the initialization somewhere. 2468 // We ignore the initializer for now and legalize it to allow selection. 2469 // The initializer will anyway get errored out during assembly emission. 2470 const SITargetLowering *TLI = ST.getTargetLowering(); 2471 if (!TLI->shouldUseLDSConstAddress(GV)) { 2472 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2473 return true; // Leave in place; 2474 } 2475 2476 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2477 Type *Ty = GV->getValueType(); 2478 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2479 // zero-sized type in other languages to declare the dynamic shared 2480 // memory which size is not known at the compile time. They will be 2481 // allocated by the runtime and placed directly after the static 2482 // allocated ones. They all share the same offset. 2483 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2484 // Adjust alignment for that dynamic shared memory array. 
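// The dynamic array then starts right at the end of the statically allocated
// LDS, so its address is just the group static size queried below.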
2485 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2486 LLT S32 = LLT::scalar(32); 2487 auto Sz = 2488 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2489 B.buildIntToPtr(DstReg, Sz); 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 } 2494 2495 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2496 *cast<GlobalVariable>(GV))); 2497 MI.eraseFromParent(); 2498 return true; 2499 } 2500 2501 const SITargetLowering *TLI = ST.getTargetLowering(); 2502 2503 if (TLI->shouldEmitFixup(GV)) { 2504 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2505 MI.eraseFromParent(); 2506 return true; 2507 } 2508 2509 if (TLI->shouldEmitPCReloc(GV)) { 2510 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2511 MI.eraseFromParent(); 2512 return true; 2513 } 2514 2515 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2516 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2517 2518 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2519 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2520 MachinePointerInfo::getGOT(MF), 2521 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2522 MachineMemOperand::MOInvariant, 2523 LoadTy, Align(8)); 2524 2525 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2526 2527 if (Ty.getSizeInBits() == 32) { 2528 // Truncate if this is a 32-bit constant address. 2529 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2530 B.buildExtract(DstReg, Load, 0); 2531 } else 2532 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2533 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 2538 static LLT widenToNextPowerOf2(LLT Ty) { 2539 if (Ty.isVector()) 2540 return Ty.changeElementCount( 2541 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2542 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2543 } 2544 2545 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2546 MachineInstr &MI) const { 2547 MachineIRBuilder &B = Helper.MIRBuilder; 2548 MachineRegisterInfo &MRI = *B.getMRI(); 2549 GISelChangeObserver &Observer = Helper.Observer; 2550 2551 Register PtrReg = MI.getOperand(1).getReg(); 2552 LLT PtrTy = MRI.getType(PtrReg); 2553 unsigned AddrSpace = PtrTy.getAddressSpace(); 2554 2555 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2556 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2557 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2558 Observer.changingInstr(MI); 2559 MI.getOperand(1).setReg(Cast.getReg(0)); 2560 Observer.changedInstr(MI); 2561 return true; 2562 } 2563 2564 if (MI.getOpcode() != AMDGPU::G_LOAD) 2565 return false; 2566 2567 Register ValReg = MI.getOperand(0).getReg(); 2568 LLT ValTy = MRI.getType(ValReg); 2569 2570 MachineMemOperand *MMO = *MI.memoperands_begin(); 2571 const unsigned ValSize = ValTy.getSizeInBits(); 2572 const LLT MemTy = MMO->getMemoryType(); 2573 const Align MemAlign = MMO->getAlign(); 2574 const unsigned MemSize = MemTy.getSizeInBits(); 2575 const uint64_t AlignInBits = 8 * MemAlign.value(); 2576 2577 // Widen non-power-of-2 loads to the alignment if needed 2578 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2579 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2580 2581 // This was already the correct extending load result type, so just adjust 2582 // the memory type. 
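// For example, a G_LOAD producing s32 from a 24-bit memory type lands here:
// only the MMO is widened to 32 bits and no new instructions are needed.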
2583 if (WideMemSize == ValSize) { 2584 MachineFunction &MF = B.getMF(); 2585 2586 MachineMemOperand *WideMMO = 2587 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2588 Observer.changingInstr(MI); 2589 MI.setMemRefs(MF, {WideMMO}); 2590 Observer.changedInstr(MI); 2591 return true; 2592 } 2593 2594 // Don't bother handling edge case that should probably never be produced. 2595 if (ValSize > WideMemSize) 2596 return false; 2597 2598 LLT WideTy = widenToNextPowerOf2(ValTy); 2599 2600 Register WideLoad; 2601 if (!WideTy.isVector()) { 2602 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2603 B.buildTrunc(ValReg, WideLoad).getReg(0); 2604 } else { 2605 // Extract the subvector. 2606 2607 if (isRegisterType(ValTy)) { 2608 // If this a case where G_EXTRACT is legal, use it. 2609 // (e.g. <3 x s32> -> <4 x s32>) 2610 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2611 B.buildExtract(ValReg, WideLoad, 0); 2612 } else { 2613 // For cases where the widened type isn't a nice register value, unmerge 2614 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2615 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2616 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2617 } 2618 } 2619 2620 MI.eraseFromParent(); 2621 return true; 2622 } 2623 2624 return false; 2625 } 2626 2627 bool AMDGPULegalizerInfo::legalizeFMad( 2628 MachineInstr &MI, MachineRegisterInfo &MRI, 2629 MachineIRBuilder &B) const { 2630 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2631 assert(Ty.isScalar()); 2632 2633 MachineFunction &MF = B.getMF(); 2634 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2635 2636 // TODO: Always legal with future ftz flag. 2637 // FIXME: Do we need just output? 2638 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2639 return true; 2640 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2641 return true; 2642 2643 MachineIRBuilder HelperBuilder(MI); 2644 GISelObserverWrapper DummyObserver; 2645 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2646 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2647 } 2648 2649 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2650 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2651 Register DstReg = MI.getOperand(0).getReg(); 2652 Register PtrReg = MI.getOperand(1).getReg(); 2653 Register CmpVal = MI.getOperand(2).getReg(); 2654 Register NewVal = MI.getOperand(3).getReg(); 2655 2656 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2657 "this should not have been custom lowered"); 2658 2659 LLT ValTy = MRI.getType(CmpVal); 2660 LLT VecTy = LLT::fixed_vector(2, ValTy); 2661 2662 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2663 2664 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2665 .addDef(DstReg) 2666 .addUse(PtrReg) 2667 .addUse(PackedVal) 2668 .setMemRefs(MI.memoperands()); 2669 2670 MI.eraseFromParent(); 2671 return true; 2672 } 2673 2674 bool AMDGPULegalizerInfo::legalizeFlog( 2675 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2676 Register Dst = MI.getOperand(0).getReg(); 2677 Register Src = MI.getOperand(1).getReg(); 2678 LLT Ty = B.getMRI()->getType(Dst); 2679 unsigned Flags = MI.getFlags(); 2680 2681 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2682 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2683 2684 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2685 
MI.eraseFromParent(); 2686 return true; 2687 } 2688 2689 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2690 MachineIRBuilder &B) const { 2691 Register Dst = MI.getOperand(0).getReg(); 2692 Register Src = MI.getOperand(1).getReg(); 2693 unsigned Flags = MI.getFlags(); 2694 LLT Ty = B.getMRI()->getType(Dst); 2695 2696 auto K = B.buildFConstant(Ty, numbers::log2e); 2697 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2698 B.buildFExp2(Dst, Mul, Flags); 2699 MI.eraseFromParent(); 2700 return true; 2701 } 2702 2703 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2704 MachineIRBuilder &B) const { 2705 Register Dst = MI.getOperand(0).getReg(); 2706 Register Src0 = MI.getOperand(1).getReg(); 2707 Register Src1 = MI.getOperand(2).getReg(); 2708 unsigned Flags = MI.getFlags(); 2709 LLT Ty = B.getMRI()->getType(Dst); 2710 const LLT S16 = LLT::scalar(16); 2711 const LLT S32 = LLT::scalar(32); 2712 2713 if (Ty == S32) { 2714 auto Log = B.buildFLog2(S32, Src0, Flags); 2715 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2716 .addUse(Log.getReg(0)) 2717 .addUse(Src1) 2718 .setMIFlags(Flags); 2719 B.buildFExp2(Dst, Mul, Flags); 2720 } else if (Ty == S16) { 2721 // There's no f16 fmul_legacy, so we need to convert for it. 2722 auto Log = B.buildFLog2(S16, Src0, Flags); 2723 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2724 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2725 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2726 .addUse(Ext0.getReg(0)) 2727 .addUse(Ext1.getReg(0)) 2728 .setMIFlags(Flags); 2729 2730 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2731 } else 2732 return false; 2733 2734 MI.eraseFromParent(); 2735 return true; 2736 } 2737 2738 // Find a source register, ignoring any possible source modifiers. 2739 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2740 Register ModSrc = OrigSrc; 2741 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2742 ModSrc = SrcFNeg->getOperand(1).getReg(); 2743 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2744 ModSrc = SrcFAbs->getOperand(1).getReg(); 2745 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2746 ModSrc = SrcFAbs->getOperand(1).getReg(); 2747 return ModSrc; 2748 } 2749 2750 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2751 MachineRegisterInfo &MRI, 2752 MachineIRBuilder &B) const { 2753 2754 const LLT S1 = LLT::scalar(1); 2755 const LLT S64 = LLT::scalar(64); 2756 Register Dst = MI.getOperand(0).getReg(); 2757 Register OrigSrc = MI.getOperand(1).getReg(); 2758 unsigned Flags = MI.getFlags(); 2759 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2760 "this should not have been custom lowered"); 2761 2762 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2763 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2764 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2765 // V_FRACT bug is: 2766 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2767 // 2768 // Convert floor(x) to (x - fract(x)) 2769 2770 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2771 .addUse(OrigSrc) 2772 .setMIFlags(Flags); 2773 2774 // Give source modifier matching some assistance before obscuring a foldable 2775 // pattern. 2776 2777 // TODO: We can avoid the neg on the fract? The input sign to fract 2778 // shouldn't matter? 
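// fneg/fabs do not change whether a value is a NaN, so the NaN check below
// can safely use the stripped source while leaving the modifiers in place to
// fold into the fract operand.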
2779 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2780 2781 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2782 2783 Register Min = MRI.createGenericVirtualRegister(S64); 2784 2785 // We don't need to concern ourselves with the snan handling difference, so 2786 // use the one which will directly select. 2787 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2788 if (MFI->getMode().IEEE) 2789 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2790 else 2791 B.buildFMinNum(Min, Fract, Const, Flags); 2792 2793 Register CorrectedFract = Min; 2794 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2795 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2796 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2797 } 2798 2799 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2800 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2801 2802 MI.eraseFromParent(); 2803 return true; 2804 } 2805 2806 // Turn an illegal packed v2s16 build vector into bit operations. 2807 // TODO: This should probably be a bitcast action in LegalizerHelper. 2808 bool AMDGPULegalizerInfo::legalizeBuildVector( 2809 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2810 Register Dst = MI.getOperand(0).getReg(); 2811 const LLT S32 = LLT::scalar(32); 2812 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2813 2814 Register Src0 = MI.getOperand(1).getReg(); 2815 Register Src1 = MI.getOperand(2).getReg(); 2816 assert(MRI.getType(Src0) == LLT::scalar(16)); 2817 2818 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2819 B.buildBitcast(Dst, Merge); 2820 2821 MI.eraseFromParent(); 2822 return true; 2823 } 2824 2825 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2826 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2827 // case with a single min instruction instead of a compare+select. 2828 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2829 MachineRegisterInfo &MRI, 2830 MachineIRBuilder &B) const { 2831 Register Dst = MI.getOperand(0).getReg(); 2832 Register Src = MI.getOperand(1).getReg(); 2833 LLT DstTy = MRI.getType(Dst); 2834 LLT SrcTy = MRI.getType(Src); 2835 2836 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2837 ? AMDGPU::G_AMDGPU_FFBH_U32 2838 : AMDGPU::G_AMDGPU_FFBL_B32; 2839 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2840 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2841 2842 MI.eraseFromParent(); 2843 return true; 2844 } 2845 2846 // Check that this is a G_XOR x, -1 2847 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2848 if (MI.getOpcode() != TargetOpcode::G_XOR) 2849 return false; 2850 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2851 return ConstVal && *ConstVal == -1; 2852 } 2853 2854 // Return the use branch instruction, otherwise null if the usage is invalid. 
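// The expected pattern is the intrinsic's condition feeding a G_BRCOND in the
// same block, possibly through a single G_XOR with -1, where the conditional
// branch is either followed by a G_BR or falls through to the next block.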
2855 static MachineInstr * 2856 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2857 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2858 Register CondDef = MI.getOperand(0).getReg(); 2859 if (!MRI.hasOneNonDBGUse(CondDef)) 2860 return nullptr; 2861 2862 MachineBasicBlock *Parent = MI.getParent(); 2863 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2864 2865 if (isNot(MRI, *UseMI)) { 2866 Register NegatedCond = UseMI->getOperand(0).getReg(); 2867 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2868 return nullptr; 2869 2870 // We're deleting the def of this value, so we need to remove it. 2871 eraseInstr(*UseMI, MRI); 2872 2873 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2874 Negated = true; 2875 } 2876 2877 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2878 return nullptr; 2879 2880 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2881 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2882 if (Next == Parent->end()) { 2883 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2884 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2885 return nullptr; 2886 UncondBrTarget = &*NextMBB; 2887 } else { 2888 if (Next->getOpcode() != AMDGPU::G_BR) 2889 return nullptr; 2890 Br = &*Next; 2891 UncondBrTarget = Br->getOperand(0).getMBB(); 2892 } 2893 2894 return UseMI; 2895 } 2896 2897 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2898 const ArgDescriptor *Arg, 2899 const TargetRegisterClass *ArgRC, 2900 LLT ArgTy) const { 2901 MCRegister SrcReg = Arg->getRegister(); 2902 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2903 assert(DstReg.isVirtual() && "Virtual register expected"); 2904 2905 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2906 *ArgRC, B.getDebugLoc(), ArgTy); 2907 if (Arg->isMasked()) { 2908 // TODO: Should we try to emit this once in the entry block? 2909 const LLT S32 = LLT::scalar(32); 2910 const unsigned Mask = Arg->getMask(); 2911 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2912 2913 Register AndMaskSrc = LiveIn; 2914 2915 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2916 // 0. 2917 if (Shift != 0) { 2918 auto ShiftAmt = B.buildConstant(S32, Shift); 2919 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2920 } 2921 2922 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2923 } else { 2924 B.buildCopy(DstReg, LiveIn); 2925 } 2926 2927 return true; 2928 } 2929 2930 bool AMDGPULegalizerInfo::loadInputValue( 2931 Register DstReg, MachineIRBuilder &B, 2932 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2933 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2934 const ArgDescriptor *Arg; 2935 const TargetRegisterClass *ArgRC; 2936 LLT ArgTy; 2937 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2938 2939 if (!Arg) { 2940 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2941 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2942 // case the pointer argument may be missing and we use null. 2943 B.buildConstant(DstReg, 0); 2944 return true; 2945 } 2946 2947 // It's undefined behavior if a function marked with the amdgpu-no-* 2948 // attributes uses the corresponding intrinsic. 
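// Since such a use is undefined behavior, producing an undef value here is
// sufficient.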
2949 B.buildUndef(DstReg); 2950 return true; 2951 } 2952 2953 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2954 return false; // TODO: Handle these 2955 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2956 } 2957 2958 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2959 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2960 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2961 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2962 return false; 2963 2964 MI.eraseFromParent(); 2965 return true; 2966 } 2967 2968 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2969 MachineRegisterInfo &MRI, 2970 MachineIRBuilder &B) const { 2971 Register Dst = MI.getOperand(0).getReg(); 2972 LLT DstTy = MRI.getType(Dst); 2973 LLT S16 = LLT::scalar(16); 2974 LLT S32 = LLT::scalar(32); 2975 LLT S64 = LLT::scalar(64); 2976 2977 if (DstTy == S16) 2978 return legalizeFDIV16(MI, MRI, B); 2979 if (DstTy == S32) 2980 return legalizeFDIV32(MI, MRI, B); 2981 if (DstTy == S64) 2982 return legalizeFDIV64(MI, MRI, B); 2983 2984 return false; 2985 } 2986 2987 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2988 Register DstDivReg, 2989 Register DstRemReg, 2990 Register X, 2991 Register Y) const { 2992 const LLT S1 = LLT::scalar(1); 2993 const LLT S32 = LLT::scalar(32); 2994 2995 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2996 // algorithm used here. 2997 2998 // Initial estimate of inv(y). 2999 auto FloatY = B.buildUITOFP(S32, Y); 3000 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3001 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3002 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3003 auto Z = B.buildFPTOUI(S32, ScaledY); 3004 3005 // One round of UNR. 3006 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3007 auto NegYZ = B.buildMul(S32, NegY, Z); 3008 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3009 3010 // Quotient/remainder estimate. 3011 auto Q = B.buildUMulH(S32, X, Z); 3012 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3013 3014 // First quotient/remainder refinement. 3015 auto One = B.buildConstant(S32, 1); 3016 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3017 if (DstDivReg) 3018 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3019 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3020 3021 // Second quotient/remainder refinement. 
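// The refinement steps only ever add one to the quotient and subtract the
// divisor from the remainder, correcting an estimate that may be slightly low
// (see AMDGPUCodeGenPrepare::expandDivRem32 for the full derivation).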
3022 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3023 if (DstDivReg) 3024 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3025 3026 if (DstRemReg) 3027 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3028 } 3029 3030 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3031 // 3032 // Return lo, hi of result 3033 // 3034 // %cvt.lo = G_UITOFP Val.lo 3035 // %cvt.hi = G_UITOFP Val.hi 3036 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3037 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3038 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3039 // %mul2 = G_FMUL %mul1, 2**(-32) 3040 // %trunc = G_INTRINSIC_TRUNC %mul2 3041 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3042 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3043 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3044 Register Val) { 3045 const LLT S32 = LLT::scalar(32); 3046 auto Unmerge = B.buildUnmerge(S32, Val); 3047 3048 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3049 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3050 3051 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3052 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3053 3054 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3055 auto Mul1 = 3056 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3057 3058 // 2**(-32) 3059 auto Mul2 = 3060 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3061 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3062 3063 // -(2**32) 3064 auto Mad2 = B.buildFMAD(S32, Trunc, 3065 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3066 3067 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3068 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3069 3070 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3071 } 3072 3073 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3074 Register DstDivReg, 3075 Register DstRemReg, 3076 Register Numer, 3077 Register Denom) const { 3078 const LLT S32 = LLT::scalar(32); 3079 const LLT S64 = LLT::scalar(64); 3080 const LLT S1 = LLT::scalar(1); 3081 Register RcpLo, RcpHi; 3082 3083 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3084 3085 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3086 3087 auto Zero64 = B.buildConstant(S64, 0); 3088 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3089 3090 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3091 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3092 3093 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3094 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3095 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3096 3097 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3098 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3099 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3100 3101 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3102 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3103 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3104 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3105 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3106 3107 auto Zero32 = B.buildConstant(S32, 0); 3108 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3109 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3110 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3111 3112 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3113 Register NumerLo = UnmergeNumer.getReg(0); 3114 Register NumerHi = UnmergeNumer.getReg(1); 3115 3116 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3117 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3118 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3119 Register Mul3_Lo = UnmergeMul3.getReg(0); 3120 Register Mul3_Hi = UnmergeMul3.getReg(1); 3121 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3122 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3123 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3124 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3125 3126 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3127 Register DenomLo = UnmergeDenom.getReg(0); 3128 Register DenomHi = UnmergeDenom.getReg(1); 3129 3130 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3131 auto C1 = B.buildSExt(S32, CmpHi); 3132 3133 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3134 auto C2 = B.buildSExt(S32, CmpLo); 3135 3136 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3137 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3138 3139 // TODO: Here and below portions of the code can be enclosed into if/endif. 3140 // Currently control flow is unconditional and we have 4 selects after 3141 // potential endif to substitute PHIs. 3142 3143 // if C3 != 0 ... 3144 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3145 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3146 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3147 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3148 3149 auto One64 = B.buildConstant(S64, 1); 3150 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3151 3152 auto C4 = 3153 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3154 auto C5 = 3155 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3156 auto C6 = B.buildSelect( 3157 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3158 3159 // if (C6 != 0) 3160 auto Add4 = B.buildAdd(S64, Add3, One64); 3161 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3162 3163 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3164 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3165 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3166 3167 // endif C6 3168 // endif C3 3169 3170 if (DstDivReg) { 3171 auto Sel1 = B.buildSelect( 3172 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3173 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3174 Sel1, MulHi3); 3175 } 3176 3177 if (DstRemReg) { 3178 auto Sel2 = B.buildSelect( 3179 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3180 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3181 Sel2, Sub1); 3182 } 3183 } 3184 3185 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3186 MachineRegisterInfo &MRI, 3187 MachineIRBuilder &B) const { 3188 Register DstDivReg, DstRemReg; 3189 switch (MI.getOpcode()) { 3190 default: 3191 llvm_unreachable("Unexpected opcode!"); 3192 case AMDGPU::G_UDIV: { 3193 DstDivReg = MI.getOperand(0).getReg(); 3194 break; 3195 } 3196 case AMDGPU::G_UREM: { 3197 DstRemReg = MI.getOperand(0).getReg(); 3198 break; 3199 } 3200 case AMDGPU::G_UDIVREM: { 3201 DstDivReg = MI.getOperand(0).getReg(); 3202 DstRemReg = MI.getOperand(1).getReg(); 3203 break; 3204 } 3205 } 3206 3207 const LLT S64 = LLT::scalar(64); 3208 const LLT S32 = LLT::scalar(32); 3209 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3210 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3211 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3212 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3213 3214 if (Ty == S32) 3215 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3216 else if (Ty == S64) 3217 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3218 else 3219 return false; 3220 3221 MI.eraseFromParent(); 3222 return true; 3223 } 3224 3225 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3226 MachineRegisterInfo &MRI, 3227 MachineIRBuilder &B) const { 3228 const LLT S64 = LLT::scalar(64); 3229 const LLT S32 = LLT::scalar(32); 3230 3231 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3232 if (Ty != S32 && Ty != S64) 3233 return false; 3234 3235 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3236 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3237 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3238 3239 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3240 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3241 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3242 3243 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3244 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3245 3246 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3247 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3248 3249 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3250 switch (MI.getOpcode()) { 3251 default: 3252 llvm_unreachable("Unexpected opcode!"); 3253 case AMDGPU::G_SDIV: { 3254 DstDivReg = MI.getOperand(0).getReg(); 3255 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3256 break; 3257 } 3258 case AMDGPU::G_SREM: { 3259 DstRemReg = MI.getOperand(0).getReg(); 3260 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3261 break; 3262 } 3263 case AMDGPU::G_SDIVREM: { 3264 DstDivReg = MI.getOperand(0).getReg(); 3265 DstRemReg = MI.getOperand(1).getReg(); 3266 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3267 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3268 break; 3269 } 3270 } 3271 3272 if (Ty == S32) 3273 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3274 else 3275 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3276 3277 if (DstDivReg) { 3278 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3279 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3280 B.buildSub(DstDivReg, SignXor, Sign); 3281 } 3282 3283 if (DstRemReg) { 3284 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3285 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3286 B.buildSub(DstRemReg, SignXor, Sign); 3287 } 3288 3289 MI.eraseFromParent(); 3290 return true; 3291 } 3292 3293 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3294 MachineRegisterInfo &MRI, 3295 MachineIRBuilder &B) const { 3296 Register Res = MI.getOperand(0).getReg(); 3297 Register LHS = MI.getOperand(1).getReg(); 3298 Register RHS = MI.getOperand(2).getReg(); 3299 uint16_t Flags = MI.getFlags(); 3300 LLT ResTy = MRI.getType(Res); 3301 3302 const MachineFunction &MF = B.getMF(); 3303 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3304 MI.getFlag(MachineInstr::FmAfn); 3305 3306 if (!AllowInaccurateRcp) 3307 return false; 3308 3309 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3310 // 1 / x -> RCP(x) 3311 if (CLHS->isExactlyValue(1.0)) { 3312 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3313 .addUse(RHS) 3314 .setMIFlags(Flags); 3315 3316 MI.eraseFromParent(); 3317 return true; 3318 } 3319 3320 // -1 / x -> RCP( FNEG(x) ) 3321 if (CLHS->isExactlyValue(-1.0)) { 3322 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
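// Taking the reciprocal of the negated operand yields -1/x directly, so no
// extra negation of the result is needed.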
3323 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3324 .addUse(FNeg.getReg(0)) 3325 .setMIFlags(Flags); 3326 3327 MI.eraseFromParent(); 3328 return true; 3329 } 3330 } 3331 3332 // x / y -> x * (1.0 / y) 3333 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3334 .addUse(RHS) 3335 .setMIFlags(Flags); 3336 B.buildFMul(Res, LHS, RCP, Flags); 3337 3338 MI.eraseFromParent(); 3339 return true; 3340 } 3341 3342 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3343 MachineRegisterInfo &MRI, 3344 MachineIRBuilder &B) const { 3345 Register Res = MI.getOperand(0).getReg(); 3346 Register X = MI.getOperand(1).getReg(); 3347 Register Y = MI.getOperand(2).getReg(); 3348 uint16_t Flags = MI.getFlags(); 3349 LLT ResTy = MRI.getType(Res); 3350 3351 const MachineFunction &MF = B.getMF(); 3352 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3353 MI.getFlag(MachineInstr::FmAfn); 3354 3355 if (!AllowInaccurateRcp) 3356 return false; 3357 3358 auto NegY = B.buildFNeg(ResTy, Y); 3359 auto One = B.buildFConstant(ResTy, 1.0); 3360 3361 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3362 .addUse(Y) 3363 .setMIFlags(Flags); 3364 3365 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3366 R = B.buildFMA(ResTy, Tmp0, R, R); 3367 3368 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3369 R = B.buildFMA(ResTy, Tmp1, R, R); 3370 3371 auto Ret = B.buildFMul(ResTy, X, R); 3372 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3373 3374 B.buildFMA(Res, Tmp2, R, Ret); 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3380 MachineRegisterInfo &MRI, 3381 MachineIRBuilder &B) const { 3382 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3383 return true; 3384 3385 Register Res = MI.getOperand(0).getReg(); 3386 Register LHS = MI.getOperand(1).getReg(); 3387 Register RHS = MI.getOperand(2).getReg(); 3388 3389 uint16_t Flags = MI.getFlags(); 3390 3391 LLT S16 = LLT::scalar(16); 3392 LLT S32 = LLT::scalar(32); 3393 3394 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3395 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3396 3397 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3398 .addUse(RHSExt.getReg(0)) 3399 .setMIFlags(Flags); 3400 3401 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3402 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3403 3404 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3405 .addUse(RDst.getReg(0)) 3406 .addUse(RHS) 3407 .addUse(LHS) 3408 .setMIFlags(Flags); 3409 3410 MI.eraseFromParent(); 3411 return true; 3412 } 3413 3414 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3415 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3416 static void toggleSPDenormMode(bool Enable, 3417 MachineIRBuilder &B, 3418 const GCNSubtarget &ST, 3419 AMDGPU::SIModeRegisterDefaults Mode) { 3420 // Set SP denorm mode to this value. 3421 unsigned SPDenormMode = 3422 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3423 3424 if (ST.hasDenormModeInst()) { 3425 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3426 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3427 3428 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3429 B.buildInstr(AMDGPU::S_DENORM_MODE) 3430 .addImm(NewDenormModeValue); 3431 3432 } else { 3433 // Select FP32 bit field in mode register. 
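    // Without S_DENORM_MODE, write the 2-bit FP32 denorm field of the MODE
    // hardware register (offset 4, width 2; WIDTH_M1 encodes width - 1) via
    // S_SETREG.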
3434 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3435 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3436 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3437 3438 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3439 .addImm(SPDenormMode) 3440 .addImm(SPDenormModeBitField); 3441 } 3442 } 3443 3444 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3445 MachineRegisterInfo &MRI, 3446 MachineIRBuilder &B) const { 3447 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3448 return true; 3449 3450 Register Res = MI.getOperand(0).getReg(); 3451 Register LHS = MI.getOperand(1).getReg(); 3452 Register RHS = MI.getOperand(2).getReg(); 3453 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3454 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3455 3456 uint16_t Flags = MI.getFlags(); 3457 3458 LLT S32 = LLT::scalar(32); 3459 LLT S1 = LLT::scalar(1); 3460 3461 auto One = B.buildFConstant(S32, 1.0f); 3462 3463 auto DenominatorScaled = 3464 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3465 .addUse(LHS) 3466 .addUse(RHS) 3467 .addImm(0) 3468 .setMIFlags(Flags); 3469 auto NumeratorScaled = 3470 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3471 .addUse(LHS) 3472 .addUse(RHS) 3473 .addImm(1) 3474 .setMIFlags(Flags); 3475 3476 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3477 .addUse(DenominatorScaled.getReg(0)) 3478 .setMIFlags(Flags); 3479 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3480 3481 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3482 // aren't modeled as reading it. 3483 if (!Mode.allFP32Denormals()) 3484 toggleSPDenormMode(true, B, ST, Mode); 3485 3486 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3487 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3488 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3489 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3490 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3491 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3492 3493 if (!Mode.allFP32Denormals()) 3494 toggleSPDenormMode(false, B, ST, Mode); 3495 3496 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3497 .addUse(Fma4.getReg(0)) 3498 .addUse(Fma1.getReg(0)) 3499 .addUse(Fma3.getReg(0)) 3500 .addUse(NumeratorScaled.getReg(1)) 3501 .setMIFlags(Flags); 3502 3503 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3504 .addUse(Fmas.getReg(0)) 3505 .addUse(RHS) 3506 .addUse(LHS) 3507 .setMIFlags(Flags); 3508 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3514 MachineRegisterInfo &MRI, 3515 MachineIRBuilder &B) const { 3516 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3517 return true; 3518 3519 Register Res = MI.getOperand(0).getReg(); 3520 Register LHS = MI.getOperand(1).getReg(); 3521 Register RHS = MI.getOperand(2).getReg(); 3522 3523 uint16_t Flags = MI.getFlags(); 3524 3525 LLT S64 = LLT::scalar(64); 3526 LLT S1 = LLT::scalar(1); 3527 3528 auto One = B.buildFConstant(S64, 1.0); 3529 3530 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3531 .addUse(LHS) 3532 .addUse(RHS) 3533 .addImm(0) 3534 .setMIFlags(Flags); 3535 3536 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3537 3538 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3539 .addUse(DivScale0.getReg(0)) 3540 .setMIFlags(Flags); 3541 3542 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3543 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3544 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3545 3546 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3547 .addUse(LHS) 3548 .addUse(RHS) 3549 .addImm(1) 3550 .setMIFlags(Flags); 3551 3552 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3553 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3554 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3555 3556 Register Scale; 3557 if (!ST.hasUsableDivScaleConditionOutput()) { 3558 // Workaround a hardware bug on SI where the condition output from div_scale 3559 // is not usable. 3560 3561 LLT S32 = LLT::scalar(32); 3562 3563 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3564 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3565 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3566 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3567 3568 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3569 Scale1Unmerge.getReg(1)); 3570 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3571 Scale0Unmerge.getReg(1)); 3572 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3573 } else { 3574 Scale = DivScale1.getReg(1); 3575 } 3576 3577 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3578 .addUse(Fma4.getReg(0)) 3579 .addUse(Fma3.getReg(0)) 3580 .addUse(Mul.getReg(0)) 3581 .addUse(Scale) 3582 .setMIFlags(Flags); 3583 3584 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3585 .addUse(Fmas.getReg(0)) 3586 .addUse(RHS) 3587 .addUse(LHS) 3588 .setMIFlags(Flags); 3589 3590 MI.eraseFromParent(); 3591 return true; 3592 } 3593 3594 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3595 MachineRegisterInfo &MRI, 3596 MachineIRBuilder &B) const { 3597 Register Res = MI.getOperand(0).getReg(); 3598 Register LHS = MI.getOperand(2).getReg(); 3599 Register RHS = MI.getOperand(3).getReg(); 3600 uint16_t Flags = MI.getFlags(); 3601 3602 LLT S32 = LLT::scalar(32); 3603 LLT S1 = LLT::scalar(1); 3604 3605 auto Abs = B.buildFAbs(S32, RHS, Flags); 3606 const APFloat C0Val(1.0f); 3607 3608 auto C0 = B.buildConstant(S32, 0x6f800000); 3609 auto C1 = B.buildConstant(S32, 0x2f800000); 3610 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3611 3612 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3613 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3614 3615 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3616 3617 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3618 .addUse(Mul0.getReg(0)) 3619 .setMIFlags(Flags); 3620 3621 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3622 3623 B.buildFMul(Res, Sel, Mul1, Flags); 3624 3625 MI.eraseFromParent(); 3626 return true; 3627 } 3628 3629 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3630 // FIXME: Why do we handle this one but not other removed instructions? 3631 // 3632 // Reciprocal square root. The clamp prevents infinite results, clamping 3633 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3634 // +-max_float. 
3635 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3636 MachineRegisterInfo &MRI, 3637 MachineIRBuilder &B) const { 3638 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3639 return true; 3640 3641 Register Dst = MI.getOperand(0).getReg(); 3642 Register Src = MI.getOperand(2).getReg(); 3643 auto Flags = MI.getFlags(); 3644 3645 LLT Ty = MRI.getType(Dst); 3646 3647 const fltSemantics *FltSemantics; 3648 if (Ty == LLT::scalar(32)) 3649 FltSemantics = &APFloat::IEEEsingle(); 3650 else if (Ty == LLT::scalar(64)) 3651 FltSemantics = &APFloat::IEEEdouble(); 3652 else 3653 return false; 3654 3655 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3656 .addUse(Src) 3657 .setMIFlags(Flags); 3658 3659 // We don't need to concern ourselves with the snan handling difference, since 3660 // the rsq quieted (or not) so use the one which will directly select. 3661 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3662 const bool UseIEEE = MFI->getMode().IEEE; 3663 3664 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3665 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3666 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3667 3668 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3669 3670 if (UseIEEE) 3671 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3672 else 3673 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3674 MI.eraseFromParent(); 3675 return true; 3676 } 3677 3678 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3679 switch (IID) { 3680 case Intrinsic::amdgcn_ds_fadd: 3681 return AMDGPU::G_ATOMICRMW_FADD; 3682 case Intrinsic::amdgcn_ds_fmin: 3683 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3684 case Intrinsic::amdgcn_ds_fmax: 3685 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3686 default: 3687 llvm_unreachable("not a DS FP intrinsic"); 3688 } 3689 } 3690 3691 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3692 MachineInstr &MI, 3693 Intrinsic::ID IID) const { 3694 GISelChangeObserver &Observer = Helper.Observer; 3695 Observer.changingInstr(MI); 3696 3697 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3698 3699 // The remaining operands were used to set fields in the MemOperand on 3700 // construction. 3701 for (int I = 6; I > 3; --I) 3702 MI.RemoveOperand(I); 3703 3704 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3705 Observer.changedInstr(MI); 3706 return true; 3707 } 3708 3709 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3710 MachineRegisterInfo &MRI, 3711 MachineIRBuilder &B) const { 3712 uint64_t Offset = 3713 ST.getTargetLowering()->getImplicitParameterOffset( 3714 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3715 LLT DstTy = MRI.getType(DstReg); 3716 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3717 3718 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3719 if (!loadInputValue(KernargPtrReg, B, 3720 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3721 return false; 3722 3723 // FIXME: This should be nuw 3724 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3725 return true; 3726 } 3727 3728 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3729 MachineRegisterInfo &MRI, 3730 MachineIRBuilder &B) const { 3731 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3732 if (!MFI->isEntryFunction()) { 3733 return legalizePreloadedArgIntrin(MI, MRI, B, 3734 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3735 } 3736 3737 Register DstReg = MI.getOperand(0).getReg(); 3738 if (!getImplicitArgPtr(DstReg, MRI, B)) 3739 return false; 3740 3741 MI.eraseFromParent(); 3742 return true; 3743 } 3744 3745 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3746 MachineRegisterInfo &MRI, 3747 MachineIRBuilder &B, 3748 unsigned AddrSpace) const { 3749 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3750 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3751 Register Hi32 = Unmerge.getReg(1); 3752 3753 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3754 MI.eraseFromParent(); 3755 return true; 3756 } 3757 3758 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3759 // offset (the offset that is included in bounds checking and swizzling, to be 3760 // split between the instruction's voffset and immoffset fields) and soffset 3761 // (the offset that is excluded from bounds checking and swizzling, to go in 3762 // the instruction's soffset field). This function takes the first kind of 3763 // offset and figures out how to split it between voffset and immoffset. 3764 std::pair<Register, unsigned> 3765 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3766 Register OrigOffset) const { 3767 const unsigned MaxImm = 4095; 3768 Register BaseReg; 3769 unsigned ImmOffset; 3770 const LLT S32 = LLT::scalar(32); 3771 MachineRegisterInfo &MRI = *B.getMRI(); 3772 3773 std::tie(BaseReg, ImmOffset) = 3774 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3775 3776 // If BaseReg is a pointer, convert it to int. 3777 if (MRI.getType(BaseReg).isPointer()) 3778 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3779 3780 // If the immediate value is too big for the immoffset field, put the value 3781 // and -4096 into the immoffset field so that the value that is copied/added 3782 // for the voffset field is a multiple of 4096, and it stands more chance 3783 // of being CSEd with the copy/add for another similar load/store. 3784 // However, do not do that rounding down to a multiple of 4096 if that is a 3785 // negative number, as it appears to be illegal to have a negative offset 3786 // in the vgpr, even if adding the immediate offset makes it positive. 
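  // For example, a constant offset of 0x1008 is split into Overflow = 0x1000
  // (materialized into the base/voffset register below) and ImmOffset = 0x8
  // (kept as the instruction immediate).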
3787 unsigned Overflow = ImmOffset & ~MaxImm; 3788 ImmOffset -= Overflow; 3789 if ((int32_t)Overflow < 0) { 3790 Overflow += ImmOffset; 3791 ImmOffset = 0; 3792 } 3793 3794 if (Overflow != 0) { 3795 if (!BaseReg) { 3796 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3797 } else { 3798 auto OverflowVal = B.buildConstant(S32, Overflow); 3799 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3800 } 3801 } 3802 3803 if (!BaseReg) 3804 BaseReg = B.buildConstant(S32, 0).getReg(0); 3805 3806 return std::make_pair(BaseReg, ImmOffset); 3807 } 3808 3809 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3810 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3811 Register VOffset, Register SOffset, 3812 unsigned ImmOffset, Register VIndex, 3813 MachineRegisterInfo &MRI) const { 3814 Optional<ValueAndVReg> MaybeVOffsetVal = 3815 getIConstantVRegValWithLookThrough(VOffset, MRI); 3816 Optional<ValueAndVReg> MaybeSOffsetVal = 3817 getIConstantVRegValWithLookThrough(SOffset, MRI); 3818 Optional<ValueAndVReg> MaybeVIndexVal = 3819 getIConstantVRegValWithLookThrough(VIndex, MRI); 3820 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3821 // update the MMO with that offset. The stride is unknown so we can only do 3822 // this if VIndex is constant 0. 3823 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3824 MaybeVIndexVal->Value == 0) { 3825 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3826 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3827 MMO->setOffset(TotalOffset); 3828 } else { 3829 // We don't have a constant combined offset to use in the MMO. Give up. 3830 MMO->setValue((Value *)nullptr); 3831 } 3832 } 3833 3834 /// Handle register layout difference for f16 images for some subtargets. 
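/// Subtargets with unpacked D16 VMEM widen each 16-bit element into its own
/// 32-bit register, while subtargets with the image-store D16 bug need the
/// packed data padded out as handled below.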
3835 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3836 MachineRegisterInfo &MRI, 3837 Register Reg, 3838 bool ImageStore) const { 3839 const LLT S16 = LLT::scalar(16); 3840 const LLT S32 = LLT::scalar(32); 3841 LLT StoreVT = MRI.getType(Reg); 3842 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3843 3844 if (ST.hasUnpackedD16VMem()) { 3845 auto Unmerge = B.buildUnmerge(S16, Reg); 3846 3847 SmallVector<Register, 4> WideRegs; 3848 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3849 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3850 3851 int NumElts = StoreVT.getNumElements(); 3852 3853 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3854 .getReg(0); 3855 } 3856 3857 if (ImageStore && ST.hasImageStoreD16Bug()) { 3858 if (StoreVT.getNumElements() == 2) { 3859 SmallVector<Register, 4> PackedRegs; 3860 Reg = B.buildBitcast(S32, Reg).getReg(0); 3861 PackedRegs.push_back(Reg); 3862 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3863 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3864 .getReg(0); 3865 } 3866 3867 if (StoreVT.getNumElements() == 3) { 3868 SmallVector<Register, 4> PackedRegs; 3869 auto Unmerge = B.buildUnmerge(S16, Reg); 3870 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3871 PackedRegs.push_back(Unmerge.getReg(I)); 3872 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3873 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3874 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3875 } 3876 3877 if (StoreVT.getNumElements() == 4) { 3878 SmallVector<Register, 4> PackedRegs; 3879 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3880 auto Unmerge = B.buildUnmerge(S32, Reg); 3881 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3882 PackedRegs.push_back(Unmerge.getReg(I)); 3883 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3884 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3885 .getReg(0); 3886 } 3887 3888 llvm_unreachable("invalid data type"); 3889 } 3890 3891 if (StoreVT == LLT::fixed_vector(3, S16)) { 3892 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3893 .getReg(0); 3894 } 3895 return Reg; 3896 } 3897 3898 Register AMDGPULegalizerInfo::fixStoreSourceType( 3899 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3900 MachineRegisterInfo *MRI = B.getMRI(); 3901 LLT Ty = MRI->getType(VData); 3902 3903 const LLT S16 = LLT::scalar(16); 3904 3905 // Fixup illegal register types for i8 stores. 
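  // s8 and s16 values are stored from the low bits of a 32-bit VGPR, so
  // any-extend them to s32.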
3906 if (Ty == LLT::scalar(8) || Ty == S16) { 3907 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3908 return AnyExt; 3909 } 3910 3911 if (Ty.isVector()) { 3912 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3913 if (IsFormat) 3914 return handleD16VData(B, *MRI, VData); 3915 } 3916 } 3917 3918 return VData; 3919 } 3920 3921 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3922 MachineRegisterInfo &MRI, 3923 MachineIRBuilder &B, 3924 bool IsTyped, 3925 bool IsFormat) const { 3926 Register VData = MI.getOperand(1).getReg(); 3927 LLT Ty = MRI.getType(VData); 3928 LLT EltTy = Ty.getScalarType(); 3929 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3930 const LLT S32 = LLT::scalar(32); 3931 3932 VData = fixStoreSourceType(B, VData, IsFormat); 3933 Register RSrc = MI.getOperand(2).getReg(); 3934 3935 MachineMemOperand *MMO = *MI.memoperands_begin(); 3936 const int MemSize = MMO->getSize(); 3937 3938 unsigned ImmOffset; 3939 3940 // The typed intrinsics add an immediate after the registers. 3941 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3942 3943 // The struct intrinsic variants add one additional operand over raw. 3944 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3945 Register VIndex; 3946 int OpOffset = 0; 3947 if (HasVIndex) { 3948 VIndex = MI.getOperand(3).getReg(); 3949 OpOffset = 1; 3950 } else { 3951 VIndex = B.buildConstant(S32, 0).getReg(0); 3952 } 3953 3954 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3955 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3956 3957 unsigned Format = 0; 3958 if (IsTyped) { 3959 Format = MI.getOperand(5 + OpOffset).getImm(); 3960 ++OpOffset; 3961 } 3962 3963 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3964 3965 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3966 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3967 3968 unsigned Opc; 3969 if (IsTyped) { 3970 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3971 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3972 } else if (IsFormat) { 3973 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3974 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3975 } else { 3976 switch (MemSize) { 3977 case 1: 3978 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3979 break; 3980 case 2: 3981 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3982 break; 3983 default: 3984 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3985 break; 3986 } 3987 } 3988 3989 auto MIB = B.buildInstr(Opc) 3990 .addUse(VData) // vdata 3991 .addUse(RSrc) // rsrc 3992 .addUse(VIndex) // vindex 3993 .addUse(VOffset) // voffset 3994 .addUse(SOffset) // soffset 3995 .addImm(ImmOffset); // offset(imm) 3996 3997 if (IsTyped) 3998 MIB.addImm(Format); 3999 4000 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4001 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4002 .addMemOperand(MMO); 4003 4004 MI.eraseFromParent(); 4005 return true; 4006 } 4007 4008 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4009 MachineRegisterInfo &MRI, 4010 MachineIRBuilder &B, 4011 bool IsFormat, 4012 bool IsTyped) const { 4013 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4014 MachineMemOperand *MMO = *MI.memoperands_begin(); 4015 const LLT MemTy = MMO->getMemoryType(); 4016 const LLT S32 = LLT::scalar(32); 4017 4018 Register Dst = MI.getOperand(0).getReg(); 4019 Register RSrc = MI.getOperand(2).getReg(); 4020 4021 // The typed intrinsics add an immediate after the registers. 
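  // That extra format immediate shifts the later operand indices by one when
  // present (see OpOffset below).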
4022 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4023 4024 // The struct intrinsic variants add one additional operand over raw. 4025 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4026 Register VIndex; 4027 int OpOffset = 0; 4028 if (HasVIndex) { 4029 VIndex = MI.getOperand(3).getReg(); 4030 OpOffset = 1; 4031 } else { 4032 VIndex = B.buildConstant(S32, 0).getReg(0); 4033 } 4034 4035 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4036 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4037 4038 unsigned Format = 0; 4039 if (IsTyped) { 4040 Format = MI.getOperand(5 + OpOffset).getImm(); 4041 ++OpOffset; 4042 } 4043 4044 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4045 unsigned ImmOffset; 4046 4047 LLT Ty = MRI.getType(Dst); 4048 LLT EltTy = Ty.getScalarType(); 4049 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4050 const bool Unpacked = ST.hasUnpackedD16VMem(); 4051 4052 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4053 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4054 4055 unsigned Opc; 4056 4057 if (IsTyped) { 4058 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4059 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4060 } else if (IsFormat) { 4061 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4062 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4063 } else { 4064 switch (MemTy.getSizeInBits()) { 4065 case 8: 4066 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4067 break; 4068 case 16: 4069 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4070 break; 4071 default: 4072 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4073 break; 4074 } 4075 } 4076 4077 Register LoadDstReg; 4078 4079 bool IsExtLoad = 4080 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4081 LLT UnpackedTy = Ty.changeElementSize(32); 4082 4083 if (IsExtLoad) 4084 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4085 else if (Unpacked && IsD16 && Ty.isVector()) 4086 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4087 else 4088 LoadDstReg = Dst; 4089 4090 auto MIB = B.buildInstr(Opc) 4091 .addDef(LoadDstReg) // vdata 4092 .addUse(RSrc) // rsrc 4093 .addUse(VIndex) // vindex 4094 .addUse(VOffset) // voffset 4095 .addUse(SOffset) // soffset 4096 .addImm(ImmOffset); // offset(imm) 4097 4098 if (IsTyped) 4099 MIB.addImm(Format); 4100 4101 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4102 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4103 .addMemOperand(MMO); 4104 4105 if (LoadDstReg != Dst) { 4106 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4107 4108 // Widen result for extending loads was widened. 4109 if (IsExtLoad) 4110 B.buildTrunc(Dst, LoadDstReg); 4111 else { 4112 // Repack to original 16-bit vector result 4113 // FIXME: G_TRUNC should work, but legalization currently fails 4114 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4115 SmallVector<Register, 4> Repack; 4116 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4117 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4118 B.buildMerge(Dst, Repack); 4119 } 4120 } 4121 4122 MI.eraseFromParent(); 4123 return true; 4124 } 4125 4126 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4127 MachineIRBuilder &B, 4128 bool IsInc) const { 4129 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4130 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4131 B.buildInstr(Opc) 4132 .addDef(MI.getOperand(0).getReg()) 4133 .addUse(MI.getOperand(2).getReg()) 4134 .addUse(MI.getOperand(3).getReg()) 4135 .cloneMemRefs(MI); 4136 MI.eraseFromParent(); 4137 return true; 4138 } 4139 4140 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4141 switch (IntrID) { 4142 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4143 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4145 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4146 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4148 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4149 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4150 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4151 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4152 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4154 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4155 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4156 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4157 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4158 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4159 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4160 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4161 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4162 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4163 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4164 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4165 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4166 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4167 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4168 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4169 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4170 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4171 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4172 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4173 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4174 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4175 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4176 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4177 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4178 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4179 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4180 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4181 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4182 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4183 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4184 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4185 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4186 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4187 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4188 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4189 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4190 default: 4191 llvm_unreachable("unhandled atomic opcode"); 4192 } 4193 } 4194 4195 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4196 MachineIRBuilder &B, 4197 Intrinsic::ID IID) const { 4198 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4199 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4200 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4201 4202 Register Dst; 4203 4204 int OpOffset = 0; 4205 if (HasReturn) { 4206 // A few FP atomics do not support return values. 
4207 Dst = MI.getOperand(0).getReg(); 4208 } else { 4209 OpOffset = -1; 4210 } 4211 4212 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4213 Register CmpVal; 4214 4215 if (IsCmpSwap) { 4216 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4217 ++OpOffset; 4218 } 4219 4220 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4221 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4222 4223 // The struct intrinsic variants add one additional operand over raw. 4224 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4225 Register VIndex; 4226 if (HasVIndex) { 4227 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4228 ++OpOffset; 4229 } else { 4230 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4231 } 4232 4233 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4234 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4235 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4236 4237 MachineMemOperand *MMO = *MI.memoperands_begin(); 4238 4239 unsigned ImmOffset; 4240 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4241 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4242 4243 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4244 4245 if (HasReturn) 4246 MIB.addDef(Dst); 4247 4248 MIB.addUse(VData); // vdata 4249 4250 if (IsCmpSwap) 4251 MIB.addReg(CmpVal); 4252 4253 MIB.addUse(RSrc) // rsrc 4254 .addUse(VIndex) // vindex 4255 .addUse(VOffset) // voffset 4256 .addUse(SOffset) // soffset 4257 .addImm(ImmOffset) // offset(imm) 4258 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4259 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4260 .addMemOperand(MMO); 4261 4262 MI.eraseFromParent(); 4263 return true; 4264 } 4265 4266 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4267 /// vector with s16 typed elements. 4268 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4269 SmallVectorImpl<Register> &PackedAddrs, 4270 unsigned ArgOffset, 4271 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4272 bool IsA16, bool IsG16) { 4273 const LLT S16 = LLT::scalar(16); 4274 const LLT V2S16 = LLT::fixed_vector(2, 16); 4275 auto EndIdx = Intr->VAddrEnd; 4276 4277 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4278 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4279 if (!SrcOp.isReg()) 4280 continue; // _L to _LZ may have eliminated this. 4281 4282 Register AddrReg = SrcOp.getReg(); 4283 4284 if ((I < Intr->GradientStart) || 4285 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4286 (I >= Intr->CoordStart && !IsA16)) { 4287 if ((I < Intr->GradientStart) && IsA16 && 4288 (B.getMRI()->getType(AddrReg) == S16)) { 4289 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4290 // Special handling of bias when A16 is on. Bias is of type half but 4291 // occupies full 32-bit. 4292 PackedAddrs.push_back( 4293 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4294 .getReg(0)); 4295 } else { 4296 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4297 "Bias needs to be converted to 16 bit in A16 mode"); 4298 // Handle any gradient or coordinate operands that should not be packed 4299 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4300 PackedAddrs.push_back(AddrReg); 4301 } 4302 } else { 4303 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4304 // derivatives dx/dh and dx/dv are packed with undef. 
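      // For example, with three 16-bit coordinates the last one has no partner
      // and is paired with an undef half to fill its v2s16 slot.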
4305 if (((I + 1) >= EndIdx) || 4306 ((Intr->NumGradients / 2) % 2 == 1 && 4307 (I == static_cast<unsigned>(Intr->GradientStart + 4308 (Intr->NumGradients / 2) - 1) || 4309 I == static_cast<unsigned>(Intr->GradientStart + 4310 Intr->NumGradients - 1))) || 4311 // Check for _L to _LZ optimization 4312 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4313 PackedAddrs.push_back( 4314 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4315 .getReg(0)); 4316 } else { 4317 PackedAddrs.push_back( 4318 B.buildBuildVector( 4319 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4320 .getReg(0)); 4321 ++I; 4322 } 4323 } 4324 } 4325 } 4326 4327 /// Convert from separate vaddr components to a single vector address register, 4328 /// and replace the remaining operands with $noreg. 4329 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4330 int DimIdx, int NumVAddrs) { 4331 const LLT S32 = LLT::scalar(32); 4332 4333 SmallVector<Register, 8> AddrRegs; 4334 for (int I = 0; I != NumVAddrs; ++I) { 4335 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4336 if (SrcOp.isReg()) { 4337 AddrRegs.push_back(SrcOp.getReg()); 4338 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4339 } 4340 } 4341 4342 int NumAddrRegs = AddrRegs.size(); 4343 if (NumAddrRegs != 1) { 4344 // Above 8 elements round up to next power of 2 (i.e. 16). 4345 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4346 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4347 auto Undef = B.buildUndef(S32); 4348 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4349 NumAddrRegs = RoundedNumRegs; 4350 } 4351 4352 auto VAddr = 4353 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4354 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4355 } 4356 4357 for (int I = 1; I != NumVAddrs; ++I) { 4358 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4359 if (SrcOp.isReg()) 4360 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4361 } 4362 } 4363 4364 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4365 /// 4366 /// Depending on the subtarget, load/store with 16-bit element data need to be 4367 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4368 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4369 /// registers. 4370 /// 4371 /// We don't want to directly select image instructions just yet, but also want 4372 /// to exposes all register repacking to the legalizer/combiners. We also don't 4373 /// want a selected instrution entering RegBankSelect. In order to avoid 4374 /// defining a multitude of intermediate image instructions, directly hack on 4375 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4376 /// padding now unnecessary arguments with $noreg. 4377 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4378 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4379 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4380 4381 const unsigned NumDefs = MI.getNumExplicitDefs(); 4382 const unsigned ArgOffset = NumDefs + 1; 4383 bool IsTFE = NumDefs == 2; 4384 // We are only processing the operands of d16 image operations on subtargets 4385 // that use the unpacked register layout, or need to repack the TFE result. 4386 4387 // TODO: Do we need to guard against already legalized intrinsics? 
4388 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4389 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4390 4391 MachineRegisterInfo *MRI = B.getMRI(); 4392 const LLT S32 = LLT::scalar(32); 4393 const LLT S16 = LLT::scalar(16); 4394 const LLT V2S16 = LLT::fixed_vector(2, 16); 4395 4396 unsigned DMask = 0; 4397 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4398 LLT Ty = MRI->getType(VData); 4399 4400 // Check for 16 bit addresses and pack if true. 4401 LLT GradTy = 4402 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4403 LLT AddrTy = 4404 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4405 const bool IsG16 = GradTy == S16; 4406 const bool IsA16 = AddrTy == S16; 4407 const bool IsD16 = Ty.getScalarType() == S16; 4408 4409 int DMaskLanes = 0; 4410 if (!BaseOpcode->Atomic) { 4411 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4412 if (BaseOpcode->Gather4) { 4413 DMaskLanes = 4; 4414 } else if (DMask != 0) { 4415 DMaskLanes = countPopulation(DMask); 4416 } else if (!IsTFE && !BaseOpcode->Store) { 4417 // If dmask is 0, this is a no-op load. This can be eliminated. 4418 B.buildUndef(MI.getOperand(0)); 4419 MI.eraseFromParent(); 4420 return true; 4421 } 4422 } 4423 4424 Observer.changingInstr(MI); 4425 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4426 4427 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4428 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4429 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4430 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4431 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4432 4433 // Track that we legalized this 4434 MI.setDesc(B.getTII().get(NewOpcode)); 4435 4436 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4437 // dmask to be at least 1 otherwise the instruction will fail 4438 if (IsTFE && DMask == 0) { 4439 DMask = 0x1; 4440 DMaskLanes = 1; 4441 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4442 } 4443 4444 if (BaseOpcode->Atomic) { 4445 Register VData0 = MI.getOperand(2).getReg(); 4446 LLT Ty = MRI->getType(VData0); 4447 4448 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4449 if (Ty.isVector()) 4450 return false; 4451 4452 if (BaseOpcode->AtomicX2) { 4453 Register VData1 = MI.getOperand(3).getReg(); 4454 // The two values are packed in one register. 4455 LLT PackedTy = LLT::fixed_vector(2, Ty); 4456 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4457 MI.getOperand(2).setReg(Concat.getReg(0)); 4458 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4459 } 4460 } 4461 4462 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4463 4464 // Rewrite the addressing register layout before doing anything else. 
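  // This covers packing 16-bit addresses into dwords and choosing between the
  // NSA and single contiguous vaddr layouts; the result registers are handled
  // afterwards.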
4465 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4466 // 16 bit gradients are supported, but are tied to the A16 control 4467 // so both gradients and addresses must be 16 bit 4468 return false; 4469 } 4470 4471 if (IsA16 && !ST.hasA16()) { 4472 // A16 not supported 4473 return false; 4474 } 4475 4476 if (IsA16 || IsG16) { 4477 if (Intr->NumVAddrs > 1) { 4478 SmallVector<Register, 4> PackedRegs; 4479 4480 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4481 IsG16); 4482 4483 // See also below in the non-a16 branch 4484 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4485 PackedRegs.size() <= ST.getNSAMaxSize(); 4486 4487 if (!UseNSA && PackedRegs.size() > 1) { 4488 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4489 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4490 PackedRegs[0] = Concat.getReg(0); 4491 PackedRegs.resize(1); 4492 } 4493 4494 const unsigned NumPacked = PackedRegs.size(); 4495 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4496 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4497 if (!SrcOp.isReg()) { 4498 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4499 continue; 4500 } 4501 4502 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4503 4504 if (I - Intr->VAddrStart < NumPacked) 4505 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4506 else 4507 SrcOp.setReg(AMDGPU::NoRegister); 4508 } 4509 } 4510 } else { 4511 // If the register allocator cannot place the address registers contiguously 4512 // without introducing moves, then using the non-sequential address encoding 4513 // is always preferable, since it saves VALU instructions and is usually a 4514 // wash in terms of code size or even better. 4515 // 4516 // However, we currently have no way of hinting to the register allocator 4517 // that MIMG addresses should be placed contiguously when it is possible to 4518 // do so, so force non-NSA for the common 2-address case as a heuristic. 4519 // 4520 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4521 // allocation when possible. 4522 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4523 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4524 4525 if (!UseNSA && Intr->NumVAddrs > 1) 4526 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4527 Intr->NumVAddrs); 4528 } 4529 4530 int Flags = 0; 4531 if (IsA16) 4532 Flags |= 1; 4533 if (IsG16) 4534 Flags |= 2; 4535 MI.addOperand(MachineOperand::CreateImm(Flags)); 4536 4537 if (BaseOpcode->Store) { // No TFE for stores? 4538 // TODO: Handle dmask trim 4539 if (!Ty.isVector() || !IsD16) 4540 return true; 4541 4542 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4543 if (RepackedReg != VData) { 4544 MI.getOperand(1).setReg(RepackedReg); 4545 } 4546 4547 return true; 4548 } 4549 4550 Register DstReg = MI.getOperand(0).getReg(); 4551 const LLT EltTy = Ty.getScalarType(); 4552 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4553 4554 // Confirm that the return type is large enough for the dmask specified 4555 if (NumElts < DMaskLanes) 4556 return false; 4557 4558 if (NumElts > 4 || DMaskLanes > 4) 4559 return false; 4560 4561 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4562 const LLT AdjustedTy = 4563 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4564 4565 // The raw dword aligned data component of the load. 
The only legal cases 4566 // where this matters should be when using the packed D16 format, for 4567 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4568 LLT RoundedTy; 4569 4570 // S32 vector to to cover all data, plus TFE result element. 4571 LLT TFETy; 4572 4573 // Register type to use for each loaded component. Will be S32 or V2S16. 4574 LLT RegTy; 4575 4576 if (IsD16 && ST.hasUnpackedD16VMem()) { 4577 RoundedTy = 4578 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4579 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4580 RegTy = S32; 4581 } else { 4582 unsigned EltSize = EltTy.getSizeInBits(); 4583 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4584 unsigned RoundedSize = 32 * RoundedElts; 4585 RoundedTy = LLT::scalarOrVector( 4586 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4587 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4588 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4589 } 4590 4591 // The return type does not need adjustment. 4592 // TODO: Should we change s16 case to s32 or <2 x s16>? 4593 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4594 return true; 4595 4596 Register Dst1Reg; 4597 4598 // Insert after the instruction. 4599 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4600 4601 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4602 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4603 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4604 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4605 4606 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4607 4608 MI.getOperand(0).setReg(NewResultReg); 4609 4610 // In the IR, TFE is supposed to be used with a 2 element struct return 4611 // type. The instruction really returns these two values in one contiguous 4612 // register, with one additional dword beyond the loaded data. Rewrite the 4613 // return type to use a single register result. 4614 4615 if (IsTFE) { 4616 Dst1Reg = MI.getOperand(1).getReg(); 4617 if (MRI->getType(Dst1Reg) != S32) 4618 return false; 4619 4620 // TODO: Make sure the TFE operand bit is set. 4621 MI.RemoveOperand(1); 4622 4623 // Handle the easy case that requires no repack instructions. 4624 if (Ty == S32) { 4625 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4626 return true; 4627 } 4628 } 4629 4630 // Now figure out how to copy the new result register back into the old 4631 // result. 4632 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4633 4634 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4635 4636 if (ResultNumRegs == 1) { 4637 assert(!IsTFE); 4638 ResultRegs[0] = NewResultReg; 4639 } else { 4640 // We have to repack into a new vector of some kind. 4641 for (int I = 0; I != NumDataRegs; ++I) 4642 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4643 B.buildUnmerge(ResultRegs, NewResultReg); 4644 4645 // Drop the final TFE element to get the data part. The TFE result is 4646 // directly written to the right place already. 4647 if (IsTFE) 4648 ResultRegs.resize(NumDataRegs); 4649 } 4650 4651 // For an s16 scalar result, we form an s32 result with a truncate regardless 4652 // of packed vs. unpacked. 4653 if (IsD16 && !Ty.isVector()) { 4654 B.buildTrunc(DstReg, ResultRegs[0]); 4655 return true; 4656 } 4657 4658 // Avoid a build/concat_vector of 1 entry. 
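  // A single packed v2s16 result register already has the right layout, so a
  // bitcast to the destination suffices.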
4659 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4660 B.buildBitcast(DstReg, ResultRegs[0]); 4661 return true; 4662 } 4663 4664 assert(Ty.isVector()); 4665 4666 if (IsD16) { 4667 // For packed D16 results with TFE enabled, all the data components are 4668 // S32. Cast back to the expected type. 4669 // 4670 // TODO: We don't really need to use load s32 elements. We would only need one 4671 // cast for the TFE result if a multiple of v2s16 was used. 4672 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4673 for (Register &Reg : ResultRegs) 4674 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4675 } else if (ST.hasUnpackedD16VMem()) { 4676 for (Register &Reg : ResultRegs) 4677 Reg = B.buildTrunc(S16, Reg).getReg(0); 4678 } 4679 } 4680 4681 auto padWithUndef = [&](LLT Ty, int NumElts) { 4682 if (NumElts == 0) 4683 return; 4684 Register Undef = B.buildUndef(Ty).getReg(0); 4685 for (int I = 0; I != NumElts; ++I) 4686 ResultRegs.push_back(Undef); 4687 }; 4688 4689 // Pad out any elements eliminated due to the dmask. 4690 LLT ResTy = MRI->getType(ResultRegs[0]); 4691 if (!ResTy.isVector()) { 4692 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4693 B.buildBuildVector(DstReg, ResultRegs); 4694 return true; 4695 } 4696 4697 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4698 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4699 4700 // Deal with the one annoying legal case. 4701 const LLT V3S16 = LLT::fixed_vector(3, 16); 4702 if (Ty == V3S16) { 4703 if (IsTFE) { 4704 if (ResultRegs.size() == 1) { 4705 NewResultReg = ResultRegs[0]; 4706 } else if (ResultRegs.size() == 2) { 4707 LLT V4S16 = LLT::fixed_vector(4, 16); 4708 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4709 } else { 4710 return false; 4711 } 4712 } 4713 4714 if (MRI->getType(DstReg).getNumElements() < 4715 MRI->getType(NewResultReg).getNumElements()) { 4716 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4717 } else { 4718 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4719 } 4720 return true; 4721 } 4722 4723 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4724 B.buildConcatVectors(DstReg, ResultRegs); 4725 return true; 4726 } 4727 4728 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4729 LegalizerHelper &Helper, MachineInstr &MI) const { 4730 MachineIRBuilder &B = Helper.MIRBuilder; 4731 GISelChangeObserver &Observer = Helper.Observer; 4732 4733 Register Dst = MI.getOperand(0).getReg(); 4734 LLT Ty = B.getMRI()->getType(Dst); 4735 unsigned Size = Ty.getSizeInBits(); 4736 MachineFunction &MF = B.getMF(); 4737 4738 Observer.changingInstr(MI); 4739 4740 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4741 Ty = getBitcastRegisterType(Ty); 4742 Helper.bitcastDst(MI, Ty, 0); 4743 Dst = MI.getOperand(0).getReg(); 4744 B.setInsertPt(B.getMBB(), MI); 4745 } 4746 4747 // FIXME: We don't really need this intermediate instruction. The intrinsic 4748 // should be fixed to have a memory operand. Since it's readnone, we're not 4749 // allowed to add one. 4750 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4751 MI.RemoveOperand(1); // Remove intrinsic ID 4752 4753 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4754 // TODO: Should this use datalayout alignment? 
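  // Size is in bits; treat the scalar buffer load as dword (4 byte) aligned.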
4755 const unsigned MemSize = (Size + 7) / 8; 4756 const Align MemAlign(4); 4757 MachineMemOperand *MMO = MF.getMachineMemOperand( 4758 MachinePointerInfo(), 4759 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4760 MachineMemOperand::MOInvariant, 4761 MemSize, MemAlign); 4762 MI.addMemOperand(MF, MMO); 4763 4764 // There are no 96-bit result scalar loads, but widening to 128-bit should 4765 // always be legal. We may need to restore this to a 96-bit result if it turns 4766 // out this needs to be converted to a vector load during RegBankSelect. 4767 if (!isPowerOf2_32(Size)) { 4768 if (Ty.isVector()) 4769 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4770 else 4771 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4772 } 4773 4774 Observer.changedInstr(MI); 4775 return true; 4776 } 4777 4778 // TODO: Move to selection 4779 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4780 MachineRegisterInfo &MRI, 4781 MachineIRBuilder &B) const { 4782 if (!ST.isTrapHandlerEnabled() || 4783 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4784 return legalizeTrapEndpgm(MI, MRI, B); 4785 4786 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4787 switch (*HsaAbiVer) { 4788 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4789 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4790 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4791 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4792 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4793 return ST.supportsGetDoorbellID() ? 4794 legalizeTrapHsa(MI, MRI, B) : 4795 legalizeTrapHsaQueuePtr(MI, MRI, B); 4796 } 4797 } 4798 4799 llvm_unreachable("Unknown trap handler"); 4800 } 4801 4802 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4803 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4804 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4805 MI.eraseFromParent(); 4806 return true; 4807 } 4808 4809 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4810 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4811 // Pass queue pointer to trap handler as input, and insert trap instruction 4812 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4813 Register LiveIn = 4814 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4815 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4816 return false; 4817 4818 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4819 B.buildCopy(SGPR01, LiveIn); 4820 B.buildInstr(AMDGPU::S_TRAP) 4821 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4822 .addReg(SGPR01, RegState::Implicit); 4823 4824 MI.eraseFromParent(); 4825 return true; 4826 } 4827 4828 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4829 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4830 B.buildInstr(AMDGPU::S_TRAP) 4831 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4832 MI.eraseFromParent(); 4833 return true; 4834 } 4835 4836 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4837 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4838 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4839 // accordingly 4840 if (!ST.isTrapHandlerEnabled() || 4841 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4842 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4843 "debugtrap handler not supported", 4844 MI.getDebugLoc(), DS_Warning); 4845 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4846 Ctx.diagnose(NoTrap); 4847 } else { 4848 // Insert debug-trap instruction 4849 B.buildInstr(AMDGPU::S_TRAP) 4850 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4851 } 4852 4853 MI.eraseFromParent(); 4854 return true; 4855 } 4856 4857 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4858 MachineIRBuilder &B) const { 4859 MachineRegisterInfo &MRI = *B.getMRI(); 4860 const LLT S16 = LLT::scalar(16); 4861 const LLT S32 = LLT::scalar(32); 4862 4863 Register DstReg = MI.getOperand(0).getReg(); 4864 Register NodePtr = MI.getOperand(2).getReg(); 4865 Register RayExtent = MI.getOperand(3).getReg(); 4866 Register RayOrigin = MI.getOperand(4).getReg(); 4867 Register RayDir = MI.getOperand(5).getReg(); 4868 Register RayInvDir = MI.getOperand(6).getReg(); 4869 Register TDescr = MI.getOperand(7).getReg(); 4870 4871 if (!ST.hasGFX10_AEncoding()) { 4872 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4873 "intrinsic not supported on subtarget", 4874 MI.getDebugLoc()); 4875 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4876 return false; 4877 } 4878 4879 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4880 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4881 const unsigned NumVDataDwords = 4; 4882 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4883 const bool UseNSA = 4884 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4885 const unsigned BaseOpcodes[2][2] = { 4886 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4887 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4888 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4889 int Opcode; 4890 if (UseNSA) { 4891 Opcode = 4892 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4893 NumVDataDwords, NumVAddrDwords); 4894 } else { 4895 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4896 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4897 PowerOf2Ceil(NumVAddrDwords)); 4898 } 4899 assert(Opcode != -1); 4900 4901 SmallVector<Register, 12> Ops; 4902 if (Is64) { 4903 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4904 Ops.push_back(Unmerge.getReg(0)); 4905 Ops.push_back(Unmerge.getReg(1)); 4906 } else { 4907 Ops.push_back(NodePtr); 4908 } 4909 Ops.push_back(RayExtent); 4910 4911 auto packLanes = [&Ops, &S32, &B](Register Src) { 4912 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4913 Ops.push_back(Unmerge.getReg(0)); 4914 Ops.push_back(Unmerge.getReg(1)); 4915 Ops.push_back(Unmerge.getReg(2)); 4916 }; 4917 4918 packLanes(RayOrigin); 4919 if (IsA16) { 4920 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4921 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4922 Register R1 = MRI.createGenericVirtualRegister(S32); 4923 Register R2 = MRI.createGenericVirtualRegister(S32); 4924 Register R3 = MRI.createGenericVirtualRegister(S32); 4925 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4926 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4927 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4928 Ops.push_back(R1); 4929 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all of the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
      .addDef(DstReg)
      .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

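      // The pseudo emitted above carries the branch to UncondBrTarget, so the
      // trailing unconditional branch (if any) must now point at CondBrTarget.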
      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this.
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
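  // Address-space predicates and the wavefront-size query need no real
  // instruction of their own: they fold to an address-space check
  // (legalizeIsAddrSpace) or to a subtarget constant.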
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
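  // Buffer FP atomics: only subtargets with GFX90A instructions can return
  // the old value, so a used result is diagnosed as an error and replaced
  // with undef below.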
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}