//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
    QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
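  // For example, a non-extending <4 x s8> load is expected to take this path
  // and be bitcast to s32 (see getBitcastRegisterType), since s8 is not a
  // register-friendly element type.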
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 =
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElementsStrict(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElementsStrict(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
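  // (So any subtarget reaching this branch can be assumed to have the clamp
  // bit available, even though it is not checked explicitly here.)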
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalar(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 // TODO: .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 
659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElementsStrict(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
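  // (The hardware mad/mac instructions do not handle denormals, so
  // legalizeFMad below is expected to keep G_FMAD only when the relevant
  // denormals are flushed, and otherwise lower it to fmul + fadd.)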
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 // Lower roundeven into G_FRINT 843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 844 .scalarize(0) 845 .lower(); 846 847 if (ST.has16BitInsts()) { 848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 849 .legalFor({S16, S32, S64}) 850 .clampScalar(0, S16, S64) 851 .scalarize(0); 852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S32, S64}) 855 .clampScalar(0, S32, S64) 856 .scalarize(0); 857 } else { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32}) 860 .customFor({S64}) 861 .clampScalar(0, S32, S64) 862 .scalarize(0); 863 } 864 865 getActionDefinitionsBuilder(G_PTR_ADD) 866 .legalIf(all(isPointer(0), sameSize(0, 1))) 867 .scalarize(0) 868 .scalarSameSizeAs(1, 0); 869 870 getActionDefinitionsBuilder(G_PTRMASK) 871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 872 .scalarSameSizeAs(1, 0) 873 .scalarize(0); 874 875 auto &CmpBuilder = 876 getActionDefinitionsBuilder(G_ICMP) 877 // The compare output type differs based on the register bank of the output, 878 // so make both s1 and s32 legal. 879 // 880 // Scalar compares producing output in scc will be promoted to s32, as that 881 // is the allocatable register type that will be needed for the copy from 882 // scc. 
This will be promoted during RegBankSelect, and we assume something 883 // before that won't try to use s32 result types. 884 // 885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 886 // bank. 887 .legalForCartesianProduct( 888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 889 .legalForCartesianProduct( 890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 891 if (ST.has16BitInsts()) { 892 CmpBuilder.legalFor({{S1, S16}}); 893 } 894 895 CmpBuilder 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0) 899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 900 901 getActionDefinitionsBuilder(G_FCMP) 902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 903 .widenScalarToNextPow2(1) 904 .clampScalar(1, S32, S64) 905 .scalarize(0); 906 907 // FIXME: fpow has a selection pattern that should move to custom lowering. 908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 909 if (ST.has16BitInsts()) 910 Exp2Ops.legalFor({S32, S16}); 911 else 912 Exp2Ops.legalFor({S32}); 913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 914 Exp2Ops.scalarize(0); 915 916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 917 if (ST.has16BitInsts()) 918 ExpOps.customFor({{S32}, {S16}}); 919 else 920 ExpOps.customFor({S32}); 921 ExpOps.clampScalar(0, MinScalarFPTy, S32) 922 .scalarize(0); 923 924 getActionDefinitionsBuilder(G_FPOWI) 925 .clampScalar(0, MinScalarFPTy, S32) 926 .lower(); 927 928 // The 64-bit versions produce 32-bit results, but only on the SALU. 929 getActionDefinitionsBuilder(G_CTPOP) 930 .legalFor({{S32, S32}, {S32, S64}}) 931 .clampScalar(0, S32, S32) 932 .clampScalar(1, S32, S64) 933 .scalarize(0) 934 .widenScalarToNextPow2(0, 32) 935 .widenScalarToNextPow2(1, 32); 936 937 // The hardware instructions return a different result on 0 than the generic 938 // instructions expect. The hardware produces -1, but these produce the 939 // bitwidth. 940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 941 .scalarize(0) 942 .clampScalar(0, S32, S32) 943 .clampScalar(1, S32, S64) 944 .widenScalarToNextPow2(0, 32) 945 .widenScalarToNextPow2(1, 32) 946 .custom(); 947 948 // The 64-bit versions produce 32-bit results, but only on the SALU. 949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 950 .legalFor({{S32, S32}, {S32, S64}}) 951 .clampScalar(0, S32, S32) 952 .clampScalar(1, S32, S64) 953 .scalarize(0) 954 .widenScalarToNextPow2(0, 32) 955 .widenScalarToNextPow2(1, 32); 956 957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 958 // RegBankSelect. 959 getActionDefinitionsBuilder(G_BITREVERSE) 960 .legalFor({S32, S64}) 961 .clampScalar(0, S32, S64) 962 .scalarize(0) 963 .widenScalarToNextPow2(0); 964 965 if (ST.has16BitInsts()) { 966 getActionDefinitionsBuilder(G_BSWAP) 967 .legalFor({S16, S32, V2S16}) 968 .clampMaxNumElementsStrict(0, S16, 2) 969 // FIXME: Fixing non-power-of-2 before clamp is workaround for 970 // narrowScalar limitation. 
971 .widenScalarToNextPow2(0) 972 .clampScalar(0, S16, S32) 973 .scalarize(0); 974 975 if (ST.hasVOP3PInsts()) { 976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 977 .legalFor({S32, S16, V2S16}) 978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 979 .clampMaxNumElements(0, S16, 2) 980 .minScalar(0, S16) 981 .widenScalarToNextPow2(0) 982 .scalarize(0) 983 .lower(); 984 } else { 985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 986 .legalFor({S32, S16}) 987 .widenScalarToNextPow2(0) 988 .minScalar(0, S16) 989 .scalarize(0) 990 .lower(); 991 } 992 } else { 993 // TODO: Should have same legality without v_perm_b32 994 getActionDefinitionsBuilder(G_BSWAP) 995 .legalFor({S32}) 996 .lowerIf(scalarNarrowerThan(0, 32)) 997 // FIXME: Fixing non-power-of-2 before clamp is workaround for 998 // narrowScalar limitation. 999 .widenScalarToNextPow2(0) 1000 .maxScalar(0, S32) 1001 .scalarize(0) 1002 .lower(); 1003 1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1005 .legalFor({S32}) 1006 .minScalar(0, S32) 1007 .widenScalarToNextPow2(0) 1008 .scalarize(0) 1009 .lower(); 1010 } 1011 1012 getActionDefinitionsBuilder(G_INTTOPTR) 1013 // List the common cases 1014 .legalForCartesianProduct(AddrSpaces64, {S64}) 1015 .legalForCartesianProduct(AddrSpaces32, {S32}) 1016 .scalarize(0) 1017 // Accept any address space as long as the size matches 1018 .legalIf(sameSize(0, 1)) 1019 .widenScalarIf(smallerThan(1, 0), 1020 [](const LegalityQuery &Query) { 1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1022 }) 1023 .narrowScalarIf(largerThan(1, 0), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_PTRTOINT) 1029 // List the common cases 1030 .legalForCartesianProduct(AddrSpaces64, {S64}) 1031 .legalForCartesianProduct(AddrSpaces32, {S32}) 1032 .scalarize(0) 1033 // Accept any address space as long as the size matches 1034 .legalIf(sameSize(0, 1)) 1035 .widenScalarIf(smallerThan(0, 1), 1036 [](const LegalityQuery &Query) { 1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1038 }) 1039 .narrowScalarIf( 1040 largerThan(0, 1), 1041 [](const LegalityQuery &Query) { 1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1043 }); 1044 1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1046 .scalarize(0) 1047 .custom(); 1048 1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1050 bool IsLoad) -> bool { 1051 const LLT DstTy = Query.Types[0]; 1052 1053 // Split vector extloads. 1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1055 1056 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1057 return true; 1058 1059 const LLT PtrTy = Query.Types[1]; 1060 unsigned AS = PtrTy.getAddressSpace(); 1061 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1062 return true; 1063 1064 // Catch weird sized loads that don't evenly divide into the access sizes 1065 // TODO: May be able to widen depending on alignment etc. 1066 unsigned NumRegs = (MemSize + 31) / 32; 1067 if (NumRegs == 3) { 1068 if (!ST.hasDwordx3LoadStores()) 1069 return true; 1070 } else { 1071 // If the alignment allows, these should have been widened. 1072 if (!isPowerOf2_32(NumRegs)) 1073 return true; 1074 } 1075 1076 return false; 1077 }; 1078 1079 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1080 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1081 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1082 1083 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1084 // LDS 1085 // TODO: Unsupported flat for SI. 1086 1087 for (unsigned Op : {G_LOAD, G_STORE}) { 1088 const bool IsStore = Op == G_STORE; 1089 1090 auto &Actions = getActionDefinitionsBuilder(Op); 1091 // Explicitly list some common cases. 1092 // TODO: Does this help compile time at all? 1093 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1094 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1095 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1096 {S64, GlobalPtr, S64, GlobalAlign32}, 1097 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1098 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1099 {S32, GlobalPtr, S8, GlobalAlign8}, 1100 {S32, GlobalPtr, S16, GlobalAlign16}, 1101 1102 {S32, LocalPtr, S32, 32}, 1103 {S64, LocalPtr, S64, 32}, 1104 {V2S32, LocalPtr, V2S32, 32}, 1105 {S32, LocalPtr, S8, 8}, 1106 {S32, LocalPtr, S16, 16}, 1107 {V2S16, LocalPtr, S32, 32}, 1108 1109 {S32, PrivatePtr, S32, 32}, 1110 {S32, PrivatePtr, S8, 8}, 1111 {S32, PrivatePtr, S16, 16}, 1112 {V2S16, PrivatePtr, S32, 32}, 1113 1114 {S32, ConstantPtr, S32, GlobalAlign32}, 1115 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1116 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1117 {S64, ConstantPtr, S64, GlobalAlign32}, 1118 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1119 Actions.legalIf( 1120 [=](const LegalityQuery &Query) -> bool { 1121 return isLoadStoreLegal(ST, Query); 1122 }); 1123 1124 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1125 // 64-bits. 1126 // 1127 // TODO: Should generalize bitcast action into coerce, which will also cover 1128 // inserting addrspacecasts. 1129 Actions.customIf(typeIs(1, Constant32Ptr)); 1130 1131 // Turn any illegal element vectors into something easier to deal 1132 // with. These will ultimately produce 32-bit scalar shifts to extract the 1133 // parts anyway. 1134 // 1135 // For odd 16-bit element vectors, prefer to split those into pieces with 1136 // 16-bit vector parts. 1137 Actions.bitcastIf( 1138 [=](const LegalityQuery &Query) -> bool { 1139 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1140 Query.MMODescrs[0].MemoryTy); 1141 }, bitcastToRegisterType(0)); 1142 1143 if (!IsStore) { 1144 // Widen suitably aligned loads by loading extra bytes. The standard 1145 // legalization actions can't properly express widening memory operands. 1146 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1147 return shouldWidenLoad(ST, Query, G_LOAD); 1148 }); 1149 } 1150 1151 // FIXME: load/store narrowing should be moved to lower action 1152 Actions 1153 .narrowScalarIf( 1154 [=](const LegalityQuery &Query) -> bool { 1155 return !Query.Types[0].isVector() && 1156 needToSplitMemOp(Query, Op == G_LOAD); 1157 }, 1158 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1159 const LLT DstTy = Query.Types[0]; 1160 const LLT PtrTy = Query.Types[1]; 1161 1162 const unsigned DstSize = DstTy.getSizeInBits(); 1163 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1164 1165 // Split extloads. 
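          // (e.g. an extending G_LOAD producing s64 from 32-bit memory is
          // narrowed to an s32 result here; the extension is expected to be
          // re-created afterwards.)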
1166 if (DstSize > MemSize) 1167 return std::make_pair(0, LLT::scalar(MemSize)); 1168 1169 unsigned MaxSize = maxSizeForAddrSpace(ST, 1170 PtrTy.getAddressSpace(), 1171 Op == G_LOAD); 1172 if (MemSize > MaxSize) 1173 return std::make_pair(0, LLT::scalar(MaxSize)); 1174 1175 unsigned Align = Query.MMODescrs[0].AlignInBits; 1176 return std::make_pair(0, LLT::scalar(Align)); 1177 }) 1178 .fewerElementsIf( 1179 [=](const LegalityQuery &Query) -> bool { 1180 return Query.Types[0].isVector() && 1181 needToSplitMemOp(Query, Op == G_LOAD); 1182 }, 1183 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1184 const LLT DstTy = Query.Types[0]; 1185 const LLT PtrTy = Query.Types[1]; 1186 1187 LLT EltTy = DstTy.getElementType(); 1188 unsigned MaxSize = maxSizeForAddrSpace(ST, 1189 PtrTy.getAddressSpace(), 1190 Op == G_LOAD); 1191 1192 // FIXME: Handle widened to power of 2 results better. This ends 1193 // up scalarizing. 1194 // FIXME: 3 element stores scalarized on SI 1195 1196 // Split if it's too large for the address space. 1197 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1198 if (MemSize > MaxSize) { 1199 unsigned NumElts = DstTy.getNumElements(); 1200 unsigned EltSize = EltTy.getSizeInBits(); 1201 1202 if (MaxSize % EltSize == 0) { 1203 return std::make_pair( 1204 0, LLT::scalarOrVector( 1205 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1206 } 1207 1208 unsigned NumPieces = MemSize / MaxSize; 1209 1210 // FIXME: Refine when odd breakdowns handled 1211 // The scalars will need to be re-legalized. 1212 if (NumPieces == 1 || NumPieces >= NumElts || 1213 NumElts % NumPieces != 0) 1214 return std::make_pair(0, EltTy); 1215 1216 return std::make_pair( 1217 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1218 } 1219 1220 // FIXME: We could probably handle weird extending loads better. 1221 if (DstTy.getSizeInBits() > MemSize) 1222 return std::make_pair(0, EltTy); 1223 1224 unsigned EltSize = EltTy.getSizeInBits(); 1225 unsigned DstSize = DstTy.getSizeInBits(); 1226 if (!isPowerOf2_32(DstSize)) { 1227 // We're probably decomposing an odd sized store. Try to split 1228 // to the widest type. TODO: Account for alignment. As-is it 1229 // should be OK, since the new parts will be further legalized. 1230 unsigned FloorSize = PowerOf2Floor(DstSize); 1231 return std::make_pair( 1232 0, LLT::scalarOrVector( 1233 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1234 } 1235 1236 // May need relegalization for the scalars. 1237 return std::make_pair(0, EltTy); 1238 }) 1239 .minScalar(0, S32) 1240 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1241 .widenScalarToNextPow2(0) 1242 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1243 .lower(); 1244 } 1245 1246 // FIXME: Unaligned accesses not lowered. 1247 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1248 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1249 {S32, GlobalPtr, S16, 2 * 8}, 1250 {S32, LocalPtr, S8, 8}, 1251 {S32, LocalPtr, S16, 16}, 1252 {S32, PrivatePtr, S8, 8}, 1253 {S32, PrivatePtr, S16, 16}, 1254 {S32, ConstantPtr, S8, 8}, 1255 {S32, ConstantPtr, S16, 2 * 8}}) 1256 .legalIf( 1257 [=](const LegalityQuery &Query) -> bool { 1258 return isLoadStoreLegal(ST, Query); 1259 }); 1260 1261 if (ST.hasFlatAddressSpace()) { 1262 ExtLoads.legalForTypesWithMemDesc( 1263 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1264 } 1265 1266 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1267 // 64-bits. 
1268 // 1269 // TODO: Should generalize bitcast action into coerce, which will also cover 1270 // inserting addrspacecasts. 1271 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1272 1273 ExtLoads.clampScalar(0, S32, S32) 1274 .widenScalarToNextPow2(0) 1275 .lower(); 1276 1277 auto &Atomics = getActionDefinitionsBuilder( 1278 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1279 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1280 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1281 G_ATOMICRMW_UMIN}) 1282 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1283 {S64, GlobalPtr}, {S64, LocalPtr}, 1284 {S32, RegionPtr}, {S64, RegionPtr}}); 1285 if (ST.hasFlatAddressSpace()) { 1286 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1287 } 1288 1289 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1290 if (ST.hasLDSFPAtomicAdd()) { 1291 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1292 if (ST.hasGFX90AInsts()) 1293 Atomic.legalFor({{S64, LocalPtr}}); 1294 } 1295 if (ST.hasAtomicFaddInsts()) 1296 Atomic.legalFor({{S32, GlobalPtr}}); 1297 1298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1299 // demarshalling 1300 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1301 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1302 {S32, FlatPtr}, {S64, FlatPtr}}) 1303 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1304 {S32, RegionPtr}, {S64, RegionPtr}}); 1305 // TODO: Pointer types, any 32-bit or 64-bit vector 1306 1307 // Condition should be s32 for scalar, s1 for vector. 1308 getActionDefinitionsBuilder(G_SELECT) 1309 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1310 LocalPtr, FlatPtr, PrivatePtr, 1311 LLT::fixed_vector(2, LocalPtr), 1312 LLT::fixed_vector(2, PrivatePtr)}, 1313 {S1, S32}) 1314 .clampScalar(0, S16, S64) 1315 .scalarize(1) 1316 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1317 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1318 .clampMaxNumElements(0, S32, 2) 1319 .clampMaxNumElements(0, LocalPtr, 2) 1320 .clampMaxNumElements(0, PrivatePtr, 2) 1321 .scalarize(0) 1322 .widenScalarToNextPow2(0) 1323 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1324 1325 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1326 // be more flexible with the shift amount type. 1327 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1328 .legalFor({{S32, S32}, {S64, S32}}); 1329 if (ST.has16BitInsts()) { 1330 if (ST.hasVOP3PInsts()) { 1331 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1332 .clampMaxNumElements(0, S16, 2); 1333 } else 1334 Shifts.legalFor({{S16, S16}}); 1335 1336 // TODO: Support 16-bit shift amounts for all types 1337 Shifts.widenScalarIf( 1338 [=](const LegalityQuery &Query) { 1339 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1340 // 32-bit amount. 1341 const LLT ValTy = Query.Types[0]; 1342 const LLT AmountTy = Query.Types[1]; 1343 return ValTy.getSizeInBits() <= 16 && 1344 AmountTy.getSizeInBits() < 16; 1345 }, changeTo(1, S16)); 1346 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1347 Shifts.clampScalar(1, S32, S32); 1348 Shifts.clampScalar(0, S16, S64); 1349 Shifts.widenScalarToNextPow2(0, 16); 1350 1351 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1352 .minScalar(0, S16) 1353 .scalarize(0) 1354 .lower(); 1355 } else { 1356 // Make sure we legalize the shift amount type first, as the general 1357 // expansion for the shifted type will produce much worse code if it hasn't 1358 // been truncated already. 
1359 Shifts.clampScalar(1, S32, S32); 1360 Shifts.clampScalar(0, S32, S64); 1361 Shifts.widenScalarToNextPow2(0, 32); 1362 1363 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1364 .minScalar(0, S32) 1365 .scalarize(0) 1366 .lower(); 1367 } 1368 Shifts.scalarize(0); 1369 1370 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1371 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1372 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1373 unsigned IdxTypeIdx = 2; 1374 1375 getActionDefinitionsBuilder(Op) 1376 .customIf([=](const LegalityQuery &Query) { 1377 const LLT EltTy = Query.Types[EltTypeIdx]; 1378 const LLT VecTy = Query.Types[VecTypeIdx]; 1379 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1380 const unsigned EltSize = EltTy.getSizeInBits(); 1381 return (EltSize == 32 || EltSize == 64) && 1382 VecTy.getSizeInBits() % 32 == 0 && 1383 VecTy.getSizeInBits() <= MaxRegisterSize && 1384 IdxTy.getSizeInBits() == 32; 1385 }) 1386 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1387 bitcastToVectorElement32(VecTypeIdx)) 1388 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1389 .bitcastIf( 1390 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1391 [=](const LegalityQuery &Query) { 1392 // For > 64-bit element types, try to turn this into a 64-bit 1393 // element vector since we may be able to do better indexing 1394 // if this is scalar. If not, fall back to 32. 1395 const LLT EltTy = Query.Types[EltTypeIdx]; 1396 const LLT VecTy = Query.Types[VecTypeIdx]; 1397 const unsigned DstEltSize = EltTy.getSizeInBits(); 1398 const unsigned VecSize = VecTy.getSizeInBits(); 1399 1400 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1401 return std::make_pair( 1402 VecTypeIdx, 1403 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1404 }) 1405 .clampScalar(EltTypeIdx, S32, S64) 1406 .clampScalar(VecTypeIdx, S32, S64) 1407 .clampScalar(IdxTypeIdx, S32, S32) 1408 .clampMaxNumElements(VecTypeIdx, S32, 32) 1409 // TODO: Clamp elements for 64-bit vectors? 1410 // It should only be necessary with variable indexes. 1411 // As a last resort, lower to the stack 1412 .lower(); 1413 } 1414 1415 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1416 .unsupportedIf([=](const LegalityQuery &Query) { 1417 const LLT &EltTy = Query.Types[1].getElementType(); 1418 return Query.Types[0] != EltTy; 1419 }); 1420 1421 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1422 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1423 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1424 1425 // FIXME: Doesn't handle extract of illegal sizes. 1426 getActionDefinitionsBuilder(Op) 1427 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1428 .lowerIf([=](const LegalityQuery &Query) { 1429 // Sub-vector(or single element) insert and extract. 1430 // TODO: verify immediate offset here since lower only works with 1431 // whole elements. 1432 const LLT BigTy = Query.Types[BigTyIdx]; 1433 return BigTy.isVector(); 1434 }) 1435 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
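  // (The s32 case is selectable directly, e.g. as v_alignbit_b32, which is
  // why only the {S32, S32} form is kept legal below.)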
1612 getActionDefinitionsBuilder(G_FSHR) 1613 .legalFor({{S32, S32}}) 1614 .lowerFor({{V2S16, V2S16}}) 1615 .clampMaxNumElementsStrict(0, S16, 2) 1616 .scalarize(0) 1617 .lower(); 1618 1619 if (ST.hasVOP3PInsts()) { 1620 getActionDefinitionsBuilder(G_FSHL) 1621 .lowerFor({{V2S16, V2S16}}) 1622 .clampMaxNumElementsStrict(0, S16, 2) 1623 .scalarize(0) 1624 .lower(); 1625 } else { 1626 getActionDefinitionsBuilder(G_FSHL) 1627 .scalarize(0) 1628 .lower(); 1629 } 1630 1631 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1632 .legalFor({S64}); 1633 1634 getActionDefinitionsBuilder(G_FENCE) 1635 .alwaysLegal(); 1636 1637 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1638 .scalarize(0) 1639 .minScalar(0, S32) 1640 .lower(); 1641 1642 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1643 .legalFor({{S32, S32}, {S64, S32}}) 1644 .clampScalar(1, S32, S32) 1645 .clampScalar(0, S32, S64) 1646 .widenScalarToNextPow2(0) 1647 .scalarize(0); 1648 1649 getActionDefinitionsBuilder({ 1650 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1651 G_FCOPYSIGN, 1652 1653 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1654 G_ATOMICRMW_NAND, 1655 G_ATOMICRMW_FSUB, 1656 G_READ_REGISTER, 1657 G_WRITE_REGISTER, 1658 1659 G_SADDO, G_SSUBO, 1660 1661 // TODO: Implement 1662 G_FMINIMUM, G_FMAXIMUM}).lower(); 1663 1664 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1665 .lower(); 1666 1667 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1668 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1669 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1670 .unsupported(); 1671 1672 getLegacyLegalizerInfo().computeTables(); 1673 verify(*ST.getInstrInfo()); 1674 } 1675 1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1677 MachineInstr &MI) const { 1678 MachineIRBuilder &B = Helper.MIRBuilder; 1679 MachineRegisterInfo &MRI = *B.getMRI(); 1680 1681 switch (MI.getOpcode()) { 1682 case TargetOpcode::G_ADDRSPACE_CAST: 1683 return legalizeAddrSpaceCast(MI, MRI, B); 1684 case TargetOpcode::G_FRINT: 1685 return legalizeFrint(MI, MRI, B); 1686 case TargetOpcode::G_FCEIL: 1687 return legalizeFceil(MI, MRI, B); 1688 case TargetOpcode::G_FREM: 1689 return legalizeFrem(MI, MRI, B); 1690 case TargetOpcode::G_INTRINSIC_TRUNC: 1691 return legalizeIntrinsicTrunc(MI, MRI, B); 1692 case TargetOpcode::G_SITOFP: 1693 return legalizeITOFP(MI, MRI, B, true); 1694 case TargetOpcode::G_UITOFP: 1695 return legalizeITOFP(MI, MRI, B, false); 1696 case TargetOpcode::G_FPTOSI: 1697 return legalizeFPTOI(MI, MRI, B, true); 1698 case TargetOpcode::G_FPTOUI: 1699 return legalizeFPTOI(MI, MRI, B, false); 1700 case TargetOpcode::G_FMINNUM: 1701 case TargetOpcode::G_FMAXNUM: 1702 case TargetOpcode::G_FMINNUM_IEEE: 1703 case TargetOpcode::G_FMAXNUM_IEEE: 1704 return legalizeMinNumMaxNum(Helper, MI); 1705 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1706 return legalizeExtractVectorElt(MI, MRI, B); 1707 case TargetOpcode::G_INSERT_VECTOR_ELT: 1708 return legalizeInsertVectorElt(MI, MRI, B); 1709 case TargetOpcode::G_SHUFFLE_VECTOR: 1710 return legalizeShuffleVector(MI, MRI, B); 1711 case TargetOpcode::G_FSIN: 1712 case TargetOpcode::G_FCOS: 1713 return legalizeSinCos(MI, MRI, B); 1714 case TargetOpcode::G_GLOBAL_VALUE: 1715 return legalizeGlobalValue(MI, MRI, B); 1716 case TargetOpcode::G_LOAD: 1717 case TargetOpcode::G_SEXTLOAD: 1718 case TargetOpcode::G_ZEXTLOAD: 1719 return legalizeLoad(Helper, MI); 1720 case TargetOpcode::G_FMAD: 1721 return legalizeFMad(MI, MRI, B); 1722 case TargetOpcode::G_FDIV: 1723 return legalizeFDIV(MI, 
MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 case TargetOpcode::G_CTLZ: 1747 case TargetOpcode::G_CTTZ: 1748 return legalizeCTLZ_CTTZ(MI, MRI, B); 1749 default: 1750 return false; 1751 } 1752 1753 llvm_unreachable("expected switch to return"); 1754 } 1755 1756 Register AMDGPULegalizerInfo::getSegmentAperture( 1757 unsigned AS, 1758 MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1762 const LLT S32 = LLT::scalar(32); 1763 1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1765 1766 if (ST.hasApertureRegs()) { 1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1768 // getreg. 1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1775 unsigned Encoding = 1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1779 1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1781 1782 B.buildInstr(AMDGPU::S_GETREG_B32) 1783 .addDef(GetReg) 1784 .addImm(Encoding); 1785 MRI.setType(GetReg, S32); 1786 1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1789 } 1790 1791 Register QueuePtr = MRI.createGenericVirtualRegister( 1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1793 1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1795 return Register(); 1796 1797 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1798 // private_segment_aperture_base_hi. 1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1800 1801 // TODO: can we be smarter about machine pointer info? 
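  // Rough shape of the sequence emitted below for the queue-pointer path:
  //   %addr     = G_PTR_ADD %queue_ptr, 0x40 (local) or 0x44 (private)
  //   %aperture = G_LOAD %addr :: (dereferenceable invariant load (s32))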
1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 PtrInfo, 1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1806 MachineMemOperand::MOInvariant, 1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1808 1809 Register LoadAddr; 1810 1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 MachineFunction &MF = B.getMF(); 1819 1820 const LLT S32 = LLT::scalar(32); 1821 Register Dst = MI.getOperand(0).getReg(); 1822 Register Src = MI.getOperand(1).getReg(); 1823 1824 LLT DstTy = MRI.getType(Dst); 1825 LLT SrcTy = MRI.getType(Src); 1826 unsigned DestAS = DstTy.getAddressSpace(); 1827 unsigned SrcAS = SrcTy.getAddressSpace(); 1828 1829 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1830 // vector element. 1831 assert(!DstTy.isVector()); 1832 1833 const AMDGPUTargetMachine &TM 1834 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1835 1836 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1837 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1838 return true; 1839 } 1840 1841 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1842 // Truncate. 1843 B.buildExtract(Dst, Src, 0); 1844 MI.eraseFromParent(); 1845 return true; 1846 } 1847 1848 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1849 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1850 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1851 1852 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1853 // another. Merge operands are required to be the same type, but creating an 1854 // extra ptrtoint would be kind of pointless. 1855 auto HighAddr = B.buildConstant( 1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1857 B.buildMerge(Dst, {Src, HighAddr}); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1863 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1864 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1865 unsigned NullVal = TM.getNullPointerValue(DestAS); 1866 1867 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1868 auto FlatNull = B.buildConstant(SrcTy, 0); 1869 1870 // Extract low 32-bits of the pointer. 1871 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1872 1873 auto CmpRes = 1874 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1875 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1876 1877 MI.eraseFromParent(); 1878 return true; 1879 } 1880 1881 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1882 return false; 1883 1884 if (!ST.hasFlatAddressSpace()) 1885 return false; 1886 1887 auto SegmentNull = 1888 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1889 auto FlatNull = 1890 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1891 1892 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1893 if (!ApertureReg.isValid()) 1894 return false; 1895 1896 auto CmpRes = 1897 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1898 1899 // Coerce the type of the low half of the result so we can use merge_values. 
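  // Rough shape of the segment -> flat expansion built here:
  //   %lo   = G_PTRTOINT %src                 ; low 32 bits of the pointer
  //   %flat = G_MERGE_VALUES %lo, %aperture   ; aperture supplies the high half
  //   %dst  = G_SELECT (%src != segment null), %flat, flat null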
1900 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1901 1902 // TODO: Should we allow mismatched types but matching sizes in merges to 1903 // avoid the ptrtoint? 1904 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1905 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1906 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFrint( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 Register Src = MI.getOperand(1).getReg(); 1915 LLT Ty = MRI.getType(Src); 1916 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1917 1918 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1919 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1920 1921 auto C1 = B.buildFConstant(Ty, C1Val); 1922 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1923 1924 // TODO: Should this propagate fast-math-flags? 1925 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1926 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1927 1928 auto C2 = B.buildFConstant(Ty, C2Val); 1929 auto Fabs = B.buildFAbs(Ty, Src); 1930 1931 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1932 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1933 MI.eraseFromParent(); 1934 return true; 1935 } 1936 1937 bool AMDGPULegalizerInfo::legalizeFceil( 1938 MachineInstr &MI, MachineRegisterInfo &MRI, 1939 MachineIRBuilder &B) const { 1940 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S64 = LLT::scalar(64); 1943 1944 Register Src = MI.getOperand(1).getReg(); 1945 assert(MRI.getType(Src) == S64); 1946 1947 // result = trunc(src) 1948 // if (src > 0.0 && src != result) 1949 // result += 1.0 1950 1951 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1952 1953 const auto Zero = B.buildFConstant(S64, 0.0); 1954 const auto One = B.buildFConstant(S64, 1.0); 1955 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1956 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1957 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1958 auto Add = B.buildSelect(S64, And, One, Zero); 1959 1960 // TODO: Should this propagate fast-math-flags? 
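  // Worked example: src = -1.5 gives trunc = -1.0, (src > 0.0) is false, so
  // add = 0.0 and the result is -1.0; src = 1.5 gives trunc = 1.0, add = 1.0,
  // result = 2.0.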
1961 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1962 return true; 1963 } 1964 1965 bool AMDGPULegalizerInfo::legalizeFrem( 1966 MachineInstr &MI, MachineRegisterInfo &MRI, 1967 MachineIRBuilder &B) const { 1968 Register DstReg = MI.getOperand(0).getReg(); 1969 Register Src0Reg = MI.getOperand(1).getReg(); 1970 Register Src1Reg = MI.getOperand(2).getReg(); 1971 auto Flags = MI.getFlags(); 1972 LLT Ty = MRI.getType(DstReg); 1973 1974 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1975 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1976 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1977 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1978 MI.eraseFromParent(); 1979 return true; 1980 } 1981 1982 static MachineInstrBuilder extractF64Exponent(Register Hi, 1983 MachineIRBuilder &B) { 1984 const unsigned FractBits = 52; 1985 const unsigned ExpBits = 11; 1986 LLT S32 = LLT::scalar(32); 1987 1988 auto Const0 = B.buildConstant(S32, FractBits - 32); 1989 auto Const1 = B.buildConstant(S32, ExpBits); 1990 1991 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1992 .addUse(Hi) 1993 .addUse(Const0.getReg(0)) 1994 .addUse(Const1.getReg(0)); 1995 1996 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1997 } 1998 1999 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2000 MachineInstr &MI, MachineRegisterInfo &MRI, 2001 MachineIRBuilder &B) const { 2002 const LLT S1 = LLT::scalar(1); 2003 const LLT S32 = LLT::scalar(32); 2004 const LLT S64 = LLT::scalar(64); 2005 2006 Register Src = MI.getOperand(1).getReg(); 2007 assert(MRI.getType(Src) == S64); 2008 2009 // TODO: Should this use extract since the low half is unused? 2010 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2011 Register Hi = Unmerge.getReg(1); 2012 2013 // Extract the upper half, since this is where we will find the sign and 2014 // exponent. 2015 auto Exp = extractF64Exponent(Hi, B); 2016 2017 const unsigned FractBits = 52; 2018 2019 // Extract the sign bit. 2020 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2021 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2022 2023 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2024 2025 const auto Zero32 = B.buildConstant(S32, 0); 2026 2027 // Extend back to 64-bits. 2028 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2029 2030 auto Shr = B.buildAShr(S64, FractMask, Exp); 2031 auto Not = B.buildNot(S64, Shr); 2032 auto Tmp0 = B.buildAnd(S64, Src, Not); 2033 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2034 2035 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2036 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2037 2038 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2039 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2040 MI.eraseFromParent(); 2041 return true; 2042 } 2043 2044 bool AMDGPULegalizerInfo::legalizeITOFP( 2045 MachineInstr &MI, MachineRegisterInfo &MRI, 2046 MachineIRBuilder &B, bool Signed) const { 2047 2048 Register Dst = MI.getOperand(0).getReg(); 2049 Register Src = MI.getOperand(1).getReg(); 2050 2051 const LLT S64 = LLT::scalar(64); 2052 const LLT S32 = LLT::scalar(32); 2053 2054 assert(MRI.getType(Src) == S64); 2055 2056 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2057 auto ThirtyTwo = B.buildConstant(S32, 32); 2058 2059 if (MRI.getType(Dst) == S64) { 2060 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2061 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2062 2063 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2064 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2065 .addUse(CvtHi.getReg(0)) 2066 .addUse(ThirtyTwo.getReg(0)); 2067 2068 // TODO: Should this propagate fast-math-flags? 2069 B.buildFAdd(Dst, LdExp, CvtLo); 2070 MI.eraseFromParent(); 2071 return true; 2072 } 2073 2074 assert(MRI.getType(Dst) == S32); 2075 2076 auto One = B.buildConstant(S32, 1); 2077 2078 MachineInstrBuilder ShAmt; 2079 if (Signed) { 2080 auto ThirtyOne = B.buildConstant(S32, 31); 2081 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2082 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2083 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2084 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2085 /*HasSideEffects=*/false) 2086 .addUse(Unmerge.getReg(1)); 2087 auto LS2 = B.buildSub(S32, LS, One); 2088 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2089 } else 2090 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2091 auto Norm = B.buildShl(S64, Src, ShAmt); 2092 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2093 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2094 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2095 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2096 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2097 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2098 /*HasSideEffects=*/false) 2099 .addUse(FVal.getReg(0)) 2100 .addUse(Scale.getReg(0)); 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 // TODO: Copied from DAG implementation. Verify logic and document how this 2106 // actually works. 2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2108 MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B, 2110 bool Signed) const { 2111 2112 Register Dst = MI.getOperand(0).getReg(); 2113 Register Src = MI.getOperand(1).getReg(); 2114 2115 const LLT S64 = LLT::scalar(64); 2116 const LLT S32 = LLT::scalar(32); 2117 2118 const LLT SrcLT = MRI.getType(Src); 2119 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2120 2121 unsigned Flags = MI.getFlags(); 2122 2123 // The basic idea of converting a floating point number into a pair of 32-bit 2124 // integers is illustrated as follows: 2125 // 2126 // tf := trunc(val); 2127 // hif := floor(tf * 2^-32); 2128 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2129 // hi := fptoi(hif); 2130 // lo := fptoi(lof); 2131 // 2132 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2133 MachineInstrBuilder Sign; 2134 if (Signed && SrcLT == S32) { 2135 // However, a 32-bit floating point number has only 23 bits mantissa and 2136 // it's not enough to hold all the significant bits of `lof` if val is 2137 // negative. To avoid the loss of precision, We need to take the absolute 2138 // value after truncating and flip the result back based on the original 2139 // signedness. 
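    // e.g. for val = -7.9: sign = src >> 31 is all ones, |trunc(val)| = 7.0 is
    // converted as unsigned, and the final (x ^ sign) - sign below restores -7.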
2140 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2141 Trunc = B.buildFAbs(S32, Trunc, Flags); 2142 } 2143 MachineInstrBuilder K0, K1; 2144 if (SrcLT == S64) { 2145 K0 = B.buildFConstant(S64, 2146 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2147 K1 = B.buildFConstant(S64, 2148 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2149 } else { 2150 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2151 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2152 } 2153 2154 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2155 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2156 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2157 2158 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2159 : B.buildFPTOUI(S32, FloorMul); 2160 auto Lo = B.buildFPTOUI(S32, Fma); 2161 2162 if (Signed && SrcLT == S32) { 2163 // Flip the result based on the signedness, which is either all 0s or 1s. 2164 Sign = B.buildMerge(S64, {Sign, Sign}); 2165 // r := xor({lo, hi}, sign) - sign; 2166 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2167 } else 2168 B.buildMerge(Dst, {Lo, Hi}); 2169 MI.eraseFromParent(); 2170 2171 return true; 2172 } 2173 2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2175 MachineInstr &MI) const { 2176 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2177 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2178 2179 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2180 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2181 2182 // With ieee_mode disabled, the instructions have the correct behavior 2183 // already for G_FMINNUM/G_FMAXNUM 2184 if (!MFI->getMode().IEEE) 2185 return !IsIEEEOp; 2186 2187 if (IsIEEEOp) 2188 return true; 2189 2190 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2191 } 2192 2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2194 MachineInstr &MI, MachineRegisterInfo &MRI, 2195 MachineIRBuilder &B) const { 2196 // TODO: Should move some of this into LegalizerHelper. 2197 2198 // TODO: Promote dynamic indexing of s16 to s32 2199 2200 // FIXME: Artifact combiner probably should have replaced the truncated 2201 // constant before this, so we shouldn't need 2202 // getIConstantVRegValWithLookThrough. 2203 Optional<ValueAndVReg> MaybeIdxVal = 2204 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2205 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2206 return true; 2207 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2208 2209 Register Dst = MI.getOperand(0).getReg(); 2210 Register Vec = MI.getOperand(1).getReg(); 2211 2212 LLT VecTy = MRI.getType(Vec); 2213 LLT EltTy = VecTy.getElementType(); 2214 assert(EltTy == MRI.getType(Dst)); 2215 2216 if (IdxVal < VecTy.getNumElements()) { 2217 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2218 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2219 } else { 2220 B.buildUndef(Dst); 2221 } 2222 2223 MI.eraseFromParent(); 2224 return true; 2225 } 2226 2227 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2228 MachineInstr &MI, MachineRegisterInfo &MRI, 2229 MachineIRBuilder &B) const { 2230 // TODO: Should move some of this into LegalizerHelper. 2231 2232 // TODO: Promote dynamic indexing of s16 to s32 2233 2234 // FIXME: Artifact combiner probably should have replaced the truncated 2235 // constant before this, so we shouldn't need 2236 // getIConstantVRegValWithLookThrough. 
2236 // getIConstantVRegValWithLookThrough.
2237 Optional<ValueAndVReg> MaybeIdxVal = 2238 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2239 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2240 return true; 2241 2242 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2243 Register Dst = MI.getOperand(0).getReg(); 2244 Register Vec = MI.getOperand(1).getReg(); 2245 Register Ins = MI.getOperand(2).getReg(); 2246 2247 LLT VecTy = MRI.getType(Vec); 2248 LLT EltTy = VecTy.getElementType(); 2249 assert(EltTy == MRI.getType(Ins)); 2250 (void)Ins; 2251 2252 unsigned NumElts = VecTy.getNumElements(); 2253 if (IdxVal < NumElts) { 2254 SmallVector<Register, 8> SrcRegs; 2255 for (unsigned i = 0; i < NumElts; ++i) 2256 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2257 B.buildUnmerge(SrcRegs, Vec); 2258 2259 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2260 B.buildMerge(Dst, SrcRegs); 2261 } else { 2262 B.buildUndef(Dst); 2263 } 2264 2265 MI.eraseFromParent(); 2266 return true; 2267 } 2268 2269 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2270 MachineInstr &MI, MachineRegisterInfo &MRI, 2271 MachineIRBuilder &B) const { 2272 const LLT V2S16 = LLT::fixed_vector(2, 16); 2273 2274 Register Dst = MI.getOperand(0).getReg(); 2275 Register Src0 = MI.getOperand(1).getReg(); 2276 LLT DstTy = MRI.getType(Dst); 2277 LLT SrcTy = MRI.getType(Src0); 2278 2279 if (SrcTy == V2S16 && DstTy == V2S16 && 2280 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2281 return true; 2282 2283 MachineIRBuilder HelperBuilder(MI); 2284 GISelObserverWrapper DummyObserver; 2285 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2286 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2287 } 2288 2289 bool AMDGPULegalizerInfo::legalizeSinCos( 2290 MachineInstr &MI, MachineRegisterInfo &MRI, 2291 MachineIRBuilder &B) const { 2292 2293 Register DstReg = MI.getOperand(0).getReg(); 2294 Register SrcReg = MI.getOperand(1).getReg(); 2295 LLT Ty = MRI.getType(DstReg); 2296 unsigned Flags = MI.getFlags(); 2297 2298 Register TrigVal; 2299 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2300 if (ST.hasTrigReducedRange()) { 2301 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2302 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2303 .addUse(MulVal.getReg(0)) 2304 .setMIFlags(Flags).getReg(0); 2305 } else 2306 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2307 2308 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2309 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2310 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2311 .addUse(TrigVal) 2312 .setMIFlags(Flags); 2313 MI.eraseFromParent(); 2314 return true; 2315 } 2316 2317 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2318 MachineIRBuilder &B, 2319 const GlobalValue *GV, 2320 int64_t Offset, 2321 unsigned GAFlags) const { 2322 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2323 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2324 // to the following code sequence: 2325 // 2326 // For constant address space: 2327 // s_getpc_b64 s[0:1] 2328 // s_add_u32 s0, s0, $symbol 2329 // s_addc_u32 s1, s1, 0 2330 // 2331 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2332 // a fixup or relocation is emitted to replace $symbol with a literal 2333 // constant, which is a pc-relative offset from the encoding of the $symbol 2334 // operand to the global variable. 2335 // 2336 // For global address space: 2337 // s_getpc_b64 s[0:1] 2338 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2339 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2340 // 2341 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2342 // fixups or relocations are emitted to replace $symbol@*@lo and 2343 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2344 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2345 // operand to the global variable. 2346 // 2347 // What we want here is an offset from the value returned by s_getpc 2348 // (which is the address of the s_add_u32 instruction) to the global 2349 // variable, but since the encoding of $symbol starts 4 bytes after the start 2350 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2351 // small. This requires us to add 4 to the global variable offset in order to 2352 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2353 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2354 // instruction. 2355 2356 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2357 2358 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2359 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2360 2361 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2362 .addDef(PCReg); 2363 2364 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2365 if (GAFlags == SIInstrInfo::MO_NONE) 2366 MIB.addImm(0); 2367 else 2368 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2369 2370 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2371 2372 if (PtrTy.getSizeInBits() == 32) 2373 B.buildExtract(DstReg, PCReg, 0); 2374 return true; 2375 } 2376 2377 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2378 MachineInstr &MI, MachineRegisterInfo &MRI, 2379 MachineIRBuilder &B) const { 2380 Register DstReg = MI.getOperand(0).getReg(); 2381 LLT Ty = MRI.getType(DstReg); 2382 unsigned AS = Ty.getAddressSpace(); 2383 2384 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2385 MachineFunction &MF = B.getMF(); 2386 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2387 2388 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2389 if (!MFI->isModuleEntryFunction() && 2390 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2391 const Function &Fn = MF.getFunction(); 2392 DiagnosticInfoUnsupported BadLDSDecl( 2393 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2394 DS_Warning); 2395 Fn.getContext().diagnose(BadLDSDecl); 2396 2397 // We currently don't have a way to correctly allocate LDS objects that 2398 // aren't directly associated with a kernel. We do force inlining of 2399 // functions that use local objects. However, if these dead functions are 2400 // not eliminated, we don't want a compile time error. Just emit a warning 2401 // and a trap, since there should be no callable path here. 2402 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2403 B.buildUndef(DstReg); 2404 MI.eraseFromParent(); 2405 return true; 2406 } 2407 2408 // TODO: We could emit code to handle the initialization somewhere. 2409 // We ignore the initializer for now and legalize it to allow selection. 2410 // The initializer will anyway get errored out during assembly emission. 2411 const SITargetLowering *TLI = ST.getTargetLowering(); 2412 if (!TLI->shouldUseLDSConstAddress(GV)) { 2413 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2414 return true; // Leave in place; 2415 } 2416 2417 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2418 Type *Ty = GV->getValueType(); 2419 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2420 // zero-sized type in other languages to declare the dynamic shared 2421 // memory which size is not known at the compile time. They will be 2422 // allocated by the runtime and placed directly after the static 2423 // allocated ones. They all share the same offset. 2424 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2425 // Adjust alignment for that dynamic shared memory array. 
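        // Note: the address produced below is simply the total static LDS
        // size, since the dynamic array begins where static LDS ends, roughly:
        //   %sz = llvm.amdgcn.groupstaticsize ; %dst = G_INTTOPTR %sz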
2426 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2427 LLT S32 = LLT::scalar(32); 2428 auto Sz = 2429 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2430 B.buildIntToPtr(DstReg, Sz); 2431 MI.eraseFromParent(); 2432 return true; 2433 } 2434 } 2435 2436 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2437 *cast<GlobalVariable>(GV))); 2438 MI.eraseFromParent(); 2439 return true; 2440 } 2441 2442 const SITargetLowering *TLI = ST.getTargetLowering(); 2443 2444 if (TLI->shouldEmitFixup(GV)) { 2445 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2446 MI.eraseFromParent(); 2447 return true; 2448 } 2449 2450 if (TLI->shouldEmitPCReloc(GV)) { 2451 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2457 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2458 2459 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2460 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2461 MachinePointerInfo::getGOT(MF), 2462 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2463 MachineMemOperand::MOInvariant, 2464 LoadTy, Align(8)); 2465 2466 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2467 2468 if (Ty.getSizeInBits() == 32) { 2469 // Truncate if this is a 32-bit constant address. 2470 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2471 B.buildExtract(DstReg, Load, 0); 2472 } else 2473 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2474 2475 MI.eraseFromParent(); 2476 return true; 2477 } 2478 2479 static LLT widenToNextPowerOf2(LLT Ty) { 2480 if (Ty.isVector()) 2481 return Ty.changeElementCount( 2482 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2483 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2484 } 2485 2486 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2487 MachineInstr &MI) const { 2488 MachineIRBuilder &B = Helper.MIRBuilder; 2489 MachineRegisterInfo &MRI = *B.getMRI(); 2490 GISelChangeObserver &Observer = Helper.Observer; 2491 2492 Register PtrReg = MI.getOperand(1).getReg(); 2493 LLT PtrTy = MRI.getType(PtrReg); 2494 unsigned AddrSpace = PtrTy.getAddressSpace(); 2495 2496 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2497 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2498 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2499 Observer.changingInstr(MI); 2500 MI.getOperand(1).setReg(Cast.getReg(0)); 2501 Observer.changedInstr(MI); 2502 return true; 2503 } 2504 2505 if (MI.getOpcode() != AMDGPU::G_LOAD) 2506 return false; 2507 2508 Register ValReg = MI.getOperand(0).getReg(); 2509 LLT ValTy = MRI.getType(ValReg); 2510 2511 MachineMemOperand *MMO = *MI.memoperands_begin(); 2512 const unsigned ValSize = ValTy.getSizeInBits(); 2513 const LLT MemTy = MMO->getMemoryType(); 2514 const Align MemAlign = MMO->getAlign(); 2515 const unsigned MemSize = MemTy.getSizeInBits(); 2516 const unsigned AlignInBits = 8 * MemAlign.value(); 2517 2518 // Widen non-power-of-2 loads to the alignment if needed 2519 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2520 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2521 2522 // This was already the correct extending load result type, so just adjust 2523 // the memory type. 
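    // e.g. an s32 result loaded from a 3-byte (s24) memory type becomes a
    // plain 4-byte (s32) load when the alignment allows touching the extra
    // byte.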
2524 if (WideMemSize == ValSize) { 2525 MachineFunction &MF = B.getMF(); 2526 2527 MachineMemOperand *WideMMO = 2528 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2529 Observer.changingInstr(MI); 2530 MI.setMemRefs(MF, {WideMMO}); 2531 Observer.changedInstr(MI); 2532 return true; 2533 } 2534 2535 // Don't bother handling edge case that should probably never be produced. 2536 if (ValSize > WideMemSize) 2537 return false; 2538 2539 LLT WideTy = widenToNextPowerOf2(ValTy); 2540 2541 Register WideLoad; 2542 if (!WideTy.isVector()) { 2543 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2544 B.buildTrunc(ValReg, WideLoad).getReg(0); 2545 } else { 2546 // Extract the subvector. 2547 2548 if (isRegisterType(ValTy)) { 2549 // If this a case where G_EXTRACT is legal, use it. 2550 // (e.g. <3 x s32> -> <4 x s32>) 2551 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2552 B.buildExtract(ValReg, WideLoad, 0); 2553 } else { 2554 // For cases where the widened type isn't a nice register value, unmerge 2555 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2556 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2557 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2558 } 2559 } 2560 2561 MI.eraseFromParent(); 2562 return true; 2563 } 2564 2565 return false; 2566 } 2567 2568 bool AMDGPULegalizerInfo::legalizeFMad( 2569 MachineInstr &MI, MachineRegisterInfo &MRI, 2570 MachineIRBuilder &B) const { 2571 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2572 assert(Ty.isScalar()); 2573 2574 MachineFunction &MF = B.getMF(); 2575 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2576 2577 // TODO: Always legal with future ftz flag. 2578 // FIXME: Do we need just output? 2579 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2580 return true; 2581 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2582 return true; 2583 2584 MachineIRBuilder HelperBuilder(MI); 2585 GISelObserverWrapper DummyObserver; 2586 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2587 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2591 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2592 Register DstReg = MI.getOperand(0).getReg(); 2593 Register PtrReg = MI.getOperand(1).getReg(); 2594 Register CmpVal = MI.getOperand(2).getReg(); 2595 Register NewVal = MI.getOperand(3).getReg(); 2596 2597 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2598 "this should not have been custom lowered"); 2599 2600 LLT ValTy = MRI.getType(CmpVal); 2601 LLT VecTy = LLT::fixed_vector(2, ValTy); 2602 2603 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2604 2605 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2606 .addDef(DstReg) 2607 .addUse(PtrReg) 2608 .addUse(PackedVal) 2609 .setMemRefs(MI.memoperands()); 2610 2611 MI.eraseFromParent(); 2612 return true; 2613 } 2614 2615 bool AMDGPULegalizerInfo::legalizeFlog( 2616 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2617 Register Dst = MI.getOperand(0).getReg(); 2618 Register Src = MI.getOperand(1).getReg(); 2619 LLT Ty = B.getMRI()->getType(Dst); 2620 unsigned Flags = MI.getFlags(); 2621 2622 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2623 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2624 2625 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2626 
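  // Dst now holds log2(Src) * Log2BaseInverted, i.e. ln(x) for G_FLOG (scale
  // ln(2)) or log10(x) for G_FLOG10 (scale ln(2)/ln(10)).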
MI.eraseFromParent(); 2627 return true; 2628 } 2629 2630 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2631 MachineIRBuilder &B) const { 2632 Register Dst = MI.getOperand(0).getReg(); 2633 Register Src = MI.getOperand(1).getReg(); 2634 unsigned Flags = MI.getFlags(); 2635 LLT Ty = B.getMRI()->getType(Dst); 2636 2637 auto K = B.buildFConstant(Ty, numbers::log2e); 2638 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2639 B.buildFExp2(Dst, Mul, Flags); 2640 MI.eraseFromParent(); 2641 return true; 2642 } 2643 2644 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2645 MachineIRBuilder &B) const { 2646 Register Dst = MI.getOperand(0).getReg(); 2647 Register Src0 = MI.getOperand(1).getReg(); 2648 Register Src1 = MI.getOperand(2).getReg(); 2649 unsigned Flags = MI.getFlags(); 2650 LLT Ty = B.getMRI()->getType(Dst); 2651 const LLT S16 = LLT::scalar(16); 2652 const LLT S32 = LLT::scalar(32); 2653 2654 if (Ty == S32) { 2655 auto Log = B.buildFLog2(S32, Src0, Flags); 2656 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2657 .addUse(Log.getReg(0)) 2658 .addUse(Src1) 2659 .setMIFlags(Flags); 2660 B.buildFExp2(Dst, Mul, Flags); 2661 } else if (Ty == S16) { 2662 // There's no f16 fmul_legacy, so we need to convert for it. 2663 auto Log = B.buildFLog2(S16, Src0, Flags); 2664 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2665 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2666 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2667 .addUse(Ext0.getReg(0)) 2668 .addUse(Ext1.getReg(0)) 2669 .setMIFlags(Flags); 2670 2671 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2672 } else 2673 return false; 2674 2675 MI.eraseFromParent(); 2676 return true; 2677 } 2678 2679 // Find a source register, ignoring any possible source modifiers. 2680 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2681 Register ModSrc = OrigSrc; 2682 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2683 ModSrc = SrcFNeg->getOperand(1).getReg(); 2684 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2685 ModSrc = SrcFAbs->getOperand(1).getReg(); 2686 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2687 ModSrc = SrcFAbs->getOperand(1).getReg(); 2688 return ModSrc; 2689 } 2690 2691 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2692 MachineRegisterInfo &MRI, 2693 MachineIRBuilder &B) const { 2694 2695 const LLT S1 = LLT::scalar(1); 2696 const LLT S64 = LLT::scalar(64); 2697 Register Dst = MI.getOperand(0).getReg(); 2698 Register OrigSrc = MI.getOperand(1).getReg(); 2699 unsigned Flags = MI.getFlags(); 2700 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2701 "this should not have been custom lowered"); 2702 2703 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2704 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2705 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2706 // V_FRACT bug is: 2707 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2708 // 2709 // Convert floor(x) to (x - fract(x)) 2710 2711 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2712 .addUse(OrigSrc) 2713 .setMIFlags(Flags); 2714 2715 // Give source modifier matching some assistance before obscuring a foldable 2716 // pattern. 2717 2718 // TODO: We can avoid the neg on the fract? The input sign to fract 2719 // shouldn't matter? 
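  // Worked example: x = -1.25 gives fract(x) = 0.75, so x - fract(x) = -2.0,
  // which matches floor(-1.25).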
2720 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2721 2722 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2723 2724 Register Min = MRI.createGenericVirtualRegister(S64); 2725 2726 // We don't need to concern ourselves with the snan handling difference, so 2727 // use the one which will directly select. 2728 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2729 if (MFI->getMode().IEEE) 2730 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2731 else 2732 B.buildFMinNum(Min, Fract, Const, Flags); 2733 2734 Register CorrectedFract = Min; 2735 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2736 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2737 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2738 } 2739 2740 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2741 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2742 2743 MI.eraseFromParent(); 2744 return true; 2745 } 2746 2747 // Turn an illegal packed v2s16 build vector into bit operations. 2748 // TODO: This should probably be a bitcast action in LegalizerHelper. 2749 bool AMDGPULegalizerInfo::legalizeBuildVector( 2750 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2751 Register Dst = MI.getOperand(0).getReg(); 2752 const LLT S32 = LLT::scalar(32); 2753 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2754 2755 Register Src0 = MI.getOperand(1).getReg(); 2756 Register Src1 = MI.getOperand(2).getReg(); 2757 assert(MRI.getType(Src0) == LLT::scalar(16)); 2758 2759 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2760 B.buildBitcast(Dst, Merge); 2761 2762 MI.eraseFromParent(); 2763 return true; 2764 } 2765 2766 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2767 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2768 // case with a single min instruction instead of a compare+select. 2769 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2770 MachineRegisterInfo &MRI, 2771 MachineIRBuilder &B) const { 2772 Register Dst = MI.getOperand(0).getReg(); 2773 Register Src = MI.getOperand(1).getReg(); 2774 LLT DstTy = MRI.getType(Dst); 2775 LLT SrcTy = MRI.getType(Src); 2776 2777 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2778 ? AMDGPU::G_AMDGPU_FFBH_U32 2779 : AMDGPU::G_AMDGPU_FFBL_B32; 2780 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2781 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2782 2783 MI.eraseFromParent(); 2784 return true; 2785 } 2786 2787 // Check that this is a G_XOR x, -1 2788 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2789 if (MI.getOpcode() != TargetOpcode::G_XOR) 2790 return false; 2791 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2792 return ConstVal && *ConstVal == -1; 2793 } 2794 2795 // Return the use branch instruction, otherwise null if the usage is invalid. 
2796 static MachineInstr * 2797 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2798 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2799 Register CondDef = MI.getOperand(0).getReg(); 2800 if (!MRI.hasOneNonDBGUse(CondDef)) 2801 return nullptr; 2802 2803 MachineBasicBlock *Parent = MI.getParent(); 2804 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2805 2806 if (isNot(MRI, *UseMI)) { 2807 Register NegatedCond = UseMI->getOperand(0).getReg(); 2808 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2809 return nullptr; 2810 2811 // We're deleting the def of this value, so we need to remove it. 2812 eraseInstr(*UseMI, MRI); 2813 2814 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2815 Negated = true; 2816 } 2817 2818 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2819 return nullptr; 2820 2821 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2822 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2823 if (Next == Parent->end()) { 2824 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2825 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2826 return nullptr; 2827 UncondBrTarget = &*NextMBB; 2828 } else { 2829 if (Next->getOpcode() != AMDGPU::G_BR) 2830 return nullptr; 2831 Br = &*Next; 2832 UncondBrTarget = Br->getOperand(0).getMBB(); 2833 } 2834 2835 return UseMI; 2836 } 2837 2838 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2839 const ArgDescriptor *Arg, 2840 const TargetRegisterClass *ArgRC, 2841 LLT ArgTy) const { 2842 MCRegister SrcReg = Arg->getRegister(); 2843 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2844 assert(DstReg.isVirtual() && "Virtual register expected"); 2845 2846 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2847 ArgTy); 2848 if (Arg->isMasked()) { 2849 // TODO: Should we try to emit this once in the entry block? 2850 const LLT S32 = LLT::scalar(32); 2851 const unsigned Mask = Arg->getMask(); 2852 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2853 2854 Register AndMaskSrc = LiveIn; 2855 2856 if (Shift != 0) { 2857 auto ShiftAmt = B.buildConstant(S32, Shift); 2858 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2859 } 2860 2861 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2862 } else { 2863 B.buildCopy(DstReg, LiveIn); 2864 } 2865 2866 return true; 2867 } 2868 2869 bool AMDGPULegalizerInfo::loadInputValue( 2870 Register DstReg, MachineIRBuilder &B, 2871 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2872 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2873 const ArgDescriptor *Arg; 2874 const TargetRegisterClass *ArgRC; 2875 LLT ArgTy; 2876 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2877 2878 if (!Arg) { 2879 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2880 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2881 // case the pointer argument may be missing and we use null. 2882 B.buildConstant(DstReg, 0); 2883 return true; 2884 } 2885 2886 // It's undefined behavior if a function marked with the amdgpu-no-* 2887 // attributes uses the corresponding intrinsic. 
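  // e.g. llvm.amdgcn.workgroup.id.x inside a function marked
  // "amdgpu-no-workgroup-id-x" simply folds to undef here.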
2888 B.buildUndef(DstReg); 2889 return true; 2890 } 2891 2892 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2893 return false; // TODO: Handle these 2894 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2895 } 2896 2897 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2898 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2899 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2900 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2901 return false; 2902 2903 MI.eraseFromParent(); 2904 return true; 2905 } 2906 2907 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2908 MachineRegisterInfo &MRI, 2909 MachineIRBuilder &B) const { 2910 Register Dst = MI.getOperand(0).getReg(); 2911 LLT DstTy = MRI.getType(Dst); 2912 LLT S16 = LLT::scalar(16); 2913 LLT S32 = LLT::scalar(32); 2914 LLT S64 = LLT::scalar(64); 2915 2916 if (DstTy == S16) 2917 return legalizeFDIV16(MI, MRI, B); 2918 if (DstTy == S32) 2919 return legalizeFDIV32(MI, MRI, B); 2920 if (DstTy == S64) 2921 return legalizeFDIV64(MI, MRI, B); 2922 2923 return false; 2924 } 2925 2926 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2927 Register DstDivReg, 2928 Register DstRemReg, 2929 Register X, 2930 Register Y) const { 2931 const LLT S1 = LLT::scalar(1); 2932 const LLT S32 = LLT::scalar(32); 2933 2934 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2935 // algorithm used here. 2936 2937 // Initial estimate of inv(y). 2938 auto FloatY = B.buildUITOFP(S32, Y); 2939 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2940 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2941 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2942 auto Z = B.buildFPTOUI(S32, ScaledY); 2943 2944 // One round of UNR. 2945 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2946 auto NegYZ = B.buildMul(S32, NegY, Z); 2947 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2948 2949 // Quotient/remainder estimate. 2950 auto Q = B.buildUMulH(S32, X, Z); 2951 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2952 2953 // First quotient/remainder refinement. 2954 auto One = B.buildConstant(S32, 1); 2955 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2956 if (DstDivReg) 2957 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2958 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2959 2960 // Second quotient/remainder refinement. 
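  // Each refinement conditionally applies q += 1 and r -= y; e.g. an estimate
  // of q = 1, r = 4 for x = 7, y = 3 is corrected to q = 2, r = 1 by the first
  // step and left unchanged by the second.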
2961 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2962 if (DstDivReg) 2963 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2964 2965 if (DstRemReg) 2966 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2967 } 2968 2969 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2970 // 2971 // Return lo, hi of result 2972 // 2973 // %cvt.lo = G_UITOFP Val.lo 2974 // %cvt.hi = G_UITOFP Val.hi 2975 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2976 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2977 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2978 // %mul2 = G_FMUL %mul1, 2**(-32) 2979 // %trunc = G_INTRINSIC_TRUNC %mul2 2980 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2981 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2982 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2983 Register Val) { 2984 const LLT S32 = LLT::scalar(32); 2985 auto Unmerge = B.buildUnmerge(S32, Val); 2986 2987 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2988 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2989 2990 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2991 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2992 2993 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2994 auto Mul1 = 2995 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2996 2997 // 2**(-32) 2998 auto Mul2 = 2999 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3000 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3001 3002 // -(2**32) 3003 auto Mad2 = B.buildFMAD(S32, Trunc, 3004 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3005 3006 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3007 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3008 3009 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3010 } 3011 3012 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3013 Register DstDivReg, 3014 Register DstRemReg, 3015 Register Numer, 3016 Register Denom) const { 3017 const LLT S32 = LLT::scalar(32); 3018 const LLT S64 = LLT::scalar(64); 3019 const LLT S1 = LLT::scalar(1); 3020 Register RcpLo, RcpHi; 3021 3022 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3023 3024 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3025 3026 auto Zero64 = B.buildConstant(S64, 0); 3027 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3028 3029 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3030 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3031 3032 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3033 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3034 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3035 3036 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3037 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3038 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3039 3040 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3041 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3042 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3043 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3044 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3045 3046 auto Zero32 = B.buildConstant(S32, 0); 3047 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3048 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3049 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3050 3051 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3052 Register NumerLo = UnmergeNumer.getReg(0); 3053 Register NumerHi = UnmergeNumer.getReg(1); 3054 3055 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3056 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3057 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3058 Register Mul3_Lo = UnmergeMul3.getReg(0); 3059 Register Mul3_Hi = UnmergeMul3.getReg(1); 3060 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3061 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3062 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3063 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3064 3065 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3066 Register DenomLo = UnmergeDenom.getReg(0); 3067 Register DenomHi = UnmergeDenom.getReg(1); 3068 3069 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3070 auto C1 = B.buildSExt(S32, CmpHi); 3071 3072 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3073 auto C2 = B.buildSExt(S32, CmpLo); 3074 3075 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3076 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3077 3078 // TODO: Here and below portions of the code can be enclosed into if/endif. 3079 // Currently control flow is unconditional and we have 4 selects after 3080 // potential endif to substitute PHIs. 3081 3082 // if C3 != 0 ... 3083 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3084 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3085 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3086 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3087 3088 auto One64 = B.buildConstant(S64, 1); 3089 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3090 3091 auto C4 = 3092 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3093 auto C5 = 3094 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3095 auto C6 = B.buildSelect( 3096 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3097 3098 // if (C6 != 0) 3099 auto Add4 = B.buildAdd(S64, Add3, One64); 3100 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3101 3102 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3103 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3104 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3105 3106 // endif C6 3107 // endif C3 3108 3109 if (DstDivReg) { 3110 auto Sel1 = B.buildSelect( 3111 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3112 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3113 Sel1, MulHi3); 3114 } 3115 3116 if (DstRemReg) { 3117 auto Sel2 = B.buildSelect( 3118 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3119 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3120 Sel2, Sub1); 3121 } 3122 } 3123 3124 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3125 MachineRegisterInfo &MRI, 3126 MachineIRBuilder &B) const { 3127 Register DstDivReg, DstRemReg; 3128 switch (MI.getOpcode()) { 3129 default: 3130 llvm_unreachable("Unexpected opcode!"); 3131 case AMDGPU::G_UDIV: { 3132 DstDivReg = MI.getOperand(0).getReg(); 3133 break; 3134 } 3135 case AMDGPU::G_UREM: { 3136 DstRemReg = MI.getOperand(0).getReg(); 3137 break; 3138 } 3139 case AMDGPU::G_UDIVREM: { 3140 DstDivReg = MI.getOperand(0).getReg(); 3141 DstRemReg = MI.getOperand(1).getReg(); 3142 break; 3143 } 3144 } 3145 3146 const LLT S64 = LLT::scalar(64); 3147 const LLT S32 = LLT::scalar(32); 3148 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3149 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3150 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3151 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3152 3153 if (Ty == S32) 3154 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3155 else if (Ty == S64) 3156 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3157 else 3158 return false; 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3165 MachineRegisterInfo &MRI, 3166 MachineIRBuilder &B) const { 3167 const LLT S64 = LLT::scalar(64); 3168 const LLT S32 = LLT::scalar(32); 3169 3170 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3171 if (Ty != S32 && Ty != S64) 3172 return false; 3173 3174 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3175 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3176 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3177 3178 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3179 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3180 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3181 3182 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3183 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3184 3185 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3186 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3187 3188 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3189 switch (MI.getOpcode()) { 3190 default: 3191 llvm_unreachable("Unexpected opcode!"); 3192 case AMDGPU::G_SDIV: { 3193 DstDivReg = MI.getOperand(0).getReg(); 3194 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3195 break; 3196 } 3197 case AMDGPU::G_SREM: { 3198 DstRemReg = MI.getOperand(0).getReg(); 3199 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3200 break; 3201 } 3202 case AMDGPU::G_SDIVREM: { 3203 DstDivReg = MI.getOperand(0).getReg(); 3204 DstRemReg = MI.getOperand(1).getReg(); 3205 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3206 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3207 break; 3208 } 3209 } 3210 3211 if (Ty == S32) 3212 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3213 else 3214 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3215 3216 if (DstDivReg) { 3217 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3218 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3219 B.buildSub(DstDivReg, SignXor, Sign); 3220 } 3221 3222 if (DstRemReg) { 3223 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3224 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3225 B.buildSub(DstRemReg, SignXor, Sign); 3226 } 3227 3228 MI.eraseFromParent(); 3229 return true; 3230 } 3231 3232 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3233 MachineRegisterInfo &MRI, 3234 MachineIRBuilder &B) const { 3235 Register Res = MI.getOperand(0).getReg(); 3236 Register LHS = MI.getOperand(1).getReg(); 3237 Register RHS = MI.getOperand(2).getReg(); 3238 uint16_t Flags = MI.getFlags(); 3239 LLT ResTy = MRI.getType(Res); 3240 3241 const MachineFunction &MF = B.getMF(); 3242 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3243 MI.getFlag(MachineInstr::FmAfn); 3244 3245 if (!AllowInaccurateRcp) 3246 return false; 3247 3248 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3249 // 1 / x -> RCP(x) 3250 if (CLHS->isExactlyValue(1.0)) { 3251 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3252 .addUse(RHS) 3253 .setMIFlags(Flags); 3254 3255 MI.eraseFromParent(); 3256 return true; 3257 } 3258 3259 // -1 / x -> RCP( FNEG(x) ) 3260 if (CLHS->isExactlyValue(-1.0)) { 3261 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
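      // The explicit fneg is expected to fold into a source modifier on the
      // rcp during selection.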
3262 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3263 .addUse(FNeg.getReg(0)) 3264 .setMIFlags(Flags); 3265 3266 MI.eraseFromParent(); 3267 return true; 3268 } 3269 } 3270 3271 // x / y -> x * (1.0 / y) 3272 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3273 .addUse(RHS) 3274 .setMIFlags(Flags); 3275 B.buildFMul(Res, LHS, RCP, Flags); 3276 3277 MI.eraseFromParent(); 3278 return true; 3279 } 3280 3281 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3282 MachineRegisterInfo &MRI, 3283 MachineIRBuilder &B) const { 3284 Register Res = MI.getOperand(0).getReg(); 3285 Register X = MI.getOperand(1).getReg(); 3286 Register Y = MI.getOperand(2).getReg(); 3287 uint16_t Flags = MI.getFlags(); 3288 LLT ResTy = MRI.getType(Res); 3289 3290 const MachineFunction &MF = B.getMF(); 3291 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3292 MI.getFlag(MachineInstr::FmAfn); 3293 3294 if (!AllowInaccurateRcp) 3295 return false; 3296 3297 auto NegY = B.buildFNeg(ResTy, Y); 3298 auto One = B.buildFConstant(ResTy, 1.0); 3299 3300 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3301 .addUse(Y) 3302 .setMIFlags(Flags); 3303 3304 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3305 R = B.buildFMA(ResTy, Tmp0, R, R); 3306 3307 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3308 R = B.buildFMA(ResTy, Tmp1, R, R); 3309 3310 auto Ret = B.buildFMul(ResTy, X, R); 3311 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3312 3313 B.buildFMA(Res, Tmp2, R, Ret); 3314 MI.eraseFromParent(); 3315 return true; 3316 } 3317 3318 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3319 MachineRegisterInfo &MRI, 3320 MachineIRBuilder &B) const { 3321 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3322 return true; 3323 3324 Register Res = MI.getOperand(0).getReg(); 3325 Register LHS = MI.getOperand(1).getReg(); 3326 Register RHS = MI.getOperand(2).getReg(); 3327 3328 uint16_t Flags = MI.getFlags(); 3329 3330 LLT S16 = LLT::scalar(16); 3331 LLT S32 = LLT::scalar(32); 3332 3333 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3334 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3335 3336 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3337 .addUse(RHSExt.getReg(0)) 3338 .setMIFlags(Flags); 3339 3340 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3341 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3342 3343 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3344 .addUse(RDst.getReg(0)) 3345 .addUse(RHS) 3346 .addUse(LHS) 3347 .setMIFlags(Flags); 3348 3349 MI.eraseFromParent(); 3350 return true; 3351 } 3352 3353 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3354 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3355 static void toggleSPDenormMode(bool Enable, 3356 MachineIRBuilder &B, 3357 const GCNSubtarget &ST, 3358 AMDGPU::SIModeRegisterDefaults Mode) { 3359 // Set SP denorm mode to this value. 3360 unsigned SPDenormMode = 3361 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3362 3363 if (ST.hasDenormModeInst()) { 3364 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3365 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3366 3367 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3368 B.buildInstr(AMDGPU::S_DENORM_MODE) 3369 .addImm(NewDenormModeValue); 3370 3371 } else { 3372 // Select FP32 bit field in mode register. 
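    // This encodes hwreg(HW_REG_MODE, 4, 2): the two bits at offset 4 in the
    // MODE register control FP32 (SP) denormal handling.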
3373 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3374 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3375 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3376 3377 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3378 .addImm(SPDenormMode) 3379 .addImm(SPDenormModeBitField); 3380 } 3381 } 3382 3383 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3384 MachineRegisterInfo &MRI, 3385 MachineIRBuilder &B) const { 3386 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3387 return true; 3388 3389 Register Res = MI.getOperand(0).getReg(); 3390 Register LHS = MI.getOperand(1).getReg(); 3391 Register RHS = MI.getOperand(2).getReg(); 3392 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3393 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3394 3395 uint16_t Flags = MI.getFlags(); 3396 3397 LLT S32 = LLT::scalar(32); 3398 LLT S1 = LLT::scalar(1); 3399 3400 auto One = B.buildFConstant(S32, 1.0f); 3401 3402 auto DenominatorScaled = 3403 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3404 .addUse(LHS) 3405 .addUse(RHS) 3406 .addImm(0) 3407 .setMIFlags(Flags); 3408 auto NumeratorScaled = 3409 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3410 .addUse(LHS) 3411 .addUse(RHS) 3412 .addImm(1) 3413 .setMIFlags(Flags); 3414 3415 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3416 .addUse(DenominatorScaled.getReg(0)) 3417 .setMIFlags(Flags); 3418 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3419 3420 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3421 // aren't modeled as reading it. 3422 if (!Mode.allFP32Denormals()) 3423 toggleSPDenormMode(true, B, ST, Mode); 3424 3425 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3426 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3427 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3428 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3429 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3430 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3431 3432 if (!Mode.allFP32Denormals()) 3433 toggleSPDenormMode(false, B, ST, Mode); 3434 3435 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3436 .addUse(Fma4.getReg(0)) 3437 .addUse(Fma1.getReg(0)) 3438 .addUse(Fma3.getReg(0)) 3439 .addUse(NumeratorScaled.getReg(1)) 3440 .setMIFlags(Flags); 3441 3442 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3443 .addUse(Fmas.getReg(0)) 3444 .addUse(RHS) 3445 .addUse(LHS) 3446 .setMIFlags(Flags); 3447 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3453 MachineRegisterInfo &MRI, 3454 MachineIRBuilder &B) const { 3455 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3456 return true; 3457 3458 Register Res = MI.getOperand(0).getReg(); 3459 Register LHS = MI.getOperand(1).getReg(); 3460 Register RHS = MI.getOperand(2).getReg(); 3461 3462 uint16_t Flags = MI.getFlags(); 3463 3464 LLT S64 = LLT::scalar(64); 3465 LLT S1 = LLT::scalar(1); 3466 3467 auto One = B.buildFConstant(S64, 1.0); 3468 3469 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3470 .addUse(LHS) 3471 .addUse(RHS) 3472 .addImm(0) 3473 .setMIFlags(Flags); 3474 3475 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3476 3477 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3478 .addUse(DivScale0.getReg(0)) 3479 .setMIFlags(Flags); 3480 3481 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3482 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3483 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3484 3485 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3486 .addUse(LHS) 3487 .addUse(RHS) 3488 .addImm(1) 3489 .setMIFlags(Flags); 3490 3491 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3492 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3493 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3494 3495 Register Scale; 3496 if (!ST.hasUsableDivScaleConditionOutput()) { 3497 // Workaround a hardware bug on SI where the condition output from div_scale 3498 // is not usable. 3499 3500 LLT S32 = LLT::scalar(32); 3501 3502 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3503 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3504 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3505 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3506 3507 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3508 Scale1Unmerge.getReg(1)); 3509 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3510 Scale0Unmerge.getReg(1)); 3511 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3512 } else { 3513 Scale = DivScale1.getReg(1); 3514 } 3515 3516 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3517 .addUse(Fma4.getReg(0)) 3518 .addUse(Fma3.getReg(0)) 3519 .addUse(Mul.getReg(0)) 3520 .addUse(Scale) 3521 .setMIFlags(Flags); 3522 3523 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3524 .addUse(Fmas.getReg(0)) 3525 .addUse(RHS) 3526 .addUse(LHS) 3527 .setMIFlags(Flags); 3528 3529 MI.eraseFromParent(); 3530 return true; 3531 } 3532 3533 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3534 MachineRegisterInfo &MRI, 3535 MachineIRBuilder &B) const { 3536 Register Res = MI.getOperand(0).getReg(); 3537 Register LHS = MI.getOperand(2).getReg(); 3538 Register RHS = MI.getOperand(3).getReg(); 3539 uint16_t Flags = MI.getFlags(); 3540 3541 LLT S32 = LLT::scalar(32); 3542 LLT S1 = LLT::scalar(1); 3543 3544 auto Abs = B.buildFAbs(S32, RHS, Flags); 3545 const APFloat C0Val(1.0f); 3546 3547 auto C0 = B.buildConstant(S32, 0x6f800000); 3548 auto C1 = B.buildConstant(S32, 0x2f800000); 3549 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3550 3551 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3552 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3553 3554 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3555 3556 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3557 .addUse(Mul0.getReg(0)) 3558 .setMIFlags(Flags); 3559 3560 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3561 3562 B.buildFMul(Res, Sel, Mul1, Flags); 3563 3564 MI.eraseFromParent(); 3565 return true; 3566 } 3567 3568 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3569 // FIXME: Why do we handle this one but not other removed instructions? 3570 // 3571 // Reciprocal square root. The clamp prevents infinite results, clamping 3572 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3573 // +-max_float. 
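// The expansion below emits amdgcn.rsq and then clamps the result into
// [-max_float, +max_float] using fminnum/fmaxnum (or their IEEE variants,
// depending on the function's FP mode).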
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq result is already quieted (or not), so use the variant
  // which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
3644 Observer.changedInstr(MI); 3645 return true; 3646 } 3647 3648 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3649 MachineRegisterInfo &MRI, 3650 MachineIRBuilder &B) const { 3651 uint64_t Offset = 3652 ST.getTargetLowering()->getImplicitParameterOffset( 3653 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3654 LLT DstTy = MRI.getType(DstReg); 3655 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3656 3657 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3658 if (!loadInputValue(KernargPtrReg, B, 3659 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3660 return false; 3661 3662 // FIXME: This should be nuw 3663 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3664 return true; 3665 } 3666 3667 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3668 MachineRegisterInfo &MRI, 3669 MachineIRBuilder &B) const { 3670 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3671 if (!MFI->isEntryFunction()) { 3672 return legalizePreloadedArgIntrin(MI, MRI, B, 3673 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3674 } 3675 3676 Register DstReg = MI.getOperand(0).getReg(); 3677 if (!getImplicitArgPtr(DstReg, MRI, B)) 3678 return false; 3679 3680 MI.eraseFromParent(); 3681 return true; 3682 } 3683 3684 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3685 MachineRegisterInfo &MRI, 3686 MachineIRBuilder &B, 3687 unsigned AddrSpace) const { 3688 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3689 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3690 Register Hi32 = Unmerge.getReg(1); 3691 3692 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3693 MI.eraseFromParent(); 3694 return true; 3695 } 3696 3697 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3698 // offset (the offset that is included in bounds checking and swizzling, to be 3699 // split between the instruction's voffset and immoffset fields) and soffset 3700 // (the offset that is excluded from bounds checking and swizzling, to go in 3701 // the instruction's soffset field). This function takes the first kind of 3702 // offset and figures out how to split it between voffset and immoffset. 3703 std::pair<Register, unsigned> 3704 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3705 Register OrigOffset) const { 3706 const unsigned MaxImm = 4095; 3707 Register BaseReg; 3708 unsigned ImmOffset; 3709 const LLT S32 = LLT::scalar(32); 3710 MachineRegisterInfo &MRI = *B.getMRI(); 3711 3712 std::tie(BaseReg, ImmOffset) = 3713 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3714 3715 // If BaseReg is a pointer, convert it to int. 3716 if (MRI.getType(BaseReg).isPointer()) 3717 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3718 3719 // If the immediate value is too big for the immoffset field, put the value 3720 // and -4096 into the immoffset field so that the value that is copied/added 3721 // for the voffset field is a multiple of 4096, and it stands more chance 3722 // of being CSEd with the copy/add for another similar load/store. 3723 // However, do not do that rounding down to a multiple of 4096 if that is a 3724 // negative number, as it appears to be illegal to have a negative offset 3725 // in the vgpr, even if adding the immediate offset makes it positive. 
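  // For example, with MaxImm == 4095 a constant offset of 4100 is split into
  // Overflow = 4096 (added to the voffset) and ImmOffset = 4 (the immediate
  // field), keeping the voffset contribution a multiple of 4096 as described
  // above.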
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}

/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
                                          Register VOffset, Register SOffset,
                                          unsigned ImmOffset, Register VIndex,
                                          MachineRegisterInfo &MRI) const {
  Optional<ValueAndVReg> MaybeVOffsetVal =
      getIConstantVRegValWithLookThrough(VOffset, MRI);
  Optional<ValueAndVReg> MaybeSOffsetVal =
      getIConstantVRegValWithLookThrough(SOffset, MRI);
  Optional<ValueAndVReg> MaybeVIndexVal =
      getIConstantVRegValWithLookThrough(VIndex, MRI);
  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
  // update the MMO with that offset. The stride is unknown so we can only do
  // this if VIndex is constant 0.
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
    MMO->setOffset(TotalOffset);
  } else {
    // We don't have a constant combined offset to use in the MMO. Give up.
    MMO->setValue((Value *)nullptr);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
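/// On subtargets with unpacked D16 VMEM, each 16-bit element is widened to
/// occupy the low half of a 32-bit register; otherwise the elements stay
/// packed two per dword, with extra undef padding for the odd-sized cases
/// handled below.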
3774 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3775 MachineRegisterInfo &MRI, 3776 Register Reg, 3777 bool ImageStore) const { 3778 const LLT S16 = LLT::scalar(16); 3779 const LLT S32 = LLT::scalar(32); 3780 LLT StoreVT = MRI.getType(Reg); 3781 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3782 3783 if (ST.hasUnpackedD16VMem()) { 3784 auto Unmerge = B.buildUnmerge(S16, Reg); 3785 3786 SmallVector<Register, 4> WideRegs; 3787 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3788 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3789 3790 int NumElts = StoreVT.getNumElements(); 3791 3792 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3793 .getReg(0); 3794 } 3795 3796 if (ImageStore && ST.hasImageStoreD16Bug()) { 3797 if (StoreVT.getNumElements() == 2) { 3798 SmallVector<Register, 4> PackedRegs; 3799 Reg = B.buildBitcast(S32, Reg).getReg(0); 3800 PackedRegs.push_back(Reg); 3801 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3802 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3803 .getReg(0); 3804 } 3805 3806 if (StoreVT.getNumElements() == 3) { 3807 SmallVector<Register, 4> PackedRegs; 3808 auto Unmerge = B.buildUnmerge(S16, Reg); 3809 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3810 PackedRegs.push_back(Unmerge.getReg(I)); 3811 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3812 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3813 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3814 } 3815 3816 if (StoreVT.getNumElements() == 4) { 3817 SmallVector<Register, 4> PackedRegs; 3818 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3819 auto Unmerge = B.buildUnmerge(S32, Reg); 3820 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3821 PackedRegs.push_back(Unmerge.getReg(I)); 3822 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3823 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3824 .getReg(0); 3825 } 3826 3827 llvm_unreachable("invalid data type"); 3828 } 3829 3830 if (StoreVT == LLT::fixed_vector(3, S16)) { 3831 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3832 .getReg(0); 3833 } 3834 return Reg; 3835 } 3836 3837 Register AMDGPULegalizerInfo::fixStoreSourceType( 3838 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3839 MachineRegisterInfo *MRI = B.getMRI(); 3840 LLT Ty = MRI->getType(VData); 3841 3842 const LLT S16 = LLT::scalar(16); 3843 3844 // Fixup illegal register types for i8 stores. 
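  // 8 and 16-bit scalar values are handed to the buffer instruction in a full
  // 32-bit VGPR; the store opcode chosen later (BYTE/SHORT) still encodes the
  // real memory size.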
3845 if (Ty == LLT::scalar(8) || Ty == S16) { 3846 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3847 return AnyExt; 3848 } 3849 3850 if (Ty.isVector()) { 3851 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3852 if (IsFormat) 3853 return handleD16VData(B, *MRI, VData); 3854 } 3855 } 3856 3857 return VData; 3858 } 3859 3860 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3861 MachineRegisterInfo &MRI, 3862 MachineIRBuilder &B, 3863 bool IsTyped, 3864 bool IsFormat) const { 3865 Register VData = MI.getOperand(1).getReg(); 3866 LLT Ty = MRI.getType(VData); 3867 LLT EltTy = Ty.getScalarType(); 3868 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3869 const LLT S32 = LLT::scalar(32); 3870 3871 VData = fixStoreSourceType(B, VData, IsFormat); 3872 Register RSrc = MI.getOperand(2).getReg(); 3873 3874 MachineMemOperand *MMO = *MI.memoperands_begin(); 3875 const int MemSize = MMO->getSize(); 3876 3877 unsigned ImmOffset; 3878 3879 // The typed intrinsics add an immediate after the registers. 3880 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3881 3882 // The struct intrinsic variants add one additional operand over raw. 3883 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3884 Register VIndex; 3885 int OpOffset = 0; 3886 if (HasVIndex) { 3887 VIndex = MI.getOperand(3).getReg(); 3888 OpOffset = 1; 3889 } else { 3890 VIndex = B.buildConstant(S32, 0).getReg(0); 3891 } 3892 3893 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3894 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3895 3896 unsigned Format = 0; 3897 if (IsTyped) { 3898 Format = MI.getOperand(5 + OpOffset).getImm(); 3899 ++OpOffset; 3900 } 3901 3902 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3903 3904 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3905 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3906 3907 unsigned Opc; 3908 if (IsTyped) { 3909 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3910 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3911 } else if (IsFormat) { 3912 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3913 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3914 } else { 3915 switch (MemSize) { 3916 case 1: 3917 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3918 break; 3919 case 2: 3920 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3921 break; 3922 default: 3923 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3924 break; 3925 } 3926 } 3927 3928 auto MIB = B.buildInstr(Opc) 3929 .addUse(VData) // vdata 3930 .addUse(RSrc) // rsrc 3931 .addUse(VIndex) // vindex 3932 .addUse(VOffset) // voffset 3933 .addUse(SOffset) // soffset 3934 .addImm(ImmOffset); // offset(imm) 3935 3936 if (IsTyped) 3937 MIB.addImm(Format); 3938 3939 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3940 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3941 .addMemOperand(MMO); 3942 3943 MI.eraseFromParent(); 3944 return true; 3945 } 3946 3947 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3948 MachineRegisterInfo &MRI, 3949 MachineIRBuilder &B, 3950 bool IsFormat, 3951 bool IsTyped) const { 3952 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3953 MachineMemOperand *MMO = *MI.memoperands_begin(); 3954 const LLT MemTy = MMO->getMemoryType(); 3955 const LLT S32 = LLT::scalar(32); 3956 3957 Register Dst = MI.getOperand(0).getReg(); 3958 Register RSrc = MI.getOperand(2).getReg(); 3959 3960 // The typed intrinsics add an immediate after the registers. 
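  // As in the store case above, the total operand count distinguishes the raw
  // from the struct form: the struct variants carry an extra vindex operand,
  // so compare against the expected count to decide whether one is present.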
3961 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3962 3963 // The struct intrinsic variants add one additional operand over raw. 3964 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3965 Register VIndex; 3966 int OpOffset = 0; 3967 if (HasVIndex) { 3968 VIndex = MI.getOperand(3).getReg(); 3969 OpOffset = 1; 3970 } else { 3971 VIndex = B.buildConstant(S32, 0).getReg(0); 3972 } 3973 3974 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3975 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3976 3977 unsigned Format = 0; 3978 if (IsTyped) { 3979 Format = MI.getOperand(5 + OpOffset).getImm(); 3980 ++OpOffset; 3981 } 3982 3983 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3984 unsigned ImmOffset; 3985 3986 LLT Ty = MRI.getType(Dst); 3987 LLT EltTy = Ty.getScalarType(); 3988 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3989 const bool Unpacked = ST.hasUnpackedD16VMem(); 3990 3991 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3992 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3993 3994 unsigned Opc; 3995 3996 if (IsTyped) { 3997 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3998 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3999 } else if (IsFormat) { 4000 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4001 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4002 } else { 4003 switch (MemTy.getSizeInBits()) { 4004 case 8: 4005 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4006 break; 4007 case 16: 4008 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4009 break; 4010 default: 4011 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4012 break; 4013 } 4014 } 4015 4016 Register LoadDstReg; 4017 4018 bool IsExtLoad = 4019 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4020 LLT UnpackedTy = Ty.changeElementSize(32); 4021 4022 if (IsExtLoad) 4023 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4024 else if (Unpacked && IsD16 && Ty.isVector()) 4025 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4026 else 4027 LoadDstReg = Dst; 4028 4029 auto MIB = B.buildInstr(Opc) 4030 .addDef(LoadDstReg) // vdata 4031 .addUse(RSrc) // rsrc 4032 .addUse(VIndex) // vindex 4033 .addUse(VOffset) // voffset 4034 .addUse(SOffset) // soffset 4035 .addImm(ImmOffset); // offset(imm) 4036 4037 if (IsTyped) 4038 MIB.addImm(Format); 4039 4040 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4041 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4042 .addMemOperand(MMO); 4043 4044 if (LoadDstReg != Dst) { 4045 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4046 4047 // Widen result for extending loads was widened. 4048 if (IsExtLoad) 4049 B.buildTrunc(Dst, LoadDstReg); 4050 else { 4051 // Repack to original 16-bit vector result 4052 // FIXME: G_TRUNC should work, but legalization currently fails 4053 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4054 SmallVector<Register, 4> Repack; 4055 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4056 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4057 B.buildMerge(Dst, Repack); 4058 } 4059 } 4060 4061 MI.eraseFromParent(); 4062 return true; 4063 } 4064 4065 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4066 MachineIRBuilder &B, 4067 bool IsInc) const { 4068 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4069 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4070 B.buildInstr(Opc) 4071 .addDef(MI.getOperand(0).getReg()) 4072 .addUse(MI.getOperand(2).getReg()) 4073 .addUse(MI.getOperand(3).getReg()) 4074 .cloneMemRefs(MI); 4075 MI.eraseFromParent(); 4076 return true; 4077 } 4078 4079 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4080 switch (IntrID) { 4081 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4082 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4083 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4084 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4085 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4086 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4087 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4088 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4090 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4091 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4092 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4093 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4094 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4095 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4096 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4097 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4098 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4099 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4100 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4101 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4102 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4103 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4105 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4106 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4107 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4108 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4109 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4110 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4111 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4112 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4113 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4114 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4115 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4116 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4117 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4118 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4120 case Intrinsic::amdgcn_buffer_atomic_fadd: 4121 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4122 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4124 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4125 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4127 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4128 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4130 default: 4131 llvm_unreachable("unhandled atomic opcode"); 4132 } 4133 } 4134 4135 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4136 MachineIRBuilder &B, 4137 Intrinsic::ID IID) const { 4138 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4139 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4140 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4141 4142 Register Dst; 4143 4144 int OpOffset = 0; 4145 if (HasReturn) { 4146 // A few FP atomics do not support return values. 
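    // When the return value is absent, the intrinsic has one fewer leading
    // operand; the OpOffset = -1 in the else branch below shifts all later
    // operand indices to compensate.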
4147 Dst = MI.getOperand(0).getReg(); 4148 } else { 4149 OpOffset = -1; 4150 } 4151 4152 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4153 Register CmpVal; 4154 4155 if (IsCmpSwap) { 4156 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4157 ++OpOffset; 4158 } 4159 4160 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4161 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4162 4163 // The struct intrinsic variants add one additional operand over raw. 4164 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4165 Register VIndex; 4166 if (HasVIndex) { 4167 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4168 ++OpOffset; 4169 } else { 4170 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4171 } 4172 4173 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4174 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4175 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4176 4177 MachineMemOperand *MMO = *MI.memoperands_begin(); 4178 4179 unsigned ImmOffset; 4180 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4181 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4182 4183 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4184 4185 if (HasReturn) 4186 MIB.addDef(Dst); 4187 4188 MIB.addUse(VData); // vdata 4189 4190 if (IsCmpSwap) 4191 MIB.addReg(CmpVal); 4192 4193 MIB.addUse(RSrc) // rsrc 4194 .addUse(VIndex) // vindex 4195 .addUse(VOffset) // voffset 4196 .addUse(SOffset) // soffset 4197 .addImm(ImmOffset) // offset(imm) 4198 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4199 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4200 .addMemOperand(MMO); 4201 4202 MI.eraseFromParent(); 4203 return true; 4204 } 4205 4206 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4207 /// vector with s16 typed elements. 4208 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4209 SmallVectorImpl<Register> &PackedAddrs, 4210 unsigned ArgOffset, 4211 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4212 bool IsA16, bool IsG16) { 4213 const LLT S16 = LLT::scalar(16); 4214 const LLT V2S16 = LLT::fixed_vector(2, 16); 4215 auto EndIdx = Intr->VAddrEnd; 4216 4217 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4218 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4219 if (!SrcOp.isReg()) 4220 continue; // _L to _LZ may have eliminated this. 4221 4222 Register AddrReg = SrcOp.getReg(); 4223 4224 if ((I < Intr->GradientStart) || 4225 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4226 (I >= Intr->CoordStart && !IsA16)) { 4227 // Handle any gradient or coordinate operands that should not be packed 4228 if ((I < Intr->GradientStart) && IsA16 && 4229 (B.getMRI()->getType(AddrReg) == S16)) { 4230 // Special handling of bias when A16 is on. Bias is of type half but 4231 // occupies full 32-bit. 4232 PackedAddrs.push_back( 4233 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4234 .getReg(0)); 4235 } else { 4236 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4237 PackedAddrs.push_back(AddrReg); 4238 } 4239 } else { 4240 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4241 // derivatives dx/dh and dx/dv are packed with undef. 
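      // The checks below decide whether this component can share a dword with
      // the following operand, or must instead be paired with undef to keep
      // the packed layout aligned.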
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
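  // The rewrite below proceeds in stages: inspect the dmask, switch the
  // intrinsic to the generic IMAGE_LOAD/IMAGE_STORE pseudo, pack 16-bit
  // address and gradient operands when A16/G16 apply, and finally repack the
  // D16/TFE result registers into the type the IR expects.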
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
    const ConstantFP *ConstantLod;

    if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
                 m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     Intr->Dim);

        // The starting indexes should remain in the same place.
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
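    // Stores only need the d16 data repacking below; the result register
    // rewriting later in this function applies to loads.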
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
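  // This is done by unmerging the wide load result into RegTy pieces, dropping
  // the trailing TFE dword if present, and then truncating / bitcasting /
  // concatenating the pieces back into the original result type.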
4605 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4606 4607 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4608 4609 if (ResultNumRegs == 1) { 4610 assert(!IsTFE); 4611 ResultRegs[0] = NewResultReg; 4612 } else { 4613 // We have to repack into a new vector of some kind. 4614 for (int I = 0; I != NumDataRegs; ++I) 4615 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4616 B.buildUnmerge(ResultRegs, NewResultReg); 4617 4618 // Drop the final TFE element to get the data part. The TFE result is 4619 // directly written to the right place already. 4620 if (IsTFE) 4621 ResultRegs.resize(NumDataRegs); 4622 } 4623 4624 // For an s16 scalar result, we form an s32 result with a truncate regardless 4625 // of packed vs. unpacked. 4626 if (IsD16 && !Ty.isVector()) { 4627 B.buildTrunc(DstReg, ResultRegs[0]); 4628 return true; 4629 } 4630 4631 // Avoid a build/concat_vector of 1 entry. 4632 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4633 B.buildBitcast(DstReg, ResultRegs[0]); 4634 return true; 4635 } 4636 4637 assert(Ty.isVector()); 4638 4639 if (IsD16) { 4640 // For packed D16 results with TFE enabled, all the data components are 4641 // S32. Cast back to the expected type. 4642 // 4643 // TODO: We don't really need to use load s32 elements. We would only need one 4644 // cast for the TFE result if a multiple of v2s16 was used. 4645 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4646 for (Register &Reg : ResultRegs) 4647 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4648 } else if (ST.hasUnpackedD16VMem()) { 4649 for (Register &Reg : ResultRegs) 4650 Reg = B.buildTrunc(S16, Reg).getReg(0); 4651 } 4652 } 4653 4654 auto padWithUndef = [&](LLT Ty, int NumElts) { 4655 if (NumElts == 0) 4656 return; 4657 Register Undef = B.buildUndef(Ty).getReg(0); 4658 for (int I = 0; I != NumElts; ++I) 4659 ResultRegs.push_back(Undef); 4660 }; 4661 4662 // Pad out any elements eliminated due to the dmask. 4663 LLT ResTy = MRI->getType(ResultRegs[0]); 4664 if (!ResTy.isVector()) { 4665 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4666 B.buildBuildVector(DstReg, ResultRegs); 4667 return true; 4668 } 4669 4670 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4671 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4672 4673 // Deal with the one annoying legal case. 
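  // <3 x s16> results cover 48 bits but the loaded data is always a whole
  // number of dwords, so reconcile the two by padding with undef elements or
  // dropping the trailing elements as appropriate.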
4674 const LLT V3S16 = LLT::fixed_vector(3, 16); 4675 if (Ty == V3S16) { 4676 if (IsTFE) { 4677 if (ResultRegs.size() == 1) { 4678 NewResultReg = ResultRegs[0]; 4679 } else if (ResultRegs.size() == 2) { 4680 LLT V4S16 = LLT::fixed_vector(4, 16); 4681 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4682 } else { 4683 return false; 4684 } 4685 } 4686 4687 if (MRI->getType(DstReg).getNumElements() < 4688 MRI->getType(NewResultReg).getNumElements()) { 4689 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4690 } else { 4691 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4692 } 4693 return true; 4694 } 4695 4696 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4697 B.buildConcatVectors(DstReg, ResultRegs); 4698 return true; 4699 } 4700 4701 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4702 LegalizerHelper &Helper, MachineInstr &MI) const { 4703 MachineIRBuilder &B = Helper.MIRBuilder; 4704 GISelChangeObserver &Observer = Helper.Observer; 4705 4706 Register Dst = MI.getOperand(0).getReg(); 4707 LLT Ty = B.getMRI()->getType(Dst); 4708 unsigned Size = Ty.getSizeInBits(); 4709 MachineFunction &MF = B.getMF(); 4710 4711 Observer.changingInstr(MI); 4712 4713 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4714 Ty = getBitcastRegisterType(Ty); 4715 Helper.bitcastDst(MI, Ty, 0); 4716 Dst = MI.getOperand(0).getReg(); 4717 B.setInsertPt(B.getMBB(), MI); 4718 } 4719 4720 // FIXME: We don't really need this intermediate instruction. The intrinsic 4721 // should be fixed to have a memory operand. Since it's readnone, we're not 4722 // allowed to add one. 4723 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4724 MI.RemoveOperand(1); // Remove intrinsic ID 4725 4726 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4727 // TODO: Should this use datalayout alignment? 4728 const unsigned MemSize = (Size + 7) / 8; 4729 const Align MemAlign(4); 4730 MachineMemOperand *MMO = MF.getMachineMemOperand( 4731 MachinePointerInfo(), 4732 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4733 MachineMemOperand::MOInvariant, 4734 MemSize, MemAlign); 4735 MI.addMemOperand(MF, MMO); 4736 4737 // There are no 96-bit result scalar loads, but widening to 128-bit should 4738 // always be legal. We may need to restore this to a 96-bit result if it turns 4739 // out this needs to be converted to a vector load during RegBankSelect. 4740 if (!isPowerOf2_32(Size)) { 4741 if (Ty.isVector()) 4742 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4743 else 4744 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4745 } 4746 4747 Observer.changedInstr(MI); 4748 return true; 4749 } 4750 4751 // TODO: Move to selection 4752 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4753 MachineRegisterInfo &MRI, 4754 MachineIRBuilder &B) const { 4755 if (!ST.isTrapHandlerEnabled() || 4756 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4757 return legalizeTrapEndpgm(MI, MRI, B); 4758 4759 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4760 switch (*HsaAbiVer) { 4761 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4762 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4763 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4764 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4765 return ST.supportsGetDoorbellID() ? 
4766 legalizeTrapHsa(MI, MRI, B) : 4767 legalizeTrapHsaQueuePtr(MI, MRI, B); 4768 } 4769 } 4770 4771 llvm_unreachable("Unknown trap handler"); 4772 } 4773 4774 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4775 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4776 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4777 MI.eraseFromParent(); 4778 return true; 4779 } 4780 4781 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4782 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4783 // Pass queue pointer to trap handler as input, and insert trap instruction 4784 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4785 Register LiveIn = 4786 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4787 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4788 return false; 4789 4790 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4791 B.buildCopy(SGPR01, LiveIn); 4792 B.buildInstr(AMDGPU::S_TRAP) 4793 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4794 .addReg(SGPR01, RegState::Implicit); 4795 4796 MI.eraseFromParent(); 4797 return true; 4798 } 4799 4800 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4801 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4802 B.buildInstr(AMDGPU::S_TRAP) 4803 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4804 MI.eraseFromParent(); 4805 return true; 4806 } 4807 4808 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4809 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4810 // Is non-HSA path or trap-handler disabled? Then, report a warning 4811 // accordingly 4812 if (!ST.isTrapHandlerEnabled() || 4813 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4814 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4815 "debugtrap handler not supported", 4816 MI.getDebugLoc(), DS_Warning); 4817 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4818 Ctx.diagnose(NoTrap); 4819 } else { 4820 // Insert debug-trap instruction 4821 B.buildInstr(AMDGPU::S_TRAP) 4822 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4823 } 4824 4825 MI.eraseFromParent(); 4826 return true; 4827 } 4828 4829 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4830 MachineIRBuilder &B) const { 4831 MachineRegisterInfo &MRI = *B.getMRI(); 4832 const LLT S16 = LLT::scalar(16); 4833 const LLT S32 = LLT::scalar(32); 4834 4835 Register DstReg = MI.getOperand(0).getReg(); 4836 Register NodePtr = MI.getOperand(2).getReg(); 4837 Register RayExtent = MI.getOperand(3).getReg(); 4838 Register RayOrigin = MI.getOperand(4).getReg(); 4839 Register RayDir = MI.getOperand(5).getReg(); 4840 Register RayInvDir = MI.getOperand(6).getReg(); 4841 Register TDescr = MI.getOperand(7).getReg(); 4842 4843 if (!ST.hasGFX10_AEncoding()) { 4844 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4845 "intrinsic not supported on subtarget", 4846 MI.getDebugLoc()); 4847 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4848 return false; 4849 } 4850 4851 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4852 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4853 const unsigned NumVDataDwords = 4; 4854 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4855 const bool UseNSA = 4856 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4857 const unsigned BaseOpcodes[2][2] = { 4858 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4859 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4860 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4861 int Opcode; 4862 if (UseNSA) { 4863 Opcode = 4864 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4865 NumVDataDwords, NumVAddrDwords); 4866 } else { 4867 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4868 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4869 PowerOf2Ceil(NumVAddrDwords)); 4870 } 4871 assert(Opcode != -1); 4872 4873 SmallVector<Register, 12> Ops; 4874 if (Is64) { 4875 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4876 Ops.push_back(Unmerge.getReg(0)); 4877 Ops.push_back(Unmerge.getReg(1)); 4878 } else { 4879 Ops.push_back(NodePtr); 4880 } 4881 Ops.push_back(RayExtent); 4882 4883 auto packLanes = [&Ops, &S32, &B](Register Src) { 4884 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4885 Ops.push_back(Unmerge.getReg(0)); 4886 Ops.push_back(Unmerge.getReg(1)); 4887 Ops.push_back(Unmerge.getReg(2)); 4888 }; 4889 4890 packLanes(RayOrigin); 4891 if (IsA16) { 4892 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4893 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4894 Register R1 = MRI.createGenericVirtualRegister(S32); 4895 Register R2 = MRI.createGenericVirtualRegister(S32); 4896 Register R3 = MRI.createGenericVirtualRegister(S32); 4897 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4898 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4899 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4900 Ops.push_back(R1); 4901 Ops.push_back(R2); 4902 Ops.push_back(R3); 4903 } else { 4904 packLanes(RayDir); 4905 packLanes(RayInvDir); 4906 } 4907 4908 if (!UseNSA) { 4909 // Build a single vector containing all the operands so far prepared. 4910 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4911 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4912 Ops.clear(); 4913 Ops.push_back(MergedOps); 4914 } 4915 4916 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4917 .addDef(DstReg) 4918 .addImm(Opcode); 4919 4920 for (Register R : Ops) { 4921 MIB.addUse(R); 4922 } 4923 4924 MIB.addUse(TDescr) 4925 .addImm(IsA16 ? 1 : 0) 4926 .cloneMemRefs(MI); 4927 4928 MI.eraseFromParent(); 4929 return true; 4930 } 4931 4932 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4933 MachineInstr &MI) const { 4934 MachineIRBuilder &B = Helper.MIRBuilder; 4935 MachineRegisterInfo &MRI = *B.getMRI(); 4936 4937 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
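  // For amdgcn_if/else/loop, the G_BRCOND consuming the intrinsic's boolean
  // result is located via verifyCFIntrinsic and then replaced with the
  // SI_IF/SI_ELSE/SI_LOOP pseudos that manipulate exec directly.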
4938 auto IntrID = MI.getIntrinsicID(); 4939 switch (IntrID) { 4940 case Intrinsic::amdgcn_if: 4941 case Intrinsic::amdgcn_else: { 4942 MachineInstr *Br = nullptr; 4943 MachineBasicBlock *UncondBrTarget = nullptr; 4944 bool Negated = false; 4945 if (MachineInstr *BrCond = 4946 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4947 const SIRegisterInfo *TRI 4948 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4949 4950 Register Def = MI.getOperand(1).getReg(); 4951 Register Use = MI.getOperand(3).getReg(); 4952 4953 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4954 4955 if (Negated) 4956 std::swap(CondBrTarget, UncondBrTarget); 4957 4958 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4959 if (IntrID == Intrinsic::amdgcn_if) { 4960 B.buildInstr(AMDGPU::SI_IF) 4961 .addDef(Def) 4962 .addUse(Use) 4963 .addMBB(UncondBrTarget); 4964 } else { 4965 B.buildInstr(AMDGPU::SI_ELSE) 4966 .addDef(Def) 4967 .addUse(Use) 4968 .addMBB(UncondBrTarget); 4969 } 4970 4971 if (Br) { 4972 Br->getOperand(0).setMBB(CondBrTarget); 4973 } else { 4974 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4975 // since we're swapping branch targets it needs to be reinserted. 4976 // FIXME: IRTranslator should probably not do this 4977 B.buildBr(*CondBrTarget); 4978 } 4979 4980 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4981 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4982 MI.eraseFromParent(); 4983 BrCond->eraseFromParent(); 4984 return true; 4985 } 4986 4987 return false; 4988 } 4989 case Intrinsic::amdgcn_loop: { 4990 MachineInstr *Br = nullptr; 4991 MachineBasicBlock *UncondBrTarget = nullptr; 4992 bool Negated = false; 4993 if (MachineInstr *BrCond = 4994 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4995 const SIRegisterInfo *TRI 4996 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4997 4998 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4999 Register Reg = MI.getOperand(2).getReg(); 5000 5001 if (Negated) 5002 std::swap(CondBrTarget, UncondBrTarget); 5003 5004 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5005 B.buildInstr(AMDGPU::SI_LOOP) 5006 .addUse(Reg) 5007 .addMBB(UncondBrTarget); 5008 5009 if (Br) 5010 Br->getOperand(0).setMBB(CondBrTarget); 5011 else 5012 B.buildBr(*CondBrTarget); 5013 5014 MI.eraseFromParent(); 5015 BrCond->eraseFromParent(); 5016 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5017 return true; 5018 } 5019 5020 return false; 5021 } 5022 case Intrinsic::amdgcn_kernarg_segment_ptr: 5023 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5024 // This only makes sense to call in a kernel, so just lower to null. 
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}