//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
      TypeIdx,
      LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
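
// Illustrative examples of the register-type rules above: s96 qualifies
// (three 32-bit pieces), <4 x s16> qualifies (16-bit elements in pairs), but
// <3 x s16> does not, since its 48-bit total size is not a multiple of 32.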

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
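
// Illustrative example of the size limits above: a 256-bit global load is
// within the 512-bit load limit and is accepted here (alignment permitting),
// while a 256-bit global store exceeds the 128-bit store limit and is
// rejected, leaving it to the splitting rules installed in the constructor
// below.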

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
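
// Illustrative example: a 96-bit (<3 x s32>) global load on a subtarget
// without dwordx3 load/store support, but with a known 128-bit alignment, is
// a candidate to be widened to a single 128-bit load here (subject to the
// misaligned-access check above) instead of being split into smaller pieces.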

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);
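
  // Illustrative example: with 16-bit instructions but no packed (VOP3P)
  // math, an s16 G_FADD is legal directly, while a <2 x s16> G_FADD is not in
  // the legal set and is scalarized into two s16 adds by the rules above.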

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));
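
    // Illustrative example: a <8 x s16> global load is not handled directly
    // by the selector, so the bitcast rule above performs the memory access
    // as <4 x s32> (see getBitcastRegisterType) and bitcasts the result back,
    // unless -amdgpu-global-isel-new-legality is enabled.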

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                   .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector(or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
1529 // TODO: Only do this on targets with legal s16 shifts 1530 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1531 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1532 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1533 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1534 elementTypeIs(1, S16)), 1535 changeTo(1, V2S16)) 1536 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1537 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1538 // valid. 1539 .clampScalar(LitTyIdx, S32, S512) 1540 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1541 // Break up vectors with weird elements into scalars 1542 .fewerElementsIf( 1543 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1544 scalarize(0)) 1545 .fewerElementsIf( 1546 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1547 scalarize(1)) 1548 .clampScalar(BigTyIdx, S32, MaxScalar); 1549 1550 if (Op == G_MERGE_VALUES) { 1551 Builder.widenScalarIf( 1552 // TODO: Use 16-bit shifts if legal for 8-bit values? 1553 [=](const LegalityQuery &Query) { 1554 const LLT Ty = Query.Types[LitTyIdx]; 1555 return Ty.getSizeInBits() < 32; 1556 }, 1557 changeTo(LitTyIdx, S32)); 1558 } 1559 1560 Builder.widenScalarIf( 1561 [=](const LegalityQuery &Query) { 1562 const LLT Ty = Query.Types[BigTyIdx]; 1563 return !isPowerOf2_32(Ty.getSizeInBits()) && 1564 Ty.getSizeInBits() % 16 != 0; 1565 }, 1566 [=](const LegalityQuery &Query) { 1567 // Pick the next power of 2, or a multiple of 64 over 128. 1568 // Whichever is smaller. 1569 const LLT &Ty = Query.Types[BigTyIdx]; 1570 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1571 if (NewSizeInBits >= 256) { 1572 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1573 if (RoundedTo < NewSizeInBits) 1574 NewSizeInBits = RoundedTo; 1575 } 1576 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1577 }) 1578 // Any vectors left are the wrong size. Scalarize them. 1579 .scalarize(0) 1580 .scalarize(1); 1581 } 1582 1583 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1584 // RegBankSelect. 1585 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1586 .legalFor({{S32}, {S64}}); 1587 1588 if (ST.hasVOP3PInsts()) { 1589 SextInReg.lowerFor({{V2S16}}) 1590 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1591 // get more vector shift opportunities, since we'll get those when 1592 // expanded. 1593 .clampMaxNumElementsStrict(0, S16, 2); 1594 } else if (ST.has16BitInsts()) { 1595 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1596 } else { 1597 // Prefer to promote to s32 before lowering if we don't have 16-bit 1598 // shifts. This avoid a lot of intermediate truncate and extend operations. 1599 SextInReg.lowerFor({{S32}, {S64}}); 1600 } 1601 1602 SextInReg 1603 .scalarize(0) 1604 .clampScalar(0, S32, S64) 1605 .lower(); 1606 1607 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1608 .scalarize(0) 1609 .lower(); 1610 1611 // TODO: Only Try to form v2s16 with legal packed instructions. 
1612 getActionDefinitionsBuilder(G_FSHR) 1613 .legalFor({{S32, S32}}) 1614 .lowerFor({{V2S16, V2S16}}) 1615 .clampMaxNumElementsStrict(0, S16, 2) 1616 .scalarize(0) 1617 .lower(); 1618 1619 if (ST.hasVOP3PInsts()) { 1620 getActionDefinitionsBuilder(G_FSHL) 1621 .lowerFor({{V2S16, V2S16}}) 1622 .clampMaxNumElementsStrict(0, S16, 2) 1623 .scalarize(0) 1624 .lower(); 1625 } else { 1626 getActionDefinitionsBuilder(G_FSHL) 1627 .scalarize(0) 1628 .lower(); 1629 } 1630 1631 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1632 .legalFor({S64}); 1633 1634 getActionDefinitionsBuilder(G_FENCE) 1635 .alwaysLegal(); 1636 1637 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1638 .scalarize(0) 1639 .minScalar(0, S32) 1640 .lower(); 1641 1642 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1643 .legalFor({{S32, S32}, {S64, S32}}) 1644 .clampScalar(1, S32, S32) 1645 .clampScalar(0, S32, S64) 1646 .widenScalarToNextPow2(0) 1647 .scalarize(0); 1648 1649 getActionDefinitionsBuilder({ 1650 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1651 G_FCOPYSIGN, 1652 1653 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1654 G_ATOMICRMW_NAND, 1655 G_ATOMICRMW_FSUB, 1656 G_READ_REGISTER, 1657 G_WRITE_REGISTER, 1658 1659 G_SADDO, G_SSUBO, 1660 1661 // TODO: Implement 1662 G_FMINIMUM, G_FMAXIMUM}).lower(); 1663 1664 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1665 .lower(); 1666 1667 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1668 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1669 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1670 .unsupported(); 1671 1672 getLegacyLegalizerInfo().computeTables(); 1673 verify(*ST.getInstrInfo()); 1674 } 1675 1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1677 MachineInstr &MI) const { 1678 MachineIRBuilder &B = Helper.MIRBuilder; 1679 MachineRegisterInfo &MRI = *B.getMRI(); 1680 1681 switch (MI.getOpcode()) { 1682 case TargetOpcode::G_ADDRSPACE_CAST: 1683 return legalizeAddrSpaceCast(MI, MRI, B); 1684 case TargetOpcode::G_FRINT: 1685 return legalizeFrint(MI, MRI, B); 1686 case TargetOpcode::G_FCEIL: 1687 return legalizeFceil(MI, MRI, B); 1688 case TargetOpcode::G_FREM: 1689 return legalizeFrem(MI, MRI, B); 1690 case TargetOpcode::G_INTRINSIC_TRUNC: 1691 return legalizeIntrinsicTrunc(MI, MRI, B); 1692 case TargetOpcode::G_SITOFP: 1693 return legalizeITOFP(MI, MRI, B, true); 1694 case TargetOpcode::G_UITOFP: 1695 return legalizeITOFP(MI, MRI, B, false); 1696 case TargetOpcode::G_FPTOSI: 1697 return legalizeFPTOI(MI, MRI, B, true); 1698 case TargetOpcode::G_FPTOUI: 1699 return legalizeFPTOI(MI, MRI, B, false); 1700 case TargetOpcode::G_FMINNUM: 1701 case TargetOpcode::G_FMAXNUM: 1702 case TargetOpcode::G_FMINNUM_IEEE: 1703 case TargetOpcode::G_FMAXNUM_IEEE: 1704 return legalizeMinNumMaxNum(Helper, MI); 1705 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1706 return legalizeExtractVectorElt(MI, MRI, B); 1707 case TargetOpcode::G_INSERT_VECTOR_ELT: 1708 return legalizeInsertVectorElt(MI, MRI, B); 1709 case TargetOpcode::G_SHUFFLE_VECTOR: 1710 return legalizeShuffleVector(MI, MRI, B); 1711 case TargetOpcode::G_FSIN: 1712 case TargetOpcode::G_FCOS: 1713 return legalizeSinCos(MI, MRI, B); 1714 case TargetOpcode::G_GLOBAL_VALUE: 1715 return legalizeGlobalValue(MI, MRI, B); 1716 case TargetOpcode::G_LOAD: 1717 case TargetOpcode::G_SEXTLOAD: 1718 case TargetOpcode::G_ZEXTLOAD: 1719 return legalizeLoad(Helper, MI); 1720 case TargetOpcode::G_FMAD: 1721 return legalizeFMad(MI, MRI, B); 1722 case TargetOpcode::G_FDIV: 1723 return legalizeFDIV(MI, 
MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 case TargetOpcode::G_CTLZ: 1747 case TargetOpcode::G_CTTZ: 1748 return legalizeCTLZ_CTTZ(MI, MRI, B); 1749 default: 1750 return false; 1751 } 1752 1753 llvm_unreachable("expected switch to return"); 1754 } 1755 1756 Register AMDGPULegalizerInfo::getSegmentAperture( 1757 unsigned AS, 1758 MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1762 const LLT S32 = LLT::scalar(32); 1763 1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1765 1766 if (ST.hasApertureRegs()) { 1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1768 // getreg. 1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1775 unsigned Encoding = 1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1779 1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1781 1782 B.buildInstr(AMDGPU::S_GETREG_B32) 1783 .addDef(GetReg) 1784 .addImm(Encoding); 1785 MRI.setType(GetReg, S32); 1786 1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1789 } 1790 1791 Register QueuePtr = MRI.createGenericVirtualRegister( 1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1793 1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1795 return Register(); 1796 1797 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1798 // private_segment_aperture_base_hi. 1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1800 1801 // TODO: can we be smarter about machine pointer info? 
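// Illustrative sketch, not from the original source: on targets without
// aperture registers the sequence built below is conceptually
//   %addr = G_PTR_ADD %queue_ptr, 0x40 (group) or 0x44 (private)
//   %aperture_hi:s32 = G_LOAD %addr   ; invariant, dereferenceable
// i.e. the high half of the flat aperture is read from the amd_queue_t
// structure pointed to by the queue pointer.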
1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 PtrInfo, 1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1806 MachineMemOperand::MOInvariant, 1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1808 1809 Register LoadAddr; 1810 1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 MachineFunction &MF = B.getMF(); 1819 1820 const LLT S32 = LLT::scalar(32); 1821 Register Dst = MI.getOperand(0).getReg(); 1822 Register Src = MI.getOperand(1).getReg(); 1823 1824 LLT DstTy = MRI.getType(Dst); 1825 LLT SrcTy = MRI.getType(Src); 1826 unsigned DestAS = DstTy.getAddressSpace(); 1827 unsigned SrcAS = SrcTy.getAddressSpace(); 1828 1829 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1830 // vector element. 1831 assert(!DstTy.isVector()); 1832 1833 const AMDGPUTargetMachine &TM 1834 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1835 1836 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1837 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1838 return true; 1839 } 1840 1841 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1842 // Truncate. 1843 B.buildExtract(Dst, Src, 0); 1844 MI.eraseFromParent(); 1845 return true; 1846 } 1847 1848 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1849 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1850 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1851 1852 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1853 // another. Merge operands are required to be the same type, but creating an 1854 // extra ptrtoint would be kind of pointless. 1855 auto HighAddr = B.buildConstant( 1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1857 B.buildMerge(Dst, {Src, HighAddr}); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1863 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1864 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1865 unsigned NullVal = TM.getNullPointerValue(DestAS); 1866 1867 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1868 auto FlatNull = B.buildConstant(SrcTy, 0); 1869 1870 // Extract low 32-bits of the pointer. 1871 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1872 1873 auto CmpRes = 1874 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1875 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1876 1877 MI.eraseFromParent(); 1878 return true; 1879 } 1880 1881 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1882 return false; 1883 1884 if (!ST.hasFlatAddressSpace()) 1885 return false; 1886 1887 auto SegmentNull = 1888 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1889 auto FlatNull = 1890 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1891 1892 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1893 if (!ApertureReg.isValid()) 1894 return false; 1895 1896 auto CmpRes = 1897 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1898 1899 // Coerce the type of the low half of the result so we can use merge_values. 
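// Illustrative summary of this expansion, not from the original source:
//   %flat = (%src != segment_null) ? merge_values(ptrtoint(%src), %aperture)
//                                  : flat_null
// where %aperture supplies the high 32 bits of the 64-bit flat pointer.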
1900 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1901 1902 // TODO: Should we allow mismatched types but matching sizes in merges to 1903 // avoid the ptrtoint? 1904 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1905 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1906 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFrint( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 Register Src = MI.getOperand(1).getReg(); 1915 LLT Ty = MRI.getType(Src); 1916 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1917 1918 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1919 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1920 1921 auto C1 = B.buildFConstant(Ty, C1Val); 1922 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1923 1924 // TODO: Should this propagate fast-math-flags? 1925 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1926 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1927 1928 auto C2 = B.buildFConstant(Ty, C2Val); 1929 auto Fabs = B.buildFAbs(Ty, Src); 1930 1931 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1932 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1933 MI.eraseFromParent(); 1934 return true; 1935 } 1936 1937 bool AMDGPULegalizerInfo::legalizeFceil( 1938 MachineInstr &MI, MachineRegisterInfo &MRI, 1939 MachineIRBuilder &B) const { 1940 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S64 = LLT::scalar(64); 1943 1944 Register Src = MI.getOperand(1).getReg(); 1945 assert(MRI.getType(Src) == S64); 1946 1947 // result = trunc(src) 1948 // if (src > 0.0 && src != result) 1949 // result += 1.0 1950 1951 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1952 1953 const auto Zero = B.buildFConstant(S64, 0.0); 1954 const auto One = B.buildFConstant(S64, 1.0); 1955 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1956 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1957 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1958 auto Add = B.buildSelect(S64, And, One, Zero); 1959 1960 // TODO: Should this propagate fast-math-flags? 
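// Worked example, illustrative and not from the original source:
//   src =  2.3: trunc =  2.0, src > 0 and src != trunc, so add 1.0 ->  3.0
//   src = -2.3: trunc = -2.0, the condition is false,   so add 0.0 -> -2.0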
1961 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1962 return true; 1963 } 1964 1965 bool AMDGPULegalizerInfo::legalizeFrem( 1966 MachineInstr &MI, MachineRegisterInfo &MRI, 1967 MachineIRBuilder &B) const { 1968 Register DstReg = MI.getOperand(0).getReg(); 1969 Register Src0Reg = MI.getOperand(1).getReg(); 1970 Register Src1Reg = MI.getOperand(2).getReg(); 1971 auto Flags = MI.getFlags(); 1972 LLT Ty = MRI.getType(DstReg); 1973 1974 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1975 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1976 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1977 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1978 MI.eraseFromParent(); 1979 return true; 1980 } 1981 1982 static MachineInstrBuilder extractF64Exponent(Register Hi, 1983 MachineIRBuilder &B) { 1984 const unsigned FractBits = 52; 1985 const unsigned ExpBits = 11; 1986 LLT S32 = LLT::scalar(32); 1987 1988 auto Const0 = B.buildConstant(S32, FractBits - 32); 1989 auto Const1 = B.buildConstant(S32, ExpBits); 1990 1991 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1992 .addUse(Hi) 1993 .addUse(Const0.getReg(0)) 1994 .addUse(Const1.getReg(0)); 1995 1996 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1997 } 1998 1999 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2000 MachineInstr &MI, MachineRegisterInfo &MRI, 2001 MachineIRBuilder &B) const { 2002 const LLT S1 = LLT::scalar(1); 2003 const LLT S32 = LLT::scalar(32); 2004 const LLT S64 = LLT::scalar(64); 2005 2006 Register Src = MI.getOperand(1).getReg(); 2007 assert(MRI.getType(Src) == S64); 2008 2009 // TODO: Should this use extract since the low half is unused? 2010 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2011 Register Hi = Unmerge.getReg(1); 2012 2013 // Extract the upper half, since this is where we will find the sign and 2014 // exponent. 2015 auto Exp = extractF64Exponent(Hi, B); 2016 2017 const unsigned FractBits = 52; 2018 2019 // Extract the sign bit. 2020 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2021 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2022 2023 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2024 2025 const auto Zero32 = B.buildConstant(S32, 0); 2026 2027 // Extend back to 64-bits. 2028 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2029 2030 auto Shr = B.buildAShr(S64, FractMask, Exp); 2031 auto Not = B.buildNot(S64, Shr); 2032 auto Tmp0 = B.buildAnd(S64, Src, Not); 2033 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2034 2035 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2036 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2037 2038 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2039 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2040 MI.eraseFromParent(); 2041 return true; 2042 } 2043 2044 bool AMDGPULegalizerInfo::legalizeITOFP( 2045 MachineInstr &MI, MachineRegisterInfo &MRI, 2046 MachineIRBuilder &B, bool Signed) const { 2047 2048 Register Dst = MI.getOperand(0).getReg(); 2049 Register Src = MI.getOperand(1).getReg(); 2050 2051 const LLT S64 = LLT::scalar(64); 2052 const LLT S32 = LLT::scalar(32); 2053 2054 assert(MRI.getType(Src) == S64); 2055 2056 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2057 auto ThirtyTwo = B.buildConstant(S32, 32); 2058 2059 if (MRI.getType(Dst) == S64) { 2060 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2061 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2062 2063 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2064 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2065 .addUse(CvtHi.getReg(0)) 2066 .addUse(ThirtyTwo.getReg(0)); 2067 2068 // TODO: Should this propagate fast-math-flags? 2069 B.buildFAdd(Dst, LdExp, CvtLo); 2070 MI.eraseFromParent(); 2071 return true; 2072 } 2073 2074 assert(MRI.getType(Dst) == S32); 2075 2076 auto One = B.buildConstant(S32, 1); 2077 2078 MachineInstrBuilder ShAmt; 2079 if (Signed) { 2080 auto ThirtyOne = B.buildConstant(S32, 31); 2081 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2082 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2083 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2084 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2085 /*HasSideEffects=*/false) 2086 .addUse(Unmerge.getReg(1)); 2087 auto LS2 = B.buildSub(S32, LS, One); 2088 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2089 } else 2090 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2091 auto Norm = B.buildShl(S64, Src, ShAmt); 2092 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2093 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2094 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2095 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2096 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2097 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2098 /*HasSideEffects=*/false) 2099 .addUse(FVal.getReg(0)) 2100 .addUse(Scale.getReg(0)); 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 // TODO: Copied from DAG implementation. Verify logic and document how this 2106 // actually works. 2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2108 MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B, 2110 bool Signed) const { 2111 2112 Register Dst = MI.getOperand(0).getReg(); 2113 Register Src = MI.getOperand(1).getReg(); 2114 2115 const LLT S64 = LLT::scalar(64); 2116 const LLT S32 = LLT::scalar(32); 2117 2118 const LLT SrcLT = MRI.getType(Src); 2119 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2120 2121 unsigned Flags = MI.getFlags(); 2122 2123 // The basic idea of converting a floating point number into a pair of 32-bit 2124 // integers is illustrated as follows: 2125 // 2126 // tf := trunc(val); 2127 // hif := floor(tf * 2^-32); 2128 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2129 // hi := fptoi(hif); 2130 // lo := fptoi(lof); 2131 // 2132 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2133 MachineInstrBuilder Sign; 2134 if (Signed && SrcLT == S32) { 2135 // However, a 32-bit floating point number has only 23 bits mantissa and 2136 // it's not enough to hold all the significant bits of `lof` if val is 2137 // negative. To avoid the loss of precision, We need to take the absolute 2138 // value after truncating and flip the result back based on the original 2139 // signedness. 
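// Illustrative note, not from the original source: the flip performed later
// relies on the two's complement identity (x ^ s) - s, which yields x when
// s == 0 and -x when s == -1 (all ones), i.e. a branchless conditional
// negation keyed on the original sign.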
2140 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2141 Trunc = B.buildFAbs(S32, Trunc, Flags); 2142 } 2143 MachineInstrBuilder K0, K1; 2144 if (SrcLT == S64) { 2145 K0 = B.buildFConstant(S64, 2146 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2147 K1 = B.buildFConstant(S64, 2148 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2149 } else { 2150 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2151 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2152 } 2153 2154 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2155 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2156 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2157 2158 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2159 : B.buildFPTOUI(S32, FloorMul); 2160 auto Lo = B.buildFPTOUI(S32, Fma); 2161 2162 if (Signed && SrcLT == S32) { 2163 // Flip the result based on the signedness, which is either all 0s or 1s. 2164 Sign = B.buildMerge(S64, {Sign, Sign}); 2165 // r := xor({lo, hi}, sign) - sign; 2166 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2167 } else 2168 B.buildMerge(Dst, {Lo, Hi}); 2169 MI.eraseFromParent(); 2170 2171 return true; 2172 } 2173 2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2175 MachineInstr &MI) const { 2176 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2177 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2178 2179 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2180 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2181 2182 // With ieee_mode disabled, the instructions have the correct behavior 2183 // already for G_FMINNUM/G_FMAXNUM 2184 if (!MFI->getMode().IEEE) 2185 return !IsIEEEOp; 2186 2187 if (IsIEEEOp) 2188 return true; 2189 2190 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2191 } 2192 2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2194 MachineInstr &MI, MachineRegisterInfo &MRI, 2195 MachineIRBuilder &B) const { 2196 // TODO: Should move some of this into LegalizerHelper. 2197 2198 // TODO: Promote dynamic indexing of s16 to s32 2199 2200 // FIXME: Artifact combiner probably should have replaced the truncated 2201 // constant before this, so we shouldn't need 2202 // getIConstantVRegValWithLookThrough. 2203 Optional<ValueAndVReg> MaybeIdxVal = 2204 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2205 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2206 return true; 2207 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2208 2209 Register Dst = MI.getOperand(0).getReg(); 2210 Register Vec = MI.getOperand(1).getReg(); 2211 2212 LLT VecTy = MRI.getType(Vec); 2213 LLT EltTy = VecTy.getElementType(); 2214 assert(EltTy == MRI.getType(Dst)); 2215 2216 if (IdxVal < VecTy.getNumElements()) 2217 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2218 else 2219 B.buildUndef(Dst); 2220 2221 MI.eraseFromParent(); 2222 return true; 2223 } 2224 2225 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2226 MachineInstr &MI, MachineRegisterInfo &MRI, 2227 MachineIRBuilder &B) const { 2228 // TODO: Should move some of this into LegalizerHelper. 2229 2230 // TODO: Promote dynamic indexing of s16 to s32 2231 2232 // FIXME: Artifact combiner probably should have replaced the truncated 2233 // constant before this, so we shouldn't need 2234 // getIConstantVRegValWithLookThrough. 
2234 // getIConstantVRegValWithLookThrough.
2235 Optional<ValueAndVReg> MaybeIdxVal = 2236 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2237 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2238 return true; 2239 2240 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2241 Register Dst = MI.getOperand(0).getReg(); 2242 Register Vec = MI.getOperand(1).getReg(); 2243 Register Ins = MI.getOperand(2).getReg(); 2244 2245 LLT VecTy = MRI.getType(Vec); 2246 LLT EltTy = VecTy.getElementType(); 2247 assert(EltTy == MRI.getType(Ins)); 2248 2249 if (IdxVal < VecTy.getNumElements()) 2250 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2251 else 2252 B.buildUndef(Dst); 2253 2254 MI.eraseFromParent(); 2255 return true; 2256 } 2257 2258 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2259 MachineInstr &MI, MachineRegisterInfo &MRI, 2260 MachineIRBuilder &B) const { 2261 const LLT V2S16 = LLT::fixed_vector(2, 16); 2262 2263 Register Dst = MI.getOperand(0).getReg(); 2264 Register Src0 = MI.getOperand(1).getReg(); 2265 LLT DstTy = MRI.getType(Dst); 2266 LLT SrcTy = MRI.getType(Src0); 2267 2268 if (SrcTy == V2S16 && DstTy == V2S16 && 2269 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2270 return true; 2271 2272 MachineIRBuilder HelperBuilder(MI); 2273 GISelObserverWrapper DummyObserver; 2274 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2275 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2276 } 2277 2278 bool AMDGPULegalizerInfo::legalizeSinCos( 2279 MachineInstr &MI, MachineRegisterInfo &MRI, 2280 MachineIRBuilder &B) const { 2281 2282 Register DstReg = MI.getOperand(0).getReg(); 2283 Register SrcReg = MI.getOperand(1).getReg(); 2284 LLT Ty = MRI.getType(DstReg); 2285 unsigned Flags = MI.getFlags(); 2286 2287 Register TrigVal; 2288 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2289 if (ST.hasTrigReducedRange()) { 2290 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2291 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2292 .addUse(MulVal.getReg(0)) 2293 .setMIFlags(Flags).getReg(0); 2294 } else 2295 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2296 2297 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2298 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2299 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2300 .addUse(TrigVal) 2301 .setMIFlags(Flags); 2302 MI.eraseFromParent(); 2303 return true; 2304 } 2305 2306 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2307 MachineIRBuilder &B, 2308 const GlobalValue *GV, 2309 int64_t Offset, 2310 unsigned GAFlags) const { 2311 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2312 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2313 // to the following code sequence: 2314 // 2315 // For constant address space: 2316 // s_getpc_b64 s[0:1] 2317 // s_add_u32 s0, s0, $symbol 2318 // s_addc_u32 s1, s1, 0 2319 // 2320 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2321 // a fixup or relocation is emitted to replace $symbol with a literal 2322 // constant, which is a pc-relative offset from the encoding of the $symbol 2323 // operand to the global variable. 
2324 // 2325 // For global address space: 2326 // s_getpc_b64 s[0:1] 2327 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2328 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2329 // 2330 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2331 // fixups or relocations are emitted to replace $symbol@*@lo and 2332 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2333 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2334 // operand to the global variable. 2335 // 2336 // What we want here is an offset from the value returned by s_getpc 2337 // (which is the address of the s_add_u32 instruction) to the global 2338 // variable, but since the encoding of $symbol starts 4 bytes after the start 2339 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2340 // small. This requires us to add 4 to the global variable offset in order to 2341 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2342 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2343 // instruction. 2344 2345 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2346 2347 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2348 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2349 2350 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2351 .addDef(PCReg); 2352 2353 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2354 if (GAFlags == SIInstrInfo::MO_NONE) 2355 MIB.addImm(0); 2356 else 2357 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2358 2359 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2360 2361 if (PtrTy.getSizeInBits() == 32) 2362 B.buildExtract(DstReg, PCReg, 0); 2363 return true; 2364 } 2365 2366 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2367 MachineInstr &MI, MachineRegisterInfo &MRI, 2368 MachineIRBuilder &B) const { 2369 Register DstReg = MI.getOperand(0).getReg(); 2370 LLT Ty = MRI.getType(DstReg); 2371 unsigned AS = Ty.getAddressSpace(); 2372 2373 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2374 MachineFunction &MF = B.getMF(); 2375 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2376 2377 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2378 if (!MFI->isModuleEntryFunction() && 2379 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2380 const Function &Fn = MF.getFunction(); 2381 DiagnosticInfoUnsupported BadLDSDecl( 2382 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2383 DS_Warning); 2384 Fn.getContext().diagnose(BadLDSDecl); 2385 2386 // We currently don't have a way to correctly allocate LDS objects that 2387 // aren't directly associated with a kernel. We do force inlining of 2388 // functions that use local objects. However, if these dead functions are 2389 // not eliminated, we don't want a compile time error. Just emit a warning 2390 // and a trap, since there should be no callable path here. 2391 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2392 B.buildUndef(DstReg); 2393 MI.eraseFromParent(); 2394 return true; 2395 } 2396 2397 // TODO: We could emit code to handle the initialization somewhere. 2398 // We ignore the initializer for now and legalize it to allow selection. 2399 // The initializer will anyway get errored out during assembly emission. 
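// Illustrative summary, not from the original source: an LDS global is
// either left as a symbol carrying an absolute 32-bit fixup (MO_ABS32_LO)
// or folded to the constant byte offset assigned to it within this
// kernel's LDS block; dynamic (zero-sized) LDS arrays instead start right
// after the static allocation, at the size reported by
// amdgcn.groupstaticsize.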
2400 const SITargetLowering *TLI = ST.getTargetLowering(); 2401 if (!TLI->shouldUseLDSConstAddress(GV)) { 2402 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2403 return true; // Leave in place. 2404 } 2405 2406 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2407 Type *Ty = GV->getValueType(); 2408 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2409 // zero-sized type in other languages to declare the dynamic shared 2410 // memory whose size is not known at compile time. They will be 2411 // allocated by the runtime and placed directly after the statically 2412 // allocated ones. They all share the same offset. 2413 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2414 // Adjust alignment for that dynamic shared memory array. 2415 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2416 LLT S32 = LLT::scalar(32); 2417 auto Sz = 2418 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2419 B.buildIntToPtr(DstReg, Sz); 2420 MI.eraseFromParent(); 2421 return true; 2422 } 2423 } 2424 2425 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2426 *cast<GlobalVariable>(GV))); 2427 MI.eraseFromParent(); 2428 return true; 2429 } 2430 2431 const SITargetLowering *TLI = ST.getTargetLowering(); 2432 2433 if (TLI->shouldEmitFixup(GV)) { 2434 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 if (TLI->shouldEmitPCReloc(GV)) { 2440 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2441 MI.eraseFromParent(); 2442 return true; 2443 } 2444 2445 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2446 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2447 2448 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2449 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2450 MachinePointerInfo::getGOT(MF), 2451 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2452 MachineMemOperand::MOInvariant, 2453 LoadTy, Align(8)); 2454 2455 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2456 2457 if (Ty.getSizeInBits() == 32) { 2458 // Truncate if this is a 32-bit constant address.
2459 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2460 B.buildExtract(DstReg, Load, 0); 2461 } else 2462 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2463 2464 MI.eraseFromParent(); 2465 return true; 2466 } 2467 2468 static LLT widenToNextPowerOf2(LLT Ty) { 2469 if (Ty.isVector()) 2470 return Ty.changeElementCount( 2471 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2472 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2473 } 2474 2475 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2476 MachineInstr &MI) const { 2477 MachineIRBuilder &B = Helper.MIRBuilder; 2478 MachineRegisterInfo &MRI = *B.getMRI(); 2479 GISelChangeObserver &Observer = Helper.Observer; 2480 2481 Register PtrReg = MI.getOperand(1).getReg(); 2482 LLT PtrTy = MRI.getType(PtrReg); 2483 unsigned AddrSpace = PtrTy.getAddressSpace(); 2484 2485 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2486 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2487 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2488 Observer.changingInstr(MI); 2489 MI.getOperand(1).setReg(Cast.getReg(0)); 2490 Observer.changedInstr(MI); 2491 return true; 2492 } 2493 2494 if (MI.getOpcode() != AMDGPU::G_LOAD) 2495 return false; 2496 2497 Register ValReg = MI.getOperand(0).getReg(); 2498 LLT ValTy = MRI.getType(ValReg); 2499 2500 MachineMemOperand *MMO = *MI.memoperands_begin(); 2501 const unsigned ValSize = ValTy.getSizeInBits(); 2502 const LLT MemTy = MMO->getMemoryType(); 2503 const Align MemAlign = MMO->getAlign(); 2504 const unsigned MemSize = MemTy.getSizeInBits(); 2505 const unsigned AlignInBits = 8 * MemAlign.value(); 2506 2507 // Widen non-power-of-2 loads to the alignment if needed 2508 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2509 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2510 2511 // This was already the correct extending load result type, so just adjust 2512 // the memory type. 2513 if (WideMemSize == ValSize) { 2514 MachineFunction &MF = B.getMF(); 2515 2516 MachineMemOperand *WideMMO = 2517 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2518 Observer.changingInstr(MI); 2519 MI.setMemRefs(MF, {WideMMO}); 2520 Observer.changedInstr(MI); 2521 return true; 2522 } 2523 2524 // Don't bother handling edge case that should probably never be produced. 2525 if (ValSize > WideMemSize) 2526 return false; 2527 2528 LLT WideTy = widenToNextPowerOf2(ValTy); 2529 2530 Register WideLoad; 2531 if (!WideTy.isVector()) { 2532 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2533 B.buildTrunc(ValReg, WideLoad).getReg(0); 2534 } else { 2535 // Extract the subvector. 2536 2537 if (isRegisterType(ValTy)) { 2538 // If this a case where G_EXTRACT is legal, use it. 2539 // (e.g. <3 x s32> -> <4 x s32>) 2540 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2541 B.buildExtract(ValReg, WideLoad, 0); 2542 } else { 2543 // For cases where the widened type isn't a nice register value, unmerge 2544 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2545 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2546 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2547 } 2548 } 2549 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 return false; 2555 } 2556 2557 bool AMDGPULegalizerInfo::legalizeFMad( 2558 MachineInstr &MI, MachineRegisterInfo &MRI, 2559 MachineIRBuilder &B) const { 2560 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2561 assert(Ty.isScalar()); 2562 2563 MachineFunction &MF = B.getMF(); 2564 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2565 2566 // TODO: Always legal with future ftz flag. 2567 // FIXME: Do we need just output? 2568 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2569 return true; 2570 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2571 return true; 2572 2573 MachineIRBuilder HelperBuilder(MI); 2574 GISelObserverWrapper DummyObserver; 2575 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2576 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2577 } 2578 2579 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2580 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2581 Register DstReg = MI.getOperand(0).getReg(); 2582 Register PtrReg = MI.getOperand(1).getReg(); 2583 Register CmpVal = MI.getOperand(2).getReg(); 2584 Register NewVal = MI.getOperand(3).getReg(); 2585 2586 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2587 "this should not have been custom lowered"); 2588 2589 LLT ValTy = MRI.getType(CmpVal); 2590 LLT VecTy = LLT::fixed_vector(2, ValTy); 2591 2592 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2593 2594 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2595 .addDef(DstReg) 2596 .addUse(PtrReg) 2597 .addUse(PackedVal) 2598 .setMemRefs(MI.memoperands()); 2599 2600 MI.eraseFromParent(); 2601 return true; 2602 } 2603 2604 bool AMDGPULegalizerInfo::legalizeFlog( 2605 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2606 Register Dst = MI.getOperand(0).getReg(); 2607 Register Src = MI.getOperand(1).getReg(); 2608 LLT Ty = B.getMRI()->getType(Dst); 2609 unsigned Flags = MI.getFlags(); 2610 2611 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2612 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2613 2614 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2615 MI.eraseFromParent(); 2616 return true; 2617 } 2618 2619 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2620 MachineIRBuilder &B) const { 2621 Register Dst = MI.getOperand(0).getReg(); 2622 Register Src = MI.getOperand(1).getReg(); 2623 unsigned Flags = MI.getFlags(); 2624 LLT Ty = B.getMRI()->getType(Dst); 2625 2626 auto K = B.buildFConstant(Ty, numbers::log2e); 2627 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2628 B.buildFExp2(Dst, Mul, Flags); 2629 MI.eraseFromParent(); 2630 return true; 2631 } 2632 2633 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2634 MachineIRBuilder &B) const { 2635 Register Dst = MI.getOperand(0).getReg(); 2636 Register Src0 = MI.getOperand(1).getReg(); 2637 Register Src1 = MI.getOperand(2).getReg(); 2638 unsigned Flags = MI.getFlags(); 2639 LLT Ty = B.getMRI()->getType(Dst); 2640 const LLT S16 = LLT::scalar(16); 2641 const LLT S32 = LLT::scalar(32); 2642 2643 if (Ty == S32) { 2644 auto Log = B.buildFLog2(S32, Src0, Flags); 2645 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2646 
.addUse(Log.getReg(0)) 2647 .addUse(Src1) 2648 .setMIFlags(Flags); 2649 B.buildFExp2(Dst, Mul, Flags); 2650 } else if (Ty == S16) { 2651 // There's no f16 fmul_legacy, so we need to convert for it. 2652 auto Log = B.buildFLog2(S16, Src0, Flags); 2653 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2654 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2655 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2656 .addUse(Ext0.getReg(0)) 2657 .addUse(Ext1.getReg(0)) 2658 .setMIFlags(Flags); 2659 2660 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2661 } else 2662 return false; 2663 2664 MI.eraseFromParent(); 2665 return true; 2666 } 2667 2668 // Find a source register, ignoring any possible source modifiers. 2669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2670 Register ModSrc = OrigSrc; 2671 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2672 ModSrc = SrcFNeg->getOperand(1).getReg(); 2673 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2674 ModSrc = SrcFAbs->getOperand(1).getReg(); 2675 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2676 ModSrc = SrcFAbs->getOperand(1).getReg(); 2677 return ModSrc; 2678 } 2679 2680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2681 MachineRegisterInfo &MRI, 2682 MachineIRBuilder &B) const { 2683 2684 const LLT S1 = LLT::scalar(1); 2685 const LLT S64 = LLT::scalar(64); 2686 Register Dst = MI.getOperand(0).getReg(); 2687 Register OrigSrc = MI.getOperand(1).getReg(); 2688 unsigned Flags = MI.getFlags(); 2689 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2690 "this should not have been custom lowered"); 2691 2692 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2693 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2694 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2695 // V_FRACT bug is: 2696 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2697 // 2698 // Convert floor(x) to (x - fract(x)) 2699 2700 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2701 .addUse(OrigSrc) 2702 .setMIFlags(Flags); 2703 2704 // Give source modifier matching some assistance before obscuring a foldable 2705 // pattern. 2706 2707 // TODO: We can avoid the neg on the fract? The input sign to fract 2708 // shouldn't matter? 2709 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2710 2711 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2712 2713 Register Min = MRI.createGenericVirtualRegister(S64); 2714 2715 // We don't need to concern ourselves with the snan handling difference, so 2716 // use the one which will directly select. 2717 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2718 if (MFI->getMode().IEEE) 2719 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2720 else 2721 B.buildFMinNum(Min, Fract, Const, Flags); 2722 2723 Register CorrectedFract = Min; 2724 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2725 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2726 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2727 } 2728 2729 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2730 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2731 2732 MI.eraseFromParent(); 2733 return true; 2734 } 2735 2736 // Turn an illegal packed v2s16 build vector into bit operations. 
2737 // TODO: This should probably be a bitcast action in LegalizerHelper. 2738 bool AMDGPULegalizerInfo::legalizeBuildVector( 2739 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2740 Register Dst = MI.getOperand(0).getReg(); 2741 const LLT S32 = LLT::scalar(32); 2742 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2743 2744 Register Src0 = MI.getOperand(1).getReg(); 2745 Register Src1 = MI.getOperand(2).getReg(); 2746 assert(MRI.getType(Src0) == LLT::scalar(16)); 2747 2748 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2749 B.buildBitcast(Dst, Merge); 2750 2751 MI.eraseFromParent(); 2752 return true; 2753 } 2754 2755 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2756 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2757 // case with a single min instruction instead of a compare+select. 2758 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2759 MachineRegisterInfo &MRI, 2760 MachineIRBuilder &B) const { 2761 Register Dst = MI.getOperand(0).getReg(); 2762 Register Src = MI.getOperand(1).getReg(); 2763 LLT DstTy = MRI.getType(Dst); 2764 LLT SrcTy = MRI.getType(Src); 2765 2766 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2767 ? AMDGPU::G_AMDGPU_FFBH_U32 2768 : AMDGPU::G_AMDGPU_FFBL_B32; 2769 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2770 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2771 2772 MI.eraseFromParent(); 2773 return true; 2774 } 2775 2776 // Check that this is a G_XOR x, -1 2777 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2778 if (MI.getOpcode() != TargetOpcode::G_XOR) 2779 return false; 2780 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2781 return ConstVal && *ConstVal == -1; 2782 } 2783 2784 // Return the use branch instruction, otherwise null if the usage is invalid. 2785 static MachineInstr * 2786 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2787 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2788 Register CondDef = MI.getOperand(0).getReg(); 2789 if (!MRI.hasOneNonDBGUse(CondDef)) 2790 return nullptr; 2791 2792 MachineBasicBlock *Parent = MI.getParent(); 2793 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2794 2795 if (isNot(MRI, *UseMI)) { 2796 Register NegatedCond = UseMI->getOperand(0).getReg(); 2797 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2798 return nullptr; 2799 2800 // We're deleting the def of this value, so we need to remove it. 2801 eraseInstr(*UseMI, MRI); 2802 2803 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2804 Negated = true; 2805 } 2806 2807 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2808 return nullptr; 2809 2810 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2811 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2812 if (Next == Parent->end()) { 2813 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2814 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2815 return nullptr; 2816 UncondBrTarget = &*NextMBB; 2817 } else { 2818 if (Next->getOpcode() != AMDGPU::G_BR) 2819 return nullptr; 2820 Br = &*Next; 2821 UncondBrTarget = Br->getOperand(0).getMBB(); 2822 } 2823 2824 return UseMI; 2825 } 2826 2827 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2828 const ArgDescriptor *Arg, 2829 const TargetRegisterClass *ArgRC, 2830 LLT ArgTy) const { 2831 MCRegister SrcReg = Arg->getRegister(); 2832 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2833 assert(DstReg.isVirtual() && "Virtual register expected"); 2834 2835 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2836 ArgTy); 2837 if (Arg->isMasked()) { 2838 // TODO: Should we try to emit this once in the entry block? 2839 const LLT S32 = LLT::scalar(32); 2840 const unsigned Mask = Arg->getMask(); 2841 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2842 2843 Register AndMaskSrc = LiveIn; 2844 2845 if (Shift != 0) { 2846 auto ShiftAmt = B.buildConstant(S32, Shift); 2847 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2848 } 2849 2850 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2851 } else { 2852 B.buildCopy(DstReg, LiveIn); 2853 } 2854 2855 return true; 2856 } 2857 2858 bool AMDGPULegalizerInfo::loadInputValue( 2859 Register DstReg, MachineIRBuilder &B, 2860 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2861 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2862 const ArgDescriptor *Arg; 2863 const TargetRegisterClass *ArgRC; 2864 LLT ArgTy; 2865 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2866 2867 if (!Arg) { 2868 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2869 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2870 // case the pointer argument may be missing and we use null. 2871 B.buildConstant(DstReg, 0); 2872 return true; 2873 } 2874 2875 // It's undefined behavior if a function marked with the amdgpu-no-* 2876 // attributes uses the corresponding intrinsic. 
2877 B.buildUndef(DstReg); 2878 return true; 2879 } 2880 2881 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2882 return false; // TODO: Handle these 2883 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2884 } 2885 2886 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2887 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2888 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2889 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2890 return false; 2891 2892 MI.eraseFromParent(); 2893 return true; 2894 } 2895 2896 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2897 MachineRegisterInfo &MRI, 2898 MachineIRBuilder &B) const { 2899 Register Dst = MI.getOperand(0).getReg(); 2900 LLT DstTy = MRI.getType(Dst); 2901 LLT S16 = LLT::scalar(16); 2902 LLT S32 = LLT::scalar(32); 2903 LLT S64 = LLT::scalar(64); 2904 2905 if (DstTy == S16) 2906 return legalizeFDIV16(MI, MRI, B); 2907 if (DstTy == S32) 2908 return legalizeFDIV32(MI, MRI, B); 2909 if (DstTy == S64) 2910 return legalizeFDIV64(MI, MRI, B); 2911 2912 return false; 2913 } 2914 2915 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2916 Register DstDivReg, 2917 Register DstRemReg, 2918 Register X, 2919 Register Y) const { 2920 const LLT S1 = LLT::scalar(1); 2921 const LLT S32 = LLT::scalar(32); 2922 2923 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2924 // algorithm used here. 2925 2926 // Initial estimate of inv(y). 2927 auto FloatY = B.buildUITOFP(S32, Y); 2928 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2929 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2930 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2931 auto Z = B.buildFPTOUI(S32, ScaledY); 2932 2933 // One round of UNR. 2934 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2935 auto NegYZ = B.buildMul(S32, NegY, Z); 2936 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2937 2938 // Quotient/remainder estimate. 2939 auto Q = B.buildUMulH(S32, X, Z); 2940 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2941 2942 // First quotient/remainder refinement. 2943 auto One = B.buildConstant(S32, 1); 2944 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2945 if (DstDivReg) 2946 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2947 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2948 2949 // Second quotient/remainder refinement. 
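// Illustrative note, not from the original source: after the single
// Newton-Raphson style correction above, the quotient estimate is expected
// to undershoot the exact quotient by at most 2, so two conditional
// "+1 / -Y" fixups are sufficient.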
2950 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2951 if (DstDivReg) 2952 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2953 2954 if (DstRemReg) 2955 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2956 } 2957 2958 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2959 // 2960 // Return lo, hi of result 2961 // 2962 // %cvt.lo = G_UITOFP Val.lo 2963 // %cvt.hi = G_UITOFP Val.hi 2964 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2965 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2966 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2967 // %mul2 = G_FMUL %mul1, 2**(-32) 2968 // %trunc = G_INTRINSIC_TRUNC %mul2 2969 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2970 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2971 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2972 Register Val) { 2973 const LLT S32 = LLT::scalar(32); 2974 auto Unmerge = B.buildUnmerge(S32, Val); 2975 2976 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2977 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2978 2979 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2980 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2981 2982 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2983 auto Mul1 = 2984 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2985 2986 // 2**(-32) 2987 auto Mul2 = 2988 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2989 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2990 2991 // -(2**32) 2992 auto Mad2 = B.buildFMAD(S32, Trunc, 2993 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2994 2995 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2996 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2997 2998 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2999 } 3000 3001 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3002 Register DstDivReg, 3003 Register DstRemReg, 3004 Register Numer, 3005 Register Denom) const { 3006 const LLT S32 = LLT::scalar(32); 3007 const LLT S64 = LLT::scalar(64); 3008 const LLT S1 = LLT::scalar(1); 3009 Register RcpLo, RcpHi; 3010 3011 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3012 3013 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3014 3015 auto Zero64 = B.buildConstant(S64, 0); 3016 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3017 3018 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3019 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3020 3021 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3022 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3023 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3024 3025 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3026 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3027 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3028 3029 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3030 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3031 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3032 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3033 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3034 3035 auto Zero32 = B.buildConstant(S32, 0); 3036 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3037 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3038 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3039 3040 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3041 Register NumerLo = UnmergeNumer.getReg(0); 3042 Register NumerHi = UnmergeNumer.getReg(1); 3043 3044 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3045 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3046 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3047 Register Mul3_Lo = UnmergeMul3.getReg(0); 3048 Register Mul3_Hi = UnmergeMul3.getReg(1); 3049 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3050 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3051 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3052 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3053 3054 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3055 Register DenomLo = UnmergeDenom.getReg(0); 3056 Register DenomHi = UnmergeDenom.getReg(1); 3057 3058 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3059 auto C1 = B.buildSExt(S32, CmpHi); 3060 3061 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3062 auto C2 = B.buildSExt(S32, CmpLo); 3063 3064 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3065 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3066 3067 // TODO: Here and below portions of the code can be enclosed into if/endif. 3068 // Currently control flow is unconditional and we have 4 selects after 3069 // potential endif to substitute PHIs. 3070 3071 // if C3 != 0 ... 3072 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3073 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3074 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3075 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3076 3077 auto One64 = B.buildConstant(S64, 1); 3078 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3079 3080 auto C4 = 3081 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3082 auto C5 = 3083 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3084 auto C6 = B.buildSelect( 3085 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3086 3087 // if (C6 != 0) 3088 auto Add4 = B.buildAdd(S64, Add3, One64); 3089 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3090 3091 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3092 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3093 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3094 3095 // endif C6 3096 // endif C3 3097 3098 if (DstDivReg) { 3099 auto Sel1 = B.buildSelect( 3100 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3101 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3102 Sel1, MulHi3); 3103 } 3104 3105 if (DstRemReg) { 3106 auto Sel2 = B.buildSelect( 3107 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3108 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3109 Sel2, Sub1); 3110 } 3111 } 3112 3113 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3114 MachineRegisterInfo &MRI, 3115 MachineIRBuilder &B) const { 3116 Register DstDivReg, DstRemReg; 3117 switch (MI.getOpcode()) { 3118 default: 3119 llvm_unreachable("Unexpected opcode!"); 3120 case AMDGPU::G_UDIV: { 3121 DstDivReg = MI.getOperand(0).getReg(); 3122 break; 3123 } 3124 case AMDGPU::G_UREM: { 3125 DstRemReg = MI.getOperand(0).getReg(); 3126 break; 3127 } 3128 case AMDGPU::G_UDIVREM: { 3129 DstDivReg = MI.getOperand(0).getReg(); 3130 DstRemReg = MI.getOperand(1).getReg(); 3131 break; 3132 } 3133 } 3134 3135 const LLT S64 = LLT::scalar(64); 3136 const LLT S32 = LLT::scalar(32); 3137 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3138 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3139 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3140 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3141 3142 if (Ty == S32) 3143 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3144 else if (Ty == S64) 3145 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3146 else 3147 return false; 3148 3149 MI.eraseFromParent(); 3150 return true; 3151 } 3152 3153 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3154 MachineRegisterInfo &MRI, 3155 MachineIRBuilder &B) const { 3156 const LLT S64 = LLT::scalar(64); 3157 const LLT S32 = LLT::scalar(32); 3158 3159 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3160 if (Ty != S32 && Ty != S64) 3161 return false; 3162 3163 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3164 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3165 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3166 3167 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3168 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3169 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3170 3171 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3172 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3173 3174 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3175 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3176 3177 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3178 switch (MI.getOpcode()) { 3179 default: 3180 llvm_unreachable("Unexpected opcode!"); 3181 case AMDGPU::G_SDIV: { 3182 DstDivReg = MI.getOperand(0).getReg(); 3183 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3184 break; 3185 } 3186 case AMDGPU::G_SREM: { 3187 DstRemReg = MI.getOperand(0).getReg(); 3188 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3189 break; 3190 } 3191 case AMDGPU::G_SDIVREM: { 3192 DstDivReg = MI.getOperand(0).getReg(); 3193 DstRemReg = MI.getOperand(1).getReg(); 3194 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3195 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3196 break; 3197 } 3198 } 3199 3200 if (Ty == S32) 3201 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3202 else 3203 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3204 3205 if (DstDivReg) { 3206 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3207 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3208 B.buildSub(DstDivReg, SignXor, Sign); 3209 } 3210 3211 if (DstRemReg) { 3212 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3213 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3214 B.buildSub(DstRemReg, SignXor, Sign); 3215 } 3216 3217 MI.eraseFromParent(); 3218 return true; 3219 } 3220 3221 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3222 MachineRegisterInfo &MRI, 3223 MachineIRBuilder &B) const { 3224 Register Res = MI.getOperand(0).getReg(); 3225 Register LHS = MI.getOperand(1).getReg(); 3226 Register RHS = MI.getOperand(2).getReg(); 3227 uint16_t Flags = MI.getFlags(); 3228 LLT ResTy = MRI.getType(Res); 3229 3230 const MachineFunction &MF = B.getMF(); 3231 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3232 MI.getFlag(MachineInstr::FmAfn); 3233 3234 if (!AllowInaccurateRcp) 3235 return false; 3236 3237 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3238 // 1 / x -> RCP(x) 3239 if (CLHS->isExactlyValue(1.0)) { 3240 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3241 .addUse(RHS) 3242 .setMIFlags(Flags); 3243 3244 MI.eraseFromParent(); 3245 return true; 3246 } 3247 3248 // -1 / x -> RCP( FNEG(x) ) 3249 if (CLHS->isExactlyValue(-1.0)) { 3250 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
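// Illustrative note, not from the original source: rcp(-x) == -(1.0 / x),
// and the explicit G_FNEG is expected to fold into a source modifier on
// the selected V_RCP instruction.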
3251 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3252 .addUse(FNeg.getReg(0)) 3253 .setMIFlags(Flags); 3254 3255 MI.eraseFromParent(); 3256 return true; 3257 } 3258 } 3259 3260 // x / y -> x * (1.0 / y) 3261 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3262 .addUse(RHS) 3263 .setMIFlags(Flags); 3264 B.buildFMul(Res, LHS, RCP, Flags); 3265 3266 MI.eraseFromParent(); 3267 return true; 3268 } 3269 3270 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3271 MachineRegisterInfo &MRI, 3272 MachineIRBuilder &B) const { 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register X = MI.getOperand(1).getReg(); 3275 Register Y = MI.getOperand(2).getReg(); 3276 uint16_t Flags = MI.getFlags(); 3277 LLT ResTy = MRI.getType(Res); 3278 3279 const MachineFunction &MF = B.getMF(); 3280 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3281 MI.getFlag(MachineInstr::FmAfn); 3282 3283 if (!AllowInaccurateRcp) 3284 return false; 3285 3286 auto NegY = B.buildFNeg(ResTy, Y); 3287 auto One = B.buildFConstant(ResTy, 1.0); 3288 3289 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3290 .addUse(Y) 3291 .setMIFlags(Flags); 3292 3293 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3294 R = B.buildFMA(ResTy, Tmp0, R, R); 3295 3296 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3297 R = B.buildFMA(ResTy, Tmp1, R, R); 3298 3299 auto Ret = B.buildFMul(ResTy, X, R); 3300 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3301 3302 B.buildFMA(Res, Tmp2, R, Ret); 3303 MI.eraseFromParent(); 3304 return true; 3305 } 3306 3307 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3308 MachineRegisterInfo &MRI, 3309 MachineIRBuilder &B) const { 3310 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3311 return true; 3312 3313 Register Res = MI.getOperand(0).getReg(); 3314 Register LHS = MI.getOperand(1).getReg(); 3315 Register RHS = MI.getOperand(2).getReg(); 3316 3317 uint16_t Flags = MI.getFlags(); 3318 3319 LLT S16 = LLT::scalar(16); 3320 LLT S32 = LLT::scalar(32); 3321 3322 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3323 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3324 3325 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3326 .addUse(RHSExt.getReg(0)) 3327 .setMIFlags(Flags); 3328 3329 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3330 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3331 3332 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3333 .addUse(RDst.getReg(0)) 3334 .addUse(RHS) 3335 .addUse(LHS) 3336 .setMIFlags(Flags); 3337 3338 MI.eraseFromParent(); 3339 return true; 3340 } 3341 3342 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3343 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3344 static void toggleSPDenormMode(bool Enable, 3345 MachineIRBuilder &B, 3346 const GCNSubtarget &ST, 3347 AMDGPU::SIModeRegisterDefaults Mode) { 3348 // Set SP denorm mode to this value. 3349 unsigned SPDenormMode = 3350 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3351 3352 if (ST.hasDenormModeInst()) { 3353 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3354 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3355 3356 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3357 B.buildInstr(AMDGPU::S_DENORM_MODE) 3358 .addImm(NewDenormModeValue); 3359 3360 } else { 3361 // Select FP32 bit field in mode register. 
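// Illustrative decoding, not from the original source: this encodes
// hwreg(HW_REG_MODE, 4, 2), i.e. a 2-bit field starting at bit 4 of the
// MODE register, which holds the FP32 denormal controls; WIDTH_M1 stores
// width - 1, hence the value 1 below.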
3362 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3363 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3364 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3365 3366 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3367 .addImm(SPDenormMode) 3368 .addImm(SPDenormModeBitField); 3369 } 3370 } 3371 3372 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3373 MachineRegisterInfo &MRI, 3374 MachineIRBuilder &B) const { 3375 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3376 return true; 3377 3378 Register Res = MI.getOperand(0).getReg(); 3379 Register LHS = MI.getOperand(1).getReg(); 3380 Register RHS = MI.getOperand(2).getReg(); 3381 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3382 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3383 3384 uint16_t Flags = MI.getFlags(); 3385 3386 LLT S32 = LLT::scalar(32); 3387 LLT S1 = LLT::scalar(1); 3388 3389 auto One = B.buildFConstant(S32, 1.0f); 3390 3391 auto DenominatorScaled = 3392 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3393 .addUse(LHS) 3394 .addUse(RHS) 3395 .addImm(0) 3396 .setMIFlags(Flags); 3397 auto NumeratorScaled = 3398 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3399 .addUse(LHS) 3400 .addUse(RHS) 3401 .addImm(1) 3402 .setMIFlags(Flags); 3403 3404 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3405 .addUse(DenominatorScaled.getReg(0)) 3406 .setMIFlags(Flags); 3407 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3408 3409 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3410 // aren't modeled as reading it. 3411 if (!Mode.allFP32Denormals()) 3412 toggleSPDenormMode(true, B, ST, Mode); 3413 3414 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3415 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3416 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3417 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3418 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3419 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3420 3421 if (!Mode.allFP32Denormals()) 3422 toggleSPDenormMode(false, B, ST, Mode); 3423 3424 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3425 .addUse(Fma4.getReg(0)) 3426 .addUse(Fma1.getReg(0)) 3427 .addUse(Fma3.getReg(0)) 3428 .addUse(NumeratorScaled.getReg(1)) 3429 .setMIFlags(Flags); 3430 3431 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3432 .addUse(Fmas.getReg(0)) 3433 .addUse(RHS) 3434 .addUse(LHS) 3435 .setMIFlags(Flags); 3436 3437 MI.eraseFromParent(); 3438 return true; 3439 } 3440 3441 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3442 MachineRegisterInfo &MRI, 3443 MachineIRBuilder &B) const { 3444 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3445 return true; 3446 3447 Register Res = MI.getOperand(0).getReg(); 3448 Register LHS = MI.getOperand(1).getReg(); 3449 Register RHS = MI.getOperand(2).getReg(); 3450 3451 uint16_t Flags = MI.getFlags(); 3452 3453 LLT S64 = LLT::scalar(64); 3454 LLT S1 = LLT::scalar(1); 3455 3456 auto One = B.buildFConstant(S64, 1.0); 3457 3458 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3459 .addUse(LHS) 3460 .addUse(RHS) 3461 .addImm(0) 3462 .setMIFlags(Flags); 3463 3464 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3465 3466 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3467 .addUse(DivScale0.getReg(0)) 3468 .setMIFlags(Flags); 3469 3470 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3471 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3472 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3473 3474 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3475 .addUse(LHS) 3476 .addUse(RHS) 3477 .addImm(1) 3478 .setMIFlags(Flags); 3479 3480 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3481 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3482 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3483 3484 Register Scale; 3485 if (!ST.hasUsableDivScaleConditionOutput()) { 3486 // Workaround a hardware bug on SI where the condition output from div_scale 3487 // is not usable. 3488 3489 LLT S32 = LLT::scalar(32); 3490 3491 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3492 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3493 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3494 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3495 3496 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3497 Scale1Unmerge.getReg(1)); 3498 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3499 Scale0Unmerge.getReg(1)); 3500 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3501 } else { 3502 Scale = DivScale1.getReg(1); 3503 } 3504 3505 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3506 .addUse(Fma4.getReg(0)) 3507 .addUse(Fma3.getReg(0)) 3508 .addUse(Mul.getReg(0)) 3509 .addUse(Scale) 3510 .setMIFlags(Flags); 3511 3512 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3513 .addUse(Fmas.getReg(0)) 3514 .addUse(RHS) 3515 .addUse(LHS) 3516 .setMIFlags(Flags); 3517 3518 MI.eraseFromParent(); 3519 return true; 3520 } 3521 3522 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3523 MachineRegisterInfo &MRI, 3524 MachineIRBuilder &B) const { 3525 Register Res = MI.getOperand(0).getReg(); 3526 Register LHS = MI.getOperand(2).getReg(); 3527 Register RHS = MI.getOperand(3).getReg(); 3528 uint16_t Flags = MI.getFlags(); 3529 3530 LLT S32 = LLT::scalar(32); 3531 LLT S1 = LLT::scalar(1); 3532 3533 auto Abs = B.buildFAbs(S32, RHS, Flags); 3534 const APFloat C0Val(1.0f); 3535 3536 auto C0 = B.buildConstant(S32, 0x6f800000); 3537 auto C1 = B.buildConstant(S32, 0x2f800000); 3538 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3539 3540 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3541 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3542 3543 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3544 3545 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3546 .addUse(Mul0.getReg(0)) 3547 .setMIFlags(Flags); 3548 3549 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3550 3551 B.buildFMul(Res, Sel, Mul1, Flags); 3552 3553 MI.eraseFromParent(); 3554 return true; 3555 } 3556 3557 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3558 // FIXME: Why do we handle this one but not other removed instructions? 3559 // 3560 // Reciprocal square root. The clamp prevents infinite results, clamping 3561 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3562 // +-max_float. 
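// In other words, the lowering below computes roughly
//   fmax(fmin(rsq(x), +max_float), -max_float)
// using either the IEEE or the non-IEEE min/max variants depending on the
// function's FP mode, since the plain rsq intrinsic has no clamping of its own.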
3563 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3564 MachineRegisterInfo &MRI, 3565 MachineIRBuilder &B) const { 3566 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3567 return true; 3568 3569 Register Dst = MI.getOperand(0).getReg(); 3570 Register Src = MI.getOperand(2).getReg(); 3571 auto Flags = MI.getFlags(); 3572 3573 LLT Ty = MRI.getType(Dst); 3574 3575 const fltSemantics *FltSemantics; 3576 if (Ty == LLT::scalar(32)) 3577 FltSemantics = &APFloat::IEEEsingle(); 3578 else if (Ty == LLT::scalar(64)) 3579 FltSemantics = &APFloat::IEEEdouble(); 3580 else 3581 return false; 3582 3583 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3584 .addUse(Src) 3585 .setMIFlags(Flags); 3586 3587 // We don't need to concern ourselves with the snan handling difference, since 3588 // the rsq quieted (or not) so use the one which will directly select. 3589 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3590 const bool UseIEEE = MFI->getMode().IEEE; 3591 3592 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3593 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3594 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3595 3596 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3597 3598 if (UseIEEE) 3599 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3600 else 3601 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3602 MI.eraseFromParent(); 3603 return true; 3604 } 3605 3606 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3607 switch (IID) { 3608 case Intrinsic::amdgcn_ds_fadd: 3609 return AMDGPU::G_ATOMICRMW_FADD; 3610 case Intrinsic::amdgcn_ds_fmin: 3611 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3612 case Intrinsic::amdgcn_ds_fmax: 3613 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3614 default: 3615 llvm_unreachable("not a DS FP intrinsic"); 3616 } 3617 } 3618 3619 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3620 MachineInstr &MI, 3621 Intrinsic::ID IID) const { 3622 GISelChangeObserver &Observer = Helper.Observer; 3623 Observer.changingInstr(MI); 3624 3625 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3626 3627 // The remaining operands were used to set fields in the MemOperand on 3628 // construction. 3629 for (int I = 6; I > 3; --I) 3630 MI.RemoveOperand(I); 3631 3632 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3633 Observer.changedInstr(MI); 3634 return true; 3635 } 3636 3637 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3638 MachineRegisterInfo &MRI, 3639 MachineIRBuilder &B) const { 3640 uint64_t Offset = 3641 ST.getTargetLowering()->getImplicitParameterOffset( 3642 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3643 LLT DstTy = MRI.getType(DstReg); 3644 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3645 3646 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3647 if (!loadInputValue(KernargPtrReg, B, 3648 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3649 return false; 3650 3651 // FIXME: This should be nuw 3652 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3653 return true; 3654 } 3655 3656 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3657 MachineRegisterInfo &MRI, 3658 MachineIRBuilder &B) const { 3659 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3660 if (!MFI->isEntryFunction()) { 3661 return legalizePreloadedArgIntrin(MI, MRI, B, 3662 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3663 } 3664 3665 Register DstReg = MI.getOperand(0).getReg(); 3666 if (!getImplicitArgPtr(DstReg, MRI, B)) 3667 return false; 3668 3669 MI.eraseFromParent(); 3670 return true; 3671 } 3672 3673 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3674 MachineRegisterInfo &MRI, 3675 MachineIRBuilder &B, 3676 unsigned AddrSpace) const { 3677 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3678 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3679 Register Hi32 = Unmerge.getReg(1); 3680 3681 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3682 MI.eraseFromParent(); 3683 return true; 3684 } 3685 3686 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3687 // offset (the offset that is included in bounds checking and swizzling, to be 3688 // split between the instruction's voffset and immoffset fields) and soffset 3689 // (the offset that is excluded from bounds checking and swizzling, to go in 3690 // the instruction's soffset field). This function takes the first kind of 3691 // offset and figures out how to split it between voffset and immoffset. 3692 std::pair<Register, unsigned> 3693 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3694 Register OrigOffset) const { 3695 const unsigned MaxImm = 4095; 3696 Register BaseReg; 3697 unsigned ImmOffset; 3698 const LLT S32 = LLT::scalar(32); 3699 MachineRegisterInfo &MRI = *B.getMRI(); 3700 3701 std::tie(BaseReg, ImmOffset) = 3702 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3703 3704 // If BaseReg is a pointer, convert it to int. 3705 if (MRI.getType(BaseReg).isPointer()) 3706 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3707 3708 // If the immediate value is too big for the immoffset field, put the value 3709 // and -4096 into the immoffset field so that the value that is copied/added 3710 // for the voffset field is a multiple of 4096, and it stands more chance 3711 // of being CSEd with the copy/add for another similar load/store. 3712 // However, do not do that rounding down to a multiple of 4096 if that is a 3713 // negative number, as it appears to be illegal to have a negative offset 3714 // in the vgpr, even if adding the immediate offset makes it positive. 
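  //
  // Illustrative examples of the split below: a constant offset of 4660 yields
  // Overflow = 4096 (folded into the voffset add) and ImmOffset = 564, while a
  // negative constant offset (the case described above) is pushed entirely into
  // the voffset add, leaving ImmOffset = 0.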
3715 unsigned Overflow = ImmOffset & ~MaxImm; 3716 ImmOffset -= Overflow; 3717 if ((int32_t)Overflow < 0) { 3718 Overflow += ImmOffset; 3719 ImmOffset = 0; 3720 } 3721 3722 if (Overflow != 0) { 3723 if (!BaseReg) { 3724 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3725 } else { 3726 auto OverflowVal = B.buildConstant(S32, Overflow); 3727 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3728 } 3729 } 3730 3731 if (!BaseReg) 3732 BaseReg = B.buildConstant(S32, 0).getReg(0); 3733 3734 return std::make_pair(BaseReg, ImmOffset); 3735 } 3736 3737 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3738 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3739 Register VOffset, Register SOffset, 3740 unsigned ImmOffset, Register VIndex, 3741 MachineRegisterInfo &MRI) const { 3742 Optional<ValueAndVReg> MaybeVOffsetVal = 3743 getIConstantVRegValWithLookThrough(VOffset, MRI); 3744 Optional<ValueAndVReg> MaybeSOffsetVal = 3745 getIConstantVRegValWithLookThrough(SOffset, MRI); 3746 Optional<ValueAndVReg> MaybeVIndexVal = 3747 getIConstantVRegValWithLookThrough(VIndex, MRI); 3748 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3749 // update the MMO with that offset. The stride is unknown so we can only do 3750 // this if VIndex is constant 0. 3751 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3752 MaybeVIndexVal->Value == 0) { 3753 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3754 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3755 MMO->setOffset(TotalOffset); 3756 } else { 3757 // We don't have a constant combined offset to use in the MMO. Give up. 3758 MMO->setValue((Value *)nullptr); 3759 } 3760 } 3761 3762 /// Handle register layout difference for f16 images for some subtargets. 
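/// Roughly: on subtargets with unpacked d16 memory instructions each s16
/// element is any-extended into its own s32 register, while packed subtargets
/// keep <N x s16> data but may need undef padding (e.g. <3 x s16> widened to
/// <4 x s16>) or the image-store d16 bug workaround handled below.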
3763 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3764 MachineRegisterInfo &MRI, 3765 Register Reg, 3766 bool ImageStore) const { 3767 const LLT S16 = LLT::scalar(16); 3768 const LLT S32 = LLT::scalar(32); 3769 LLT StoreVT = MRI.getType(Reg); 3770 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3771 3772 if (ST.hasUnpackedD16VMem()) { 3773 auto Unmerge = B.buildUnmerge(S16, Reg); 3774 3775 SmallVector<Register, 4> WideRegs; 3776 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3777 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3778 3779 int NumElts = StoreVT.getNumElements(); 3780 3781 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3782 .getReg(0); 3783 } 3784 3785 if (ImageStore && ST.hasImageStoreD16Bug()) { 3786 if (StoreVT.getNumElements() == 2) { 3787 SmallVector<Register, 4> PackedRegs; 3788 Reg = B.buildBitcast(S32, Reg).getReg(0); 3789 PackedRegs.push_back(Reg); 3790 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3791 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3792 .getReg(0); 3793 } 3794 3795 if (StoreVT.getNumElements() == 3) { 3796 SmallVector<Register, 4> PackedRegs; 3797 auto Unmerge = B.buildUnmerge(S16, Reg); 3798 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3799 PackedRegs.push_back(Unmerge.getReg(I)); 3800 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3801 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3802 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3803 } 3804 3805 if (StoreVT.getNumElements() == 4) { 3806 SmallVector<Register, 4> PackedRegs; 3807 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3808 auto Unmerge = B.buildUnmerge(S32, Reg); 3809 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3810 PackedRegs.push_back(Unmerge.getReg(I)); 3811 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3812 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3813 .getReg(0); 3814 } 3815 3816 llvm_unreachable("invalid data type"); 3817 } 3818 3819 if (StoreVT == LLT::fixed_vector(3, S16)) { 3820 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3821 .getReg(0); 3822 } 3823 return Reg; 3824 } 3825 3826 Register AMDGPULegalizerInfo::fixStoreSourceType( 3827 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3828 MachineRegisterInfo *MRI = B.getMRI(); 3829 LLT Ty = MRI->getType(VData); 3830 3831 const LLT S16 = LLT::scalar(16); 3832 3833 // Fixup illegal register types for i8 stores. 
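  // Sub-dword scalar sources (s8/s16) are any-extended to s32 below; short s16
  // vector sources are only repacked through handleD16VData for format stores.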
3834 if (Ty == LLT::scalar(8) || Ty == S16) { 3835 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3836 return AnyExt; 3837 } 3838 3839 if (Ty.isVector()) { 3840 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3841 if (IsFormat) 3842 return handleD16VData(B, *MRI, VData); 3843 } 3844 } 3845 3846 return VData; 3847 } 3848 3849 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3850 MachineRegisterInfo &MRI, 3851 MachineIRBuilder &B, 3852 bool IsTyped, 3853 bool IsFormat) const { 3854 Register VData = MI.getOperand(1).getReg(); 3855 LLT Ty = MRI.getType(VData); 3856 LLT EltTy = Ty.getScalarType(); 3857 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3858 const LLT S32 = LLT::scalar(32); 3859 3860 VData = fixStoreSourceType(B, VData, IsFormat); 3861 Register RSrc = MI.getOperand(2).getReg(); 3862 3863 MachineMemOperand *MMO = *MI.memoperands_begin(); 3864 const int MemSize = MMO->getSize(); 3865 3866 unsigned ImmOffset; 3867 3868 // The typed intrinsics add an immediate after the registers. 3869 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3870 3871 // The struct intrinsic variants add one additional operand over raw. 3872 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3873 Register VIndex; 3874 int OpOffset = 0; 3875 if (HasVIndex) { 3876 VIndex = MI.getOperand(3).getReg(); 3877 OpOffset = 1; 3878 } else { 3879 VIndex = B.buildConstant(S32, 0).getReg(0); 3880 } 3881 3882 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3883 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3884 3885 unsigned Format = 0; 3886 if (IsTyped) { 3887 Format = MI.getOperand(5 + OpOffset).getImm(); 3888 ++OpOffset; 3889 } 3890 3891 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3892 3893 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3894 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3895 3896 unsigned Opc; 3897 if (IsTyped) { 3898 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3899 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3900 } else if (IsFormat) { 3901 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3902 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3903 } else { 3904 switch (MemSize) { 3905 case 1: 3906 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3907 break; 3908 case 2: 3909 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3910 break; 3911 default: 3912 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3913 break; 3914 } 3915 } 3916 3917 auto MIB = B.buildInstr(Opc) 3918 .addUse(VData) // vdata 3919 .addUse(RSrc) // rsrc 3920 .addUse(VIndex) // vindex 3921 .addUse(VOffset) // voffset 3922 .addUse(SOffset) // soffset 3923 .addImm(ImmOffset); // offset(imm) 3924 3925 if (IsTyped) 3926 MIB.addImm(Format); 3927 3928 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3929 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3930 .addMemOperand(MMO); 3931 3932 MI.eraseFromParent(); 3933 return true; 3934 } 3935 3936 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3937 MachineRegisterInfo &MRI, 3938 MachineIRBuilder &B, 3939 bool IsFormat, 3940 bool IsTyped) const { 3941 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3942 MachineMemOperand *MMO = *MI.memoperands_begin(); 3943 const LLT MemTy = MMO->getMemoryType(); 3944 const LLT S32 = LLT::scalar(32); 3945 3946 Register Dst = MI.getOperand(0).getReg(); 3947 Register RSrc = MI.getOperand(2).getReg(); 3948 3949 // The typed intrinsics add an immediate after the registers. 
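  // Expected source operand layout at this point (after the result and the
  // intrinsic ID): rsrc, [vindex for the struct variants], voffset, soffset,
  // [format for the typed variants], aux/cachepolicy.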
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back down to
    // the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4058 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4059 B.buildInstr(Opc) 4060 .addDef(MI.getOperand(0).getReg()) 4061 .addUse(MI.getOperand(2).getReg()) 4062 .addUse(MI.getOperand(3).getReg()) 4063 .cloneMemRefs(MI); 4064 MI.eraseFromParent(); 4065 return true; 4066 } 4067 4068 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4069 switch (IntrID) { 4070 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4071 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4073 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4074 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4075 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4076 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4077 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4078 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4079 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4080 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4081 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4082 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4083 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4085 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4086 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4088 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4089 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4091 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4092 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4094 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4095 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4097 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4098 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4100 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4101 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4102 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4103 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4104 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4105 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4106 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4107 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4108 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4109 case Intrinsic::amdgcn_buffer_atomic_fadd: 4110 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4111 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4112 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4113 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4114 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4115 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4116 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4117 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4118 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4119 default: 4120 llvm_unreachable("unhandled atomic opcode"); 4121 } 4122 } 4123 4124 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4125 MachineIRBuilder &B, 4126 Intrinsic::ID IID) const { 4127 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4128 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4129 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4130 4131 Register Dst; 4132 4133 int OpOffset = 0; 4134 if (HasReturn) { 4135 // A few FP atomics do not support return values. 
4136 Dst = MI.getOperand(0).getReg(); 4137 } else { 4138 OpOffset = -1; 4139 } 4140 4141 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4142 Register CmpVal; 4143 4144 if (IsCmpSwap) { 4145 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4146 ++OpOffset; 4147 } 4148 4149 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4150 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4151 4152 // The struct intrinsic variants add one additional operand over raw. 4153 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4154 Register VIndex; 4155 if (HasVIndex) { 4156 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4157 ++OpOffset; 4158 } else { 4159 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4160 } 4161 4162 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4163 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4164 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4165 4166 MachineMemOperand *MMO = *MI.memoperands_begin(); 4167 4168 unsigned ImmOffset; 4169 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4170 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4171 4172 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4173 4174 if (HasReturn) 4175 MIB.addDef(Dst); 4176 4177 MIB.addUse(VData); // vdata 4178 4179 if (IsCmpSwap) 4180 MIB.addReg(CmpVal); 4181 4182 MIB.addUse(RSrc) // rsrc 4183 .addUse(VIndex) // vindex 4184 .addUse(VOffset) // voffset 4185 .addUse(SOffset) // soffset 4186 .addImm(ImmOffset) // offset(imm) 4187 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4188 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4189 .addMemOperand(MMO); 4190 4191 MI.eraseFromParent(); 4192 return true; 4193 } 4194 4195 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4196 /// vector with s16 typed elements. 4197 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4198 SmallVectorImpl<Register> &PackedAddrs, 4199 unsigned ArgOffset, 4200 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4201 bool IsA16, bool IsG16) { 4202 const LLT S16 = LLT::scalar(16); 4203 const LLT V2S16 = LLT::fixed_vector(2, 16); 4204 auto EndIdx = Intr->VAddrEnd; 4205 4206 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4207 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4208 if (!SrcOp.isReg()) 4209 continue; // _L to _LZ may have eliminated this. 4210 4211 Register AddrReg = SrcOp.getReg(); 4212 4213 if ((I < Intr->GradientStart) || 4214 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4215 (I >= Intr->CoordStart && !IsA16)) { 4216 // Handle any gradient or coordinate operands that should not be packed 4217 if ((I < Intr->GradientStart) && IsA16 && 4218 (B.getMRI()->getType(AddrReg) == S16)) { 4219 // Special handling of bias when A16 is on. Bias is of type half but 4220 // occupies full 32-bit. 4221 PackedAddrs.push_back( 4222 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4223 .getReg(0)); 4224 } else { 4225 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4226 PackedAddrs.push_back(AddrReg); 4227 } 4228 } else { 4229 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4230 // derivatives dx/dh and dx/dv are packed with undef. 
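      // That is: pack this operand together with the following one into a
      // v2s16, unless this is the last address operand, the trailing element
      // of an odd-length gradient row, or the next operand is no longer a
      // register (the _L to _LZ case noted above).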
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4314 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4315 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4316 4317 MachineRegisterInfo *MRI = B.getMRI(); 4318 const LLT S32 = LLT::scalar(32); 4319 const LLT S16 = LLT::scalar(16); 4320 const LLT V2S16 = LLT::fixed_vector(2, 16); 4321 4322 unsigned DMask = 0; 4323 4324 // Check for 16 bit addresses and pack if true. 4325 LLT GradTy = 4326 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4327 LLT AddrTy = 4328 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4329 const bool IsG16 = GradTy == S16; 4330 const bool IsA16 = AddrTy == S16; 4331 4332 int DMaskLanes = 0; 4333 if (!BaseOpcode->Atomic) { 4334 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4335 if (BaseOpcode->Gather4) { 4336 DMaskLanes = 4; 4337 } else if (DMask != 0) { 4338 DMaskLanes = countPopulation(DMask); 4339 } else if (!IsTFE && !BaseOpcode->Store) { 4340 // If dmask is 0, this is a no-op load. This can be eliminated. 4341 B.buildUndef(MI.getOperand(0)); 4342 MI.eraseFromParent(); 4343 return true; 4344 } 4345 } 4346 4347 Observer.changingInstr(MI); 4348 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4349 4350 unsigned NewOpcode = NumDefs == 0 ? 4351 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4352 4353 // Track that we legalized this 4354 MI.setDesc(B.getTII().get(NewOpcode)); 4355 4356 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4357 // dmask to be at least 1 otherwise the instruction will fail 4358 if (IsTFE && DMask == 0) { 4359 DMask = 0x1; 4360 DMaskLanes = 1; 4361 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4362 } 4363 4364 if (BaseOpcode->Atomic) { 4365 Register VData0 = MI.getOperand(2).getReg(); 4366 LLT Ty = MRI->getType(VData0); 4367 4368 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4369 if (Ty.isVector()) 4370 return false; 4371 4372 if (BaseOpcode->AtomicX2) { 4373 Register VData1 = MI.getOperand(3).getReg(); 4374 // The two values are packed in one register. 4375 LLT PackedTy = LLT::fixed_vector(2, Ty); 4376 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4377 MI.getOperand(2).setReg(Concat.getReg(0)); 4378 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4379 } 4380 } 4381 4382 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4383 4384 // Optimize _L to _LZ when _L is zero 4385 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4386 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4387 const ConstantFP *ConstantLod; 4388 4389 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4390 m_GFCst(ConstantLod))) { 4391 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4392 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4393 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4394 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4395 Intr->Dim); 4396 4397 // The starting indexes should remain in the same place. 
4398 --CorrectedNumVAddrs; 4399 4400 MI.getOperand(MI.getNumExplicitDefs()) 4401 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4402 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4403 Intr = NewImageDimIntr; 4404 } 4405 } 4406 } 4407 4408 // Optimize _mip away, when 'lod' is zero 4409 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4410 int64_t ConstantLod; 4411 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4412 m_ICst(ConstantLod))) { 4413 if (ConstantLod == 0) { 4414 // TODO: Change intrinsic opcode and remove operand instead or replacing 4415 // it with 0, as the _L to _LZ handling is done above. 4416 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4417 --CorrectedNumVAddrs; 4418 } 4419 } 4420 } 4421 4422 // Rewrite the addressing register layout before doing anything else. 4423 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4424 // 16 bit gradients are supported, but are tied to the A16 control 4425 // so both gradients and addresses must be 16 bit 4426 return false; 4427 } 4428 4429 if (IsA16 && !ST.hasA16()) { 4430 // A16 not supported 4431 return false; 4432 } 4433 4434 if (IsA16 || IsG16) { 4435 if (Intr->NumVAddrs > 1) { 4436 SmallVector<Register, 4> PackedRegs; 4437 4438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4439 IsG16); 4440 4441 // See also below in the non-a16 branch 4442 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4443 PackedRegs.size() <= ST.getNSAMaxSize(); 4444 4445 if (!UseNSA && PackedRegs.size() > 1) { 4446 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4447 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4448 PackedRegs[0] = Concat.getReg(0); 4449 PackedRegs.resize(1); 4450 } 4451 4452 const unsigned NumPacked = PackedRegs.size(); 4453 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4454 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4455 if (!SrcOp.isReg()) { 4456 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4457 continue; 4458 } 4459 4460 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4461 4462 if (I - Intr->VAddrStart < NumPacked) 4463 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4464 else 4465 SrcOp.setReg(AMDGPU::NoRegister); 4466 } 4467 } 4468 } else { 4469 // If the register allocator cannot place the address registers contiguously 4470 // without introducing moves, then using the non-sequential address encoding 4471 // is always preferable, since it saves VALU instructions and is usually a 4472 // wash in terms of code size or even better. 4473 // 4474 // However, we currently have no way of hinting to the register allocator 4475 // that MIMG addresses should be placed contiguously when it is possible to 4476 // do so, so force non-NSA for the common 2-address case as a heuristic. 4477 // 4478 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4479 // allocation when possible. 4480 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4481 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4482 4483 if (!UseNSA && Intr->NumVAddrs > 1) 4484 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4485 Intr->NumVAddrs); 4486 } 4487 4488 int Flags = 0; 4489 if (IsA16) 4490 Flags |= 1; 4491 if (IsG16) 4492 Flags |= 2; 4493 MI.addOperand(MachineOperand::CreateImm(Flags)); 4494 4495 if (BaseOpcode->Store) { // No TFE for stores? 
4496 // TODO: Handle dmask trim 4497 Register VData = MI.getOperand(1).getReg(); 4498 LLT Ty = MRI->getType(VData); 4499 if (!Ty.isVector() || Ty.getElementType() != S16) 4500 return true; 4501 4502 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4503 if (RepackedReg != VData) { 4504 MI.getOperand(1).setReg(RepackedReg); 4505 } 4506 4507 return true; 4508 } 4509 4510 Register DstReg = MI.getOperand(0).getReg(); 4511 LLT Ty = MRI->getType(DstReg); 4512 const LLT EltTy = Ty.getScalarType(); 4513 const bool IsD16 = Ty.getScalarType() == S16; 4514 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4515 4516 // Confirm that the return type is large enough for the dmask specified 4517 if (NumElts < DMaskLanes) 4518 return false; 4519 4520 if (NumElts > 4 || DMaskLanes > 4) 4521 return false; 4522 4523 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4524 const LLT AdjustedTy = 4525 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4526 4527 // The raw dword aligned data component of the load. The only legal cases 4528 // where this matters should be when using the packed D16 format, for 4529 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4530 LLT RoundedTy; 4531 4532 // S32 vector to to cover all data, plus TFE result element. 4533 LLT TFETy; 4534 4535 // Register type to use for each loaded component. Will be S32 or V2S16. 4536 LLT RegTy; 4537 4538 if (IsD16 && ST.hasUnpackedD16VMem()) { 4539 RoundedTy = 4540 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4541 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4542 RegTy = S32; 4543 } else { 4544 unsigned EltSize = EltTy.getSizeInBits(); 4545 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4546 unsigned RoundedSize = 32 * RoundedElts; 4547 RoundedTy = LLT::scalarOrVector( 4548 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4549 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4550 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4551 } 4552 4553 // The return type does not need adjustment. 4554 // TODO: Should we change s16 case to s32 or <2 x s16>? 4555 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4556 return true; 4557 4558 Register Dst1Reg; 4559 4560 // Insert after the instruction. 4561 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4562 4563 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4564 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4565 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4566 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4567 4568 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4569 4570 MI.getOperand(0).setReg(NewResultReg); 4571 4572 // In the IR, TFE is supposed to be used with a 2 element struct return 4573 // type. The instruction really returns these two values in one contiguous 4574 // register, with one additional dword beyond the loaded data. Rewrite the 4575 // return type to use a single register result. 4576 4577 if (IsTFE) { 4578 Dst1Reg = MI.getOperand(1).getReg(); 4579 if (MRI->getType(Dst1Reg) != S32) 4580 return false; 4581 4582 // TODO: Make sure the TFE operand bit is set. 4583 MI.RemoveOperand(1); 4584 4585 // Handle the easy case that requires no repack instructions. 4586 if (Ty == S32) { 4587 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4588 return true; 4589 } 4590 } 4591 4592 // Now figure out how to copy the new result register back into the old 4593 // result. 
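  //
  // The general scheme: unmerge NewResultReg into dword-sized pieces (s32 or
  // v2s16, depending on the packing), drop the trailing TFE dword if present,
  // then truncate/bitcast the pieces and rebuild a value of the original
  // result type, padding with undef for any dmask-eliminated elements.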
4594 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4595 4596 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4597 4598 if (ResultNumRegs == 1) { 4599 assert(!IsTFE); 4600 ResultRegs[0] = NewResultReg; 4601 } else { 4602 // We have to repack into a new vector of some kind. 4603 for (int I = 0; I != NumDataRegs; ++I) 4604 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4605 B.buildUnmerge(ResultRegs, NewResultReg); 4606 4607 // Drop the final TFE element to get the data part. The TFE result is 4608 // directly written to the right place already. 4609 if (IsTFE) 4610 ResultRegs.resize(NumDataRegs); 4611 } 4612 4613 // For an s16 scalar result, we form an s32 result with a truncate regardless 4614 // of packed vs. unpacked. 4615 if (IsD16 && !Ty.isVector()) { 4616 B.buildTrunc(DstReg, ResultRegs[0]); 4617 return true; 4618 } 4619 4620 // Avoid a build/concat_vector of 1 entry. 4621 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4622 B.buildBitcast(DstReg, ResultRegs[0]); 4623 return true; 4624 } 4625 4626 assert(Ty.isVector()); 4627 4628 if (IsD16) { 4629 // For packed D16 results with TFE enabled, all the data components are 4630 // S32. Cast back to the expected type. 4631 // 4632 // TODO: We don't really need to use load s32 elements. We would only need one 4633 // cast for the TFE result if a multiple of v2s16 was used. 4634 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4635 for (Register &Reg : ResultRegs) 4636 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4637 } else if (ST.hasUnpackedD16VMem()) { 4638 for (Register &Reg : ResultRegs) 4639 Reg = B.buildTrunc(S16, Reg).getReg(0); 4640 } 4641 } 4642 4643 auto padWithUndef = [&](LLT Ty, int NumElts) { 4644 if (NumElts == 0) 4645 return; 4646 Register Undef = B.buildUndef(Ty).getReg(0); 4647 for (int I = 0; I != NumElts; ++I) 4648 ResultRegs.push_back(Undef); 4649 }; 4650 4651 // Pad out any elements eliminated due to the dmask. 4652 LLT ResTy = MRI->getType(ResultRegs[0]); 4653 if (!ResTy.isVector()) { 4654 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4655 B.buildBuildVector(DstReg, ResultRegs); 4656 return true; 4657 } 4658 4659 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4660 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4661 4662 // Deal with the one annoying legal case. 
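  // Namely a <3 x s16> result: rebuild a wider value from the v2s16 pieces and
  // then trim or pad it to the exact element count DstReg expects.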
4663 const LLT V3S16 = LLT::fixed_vector(3, 16); 4664 if (Ty == V3S16) { 4665 if (IsTFE) { 4666 if (ResultRegs.size() == 1) { 4667 NewResultReg = ResultRegs[0]; 4668 } else if (ResultRegs.size() == 2) { 4669 LLT V4S16 = LLT::fixed_vector(4, 16); 4670 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4671 } else { 4672 return false; 4673 } 4674 } 4675 4676 if (MRI->getType(DstReg).getNumElements() < 4677 MRI->getType(NewResultReg).getNumElements()) { 4678 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4679 } else { 4680 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4681 } 4682 return true; 4683 } 4684 4685 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4686 B.buildConcatVectors(DstReg, ResultRegs); 4687 return true; 4688 } 4689 4690 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4691 LegalizerHelper &Helper, MachineInstr &MI) const { 4692 MachineIRBuilder &B = Helper.MIRBuilder; 4693 GISelChangeObserver &Observer = Helper.Observer; 4694 4695 Register Dst = MI.getOperand(0).getReg(); 4696 LLT Ty = B.getMRI()->getType(Dst); 4697 unsigned Size = Ty.getSizeInBits(); 4698 MachineFunction &MF = B.getMF(); 4699 4700 Observer.changingInstr(MI); 4701 4702 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4703 Ty = getBitcastRegisterType(Ty); 4704 Helper.bitcastDst(MI, Ty, 0); 4705 Dst = MI.getOperand(0).getReg(); 4706 B.setInsertPt(B.getMBB(), MI); 4707 } 4708 4709 // FIXME: We don't really need this intermediate instruction. The intrinsic 4710 // should be fixed to have a memory operand. Since it's readnone, we're not 4711 // allowed to add one. 4712 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4713 MI.RemoveOperand(1); // Remove intrinsic ID 4714 4715 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4716 // TODO: Should this use datalayout alignment? 4717 const unsigned MemSize = (Size + 7) / 8; 4718 const Align MemAlign(4); 4719 MachineMemOperand *MMO = MF.getMachineMemOperand( 4720 MachinePointerInfo(), 4721 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4722 MachineMemOperand::MOInvariant, 4723 MemSize, MemAlign); 4724 MI.addMemOperand(MF, MMO); 4725 4726 // There are no 96-bit result scalar loads, but widening to 128-bit should 4727 // always be legal. We may need to restore this to a 96-bit result if it turns 4728 // out this needs to be converted to a vector load during RegBankSelect. 4729 if (!isPowerOf2_32(Size)) { 4730 if (Ty.isVector()) 4731 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4732 else 4733 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4734 } 4735 4736 Observer.changedInstr(MI); 4737 return true; 4738 } 4739 4740 // TODO: Move to selection 4741 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4742 MachineRegisterInfo &MRI, 4743 MachineIRBuilder &B) const { 4744 if (!ST.isTrapHandlerEnabled() || 4745 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4746 return legalizeTrapEndpgm(MI, MRI, B); 4747 4748 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4749 switch (*HsaAbiVer) { 4750 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4751 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4752 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4753 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4754 return ST.supportsGetDoorbellID() ? 
4755 legalizeTrapHsa(MI, MRI, B) : 4756 legalizeTrapHsaQueuePtr(MI, MRI, B); 4757 } 4758 } 4759 4760 llvm_unreachable("Unknown trap handler"); 4761 } 4762 4763 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4764 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4765 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4766 MI.eraseFromParent(); 4767 return true; 4768 } 4769 4770 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4771 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4772 // Pass queue pointer to trap handler as input, and insert trap instruction 4773 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4774 Register LiveIn = 4775 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4776 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4777 return false; 4778 4779 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4780 B.buildCopy(SGPR01, LiveIn); 4781 B.buildInstr(AMDGPU::S_TRAP) 4782 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4783 .addReg(SGPR01, RegState::Implicit); 4784 4785 MI.eraseFromParent(); 4786 return true; 4787 } 4788 4789 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4790 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4791 B.buildInstr(AMDGPU::S_TRAP) 4792 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4793 MI.eraseFromParent(); 4794 return true; 4795 } 4796 4797 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4798 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4799 // Is non-HSA path or trap-handler disabled? Then, report a warning 4800 // accordingly 4801 if (!ST.isTrapHandlerEnabled() || 4802 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4803 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4804 "debugtrap handler not supported", 4805 MI.getDebugLoc(), DS_Warning); 4806 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4807 Ctx.diagnose(NoTrap); 4808 } else { 4809 // Insert debug-trap instruction 4810 B.buildInstr(AMDGPU::S_TRAP) 4811 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4812 } 4813 4814 MI.eraseFromParent(); 4815 return true; 4816 } 4817 4818 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4819 MachineIRBuilder &B) const { 4820 MachineRegisterInfo &MRI = *B.getMRI(); 4821 const LLT S16 = LLT::scalar(16); 4822 const LLT S32 = LLT::scalar(32); 4823 4824 Register DstReg = MI.getOperand(0).getReg(); 4825 Register NodePtr = MI.getOperand(2).getReg(); 4826 Register RayExtent = MI.getOperand(3).getReg(); 4827 Register RayOrigin = MI.getOperand(4).getReg(); 4828 Register RayDir = MI.getOperand(5).getReg(); 4829 Register RayInvDir = MI.getOperand(6).getReg(); 4830 Register TDescr = MI.getOperand(7).getReg(); 4831 4832 if (!ST.hasGFX10_AEncoding()) { 4833 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4834 "intrinsic not supported on subtarget", 4835 MI.getDebugLoc()); 4836 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4837 return false; 4838 } 4839 4840 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4841 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4842 const unsigned NumVDataDwords = 4; 4843 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4844 const bool UseNSA = 4845 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4846 const unsigned BaseOpcodes[2][2] = { 4847 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4848 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4849 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4850 int Opcode; 4851 if (UseNSA) { 4852 Opcode = 4853 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4854 NumVDataDwords, NumVAddrDwords); 4855 } else { 4856 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4857 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4858 PowerOf2Ceil(NumVAddrDwords)); 4859 } 4860 assert(Opcode != -1); 4861 4862 SmallVector<Register, 12> Ops; 4863 if (Is64) { 4864 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4865 Ops.push_back(Unmerge.getReg(0)); 4866 Ops.push_back(Unmerge.getReg(1)); 4867 } else { 4868 Ops.push_back(NodePtr); 4869 } 4870 Ops.push_back(RayExtent); 4871 4872 auto packLanes = [&Ops, &S32, &B](Register Src) { 4873 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4874 Ops.push_back(Unmerge.getReg(0)); 4875 Ops.push_back(Unmerge.getReg(1)); 4876 Ops.push_back(Unmerge.getReg(2)); 4877 }; 4878 4879 packLanes(RayOrigin); 4880 if (IsA16) { 4881 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4882 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4883 Register R1 = MRI.createGenericVirtualRegister(S32); 4884 Register R2 = MRI.createGenericVirtualRegister(S32); 4885 Register R3 = MRI.createGenericVirtualRegister(S32); 4886 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4887 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4888 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4889 Ops.push_back(R1); 4890 Ops.push_back(R2); 4891 Ops.push_back(R3); 4892 } else { 4893 packLanes(RayDir); 4894 packLanes(RayInvDir); 4895 } 4896 4897 if (!UseNSA) { 4898 // Build a single vector containing all the operands so far prepared. 4899 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4900 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4901 Ops.clear(); 4902 Ops.push_back(MergedOps); 4903 } 4904 4905 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4906 .addDef(DstReg) 4907 .addImm(Opcode); 4908 4909 for (Register R : Ops) { 4910 MIB.addUse(R); 4911 } 4912 4913 MIB.addUse(TDescr) 4914 .addImm(IsA16 ? 1 : 0) 4915 .cloneMemRefs(MI); 4916 4917 MI.eraseFromParent(); 4918 return true; 4919 } 4920 4921 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4922 MachineInstr &MI) const { 4923 MachineIRBuilder &B = Helper.MIRBuilder; 4924 MachineRegisterInfo &MRI = *B.getMRI(); 4925 4926 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
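  //
  // The IRTranslator emits a G_BRCOND on the boolean result of these
  // intrinsics; verifyCFIntrinsic locates that conditional branch (and any
  // trailing unconditional branch) so the pair can be rewritten into
  // SI_IF / SI_ELSE / SI_LOOP with explicit block operands.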
4927 auto IntrID = MI.getIntrinsicID(); 4928 switch (IntrID) { 4929 case Intrinsic::amdgcn_if: 4930 case Intrinsic::amdgcn_else: { 4931 MachineInstr *Br = nullptr; 4932 MachineBasicBlock *UncondBrTarget = nullptr; 4933 bool Negated = false; 4934 if (MachineInstr *BrCond = 4935 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4936 const SIRegisterInfo *TRI 4937 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4938 4939 Register Def = MI.getOperand(1).getReg(); 4940 Register Use = MI.getOperand(3).getReg(); 4941 4942 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4943 4944 if (Negated) 4945 std::swap(CondBrTarget, UncondBrTarget); 4946 4947 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4948 if (IntrID == Intrinsic::amdgcn_if) { 4949 B.buildInstr(AMDGPU::SI_IF) 4950 .addDef(Def) 4951 .addUse(Use) 4952 .addMBB(UncondBrTarget); 4953 } else { 4954 B.buildInstr(AMDGPU::SI_ELSE) 4955 .addDef(Def) 4956 .addUse(Use) 4957 .addMBB(UncondBrTarget); 4958 } 4959 4960 if (Br) { 4961 Br->getOperand(0).setMBB(CondBrTarget); 4962 } else { 4963 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4964 // since we're swapping branch targets it needs to be reinserted. 4965 // FIXME: IRTranslator should probably not do this 4966 B.buildBr(*CondBrTarget); 4967 } 4968 4969 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4970 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4971 MI.eraseFromParent(); 4972 BrCond->eraseFromParent(); 4973 return true; 4974 } 4975 4976 return false; 4977 } 4978 case Intrinsic::amdgcn_loop: { 4979 MachineInstr *Br = nullptr; 4980 MachineBasicBlock *UncondBrTarget = nullptr; 4981 bool Negated = false; 4982 if (MachineInstr *BrCond = 4983 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4984 const SIRegisterInfo *TRI 4985 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4986 4987 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4988 Register Reg = MI.getOperand(2).getReg(); 4989 4990 if (Negated) 4991 std::swap(CondBrTarget, UncondBrTarget); 4992 4993 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4994 B.buildInstr(AMDGPU::SI_LOOP) 4995 .addUse(Reg) 4996 .addMBB(UncondBrTarget); 4997 4998 if (Br) 4999 Br->getOperand(0).setMBB(CondBrTarget); 5000 else 5001 B.buildBr(*CondBrTarget); 5002 5003 MI.eraseFromParent(); 5004 BrCond->eraseFromParent(); 5005 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5006 return true; 5007 } 5008 5009 return false; 5010 } 5011 case Intrinsic::amdgcn_kernarg_segment_ptr: 5012 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5013 // This only makes sense to call in a kernel, so just lower to null. 
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}