//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
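
// Examples of the two helpers above: getPow2VectorType(<3 x s16>) yields
// <4 x s16>, and getPow2ScalarType(s48) yields s64.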

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
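
// Note: per isLoadStoreSizeLegal above, a 96-bit access (e.g. a <3 x s32>
// load) is accepted directly only when the subtarget has dwordx3 load/stores;
// otherwise the legalization rules built below split or widen it.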

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                 .legalFor({S32})
                 .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp the maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop forever
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

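  // Minimum alignment, in bits, required by the explicit mem-desc rules listed
  // below. A value of 0 (used when unaligned buffer access is enabled)
  // effectively places no alignment requirement on the rule.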
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                {S32, GlobalPtr, S16, 2 * 8},
                                                {S32, LocalPtr, S8, 8},
                                                {S32, LocalPtr, S16, 16},
                                                {S32, PrivatePtr, S8, 8},
                                                {S32, PrivatePtr, S16, 16},
                                                {S32, ConstantPtr, S8, 8},
                                                {S32, ConstantPtr, S16, 2 * 8}})
                     .legalIf(
                       [=](const LegalityQuery &Query) -> bool {
                         return isLoadStoreLegal(ST, Query);
                       });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
TargetOpcode::G_FDIV: 1716 return legalizeFDIV(MI, MRI, B); 1717 case TargetOpcode::G_UDIV: 1718 case TargetOpcode::G_UREM: 1719 case TargetOpcode::G_UDIVREM: 1720 return legalizeUnsignedDIV_REM(MI, MRI, B); 1721 case TargetOpcode::G_SDIV: 1722 case TargetOpcode::G_SREM: 1723 case TargetOpcode::G_SDIVREM: 1724 return legalizeSignedDIV_REM(MI, MRI, B); 1725 case TargetOpcode::G_ATOMIC_CMPXCHG: 1726 return legalizeAtomicCmpXChg(MI, MRI, B); 1727 case TargetOpcode::G_FLOG: 1728 return legalizeFlog(MI, B, numbers::ln2f); 1729 case TargetOpcode::G_FLOG10: 1730 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1731 case TargetOpcode::G_FEXP: 1732 return legalizeFExp(MI, B); 1733 case TargetOpcode::G_FPOW: 1734 return legalizeFPow(MI, B); 1735 case TargetOpcode::G_FFLOOR: 1736 return legalizeFFloor(MI, MRI, B); 1737 case TargetOpcode::G_BUILD_VECTOR: 1738 return legalizeBuildVector(MI, MRI, B); 1739 case TargetOpcode::G_CTLZ: 1740 case TargetOpcode::G_CTTZ: 1741 return legalizeCTLZ_CTTZ(MI, MRI, B); 1742 default: 1743 return false; 1744 } 1745 1746 llvm_unreachable("expected switch to return"); 1747 } 1748 1749 Register AMDGPULegalizerInfo::getSegmentAperture( 1750 unsigned AS, 1751 MachineRegisterInfo &MRI, 1752 MachineIRBuilder &B) const { 1753 MachineFunction &MF = B.getMF(); 1754 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1755 const LLT S32 = LLT::scalar(32); 1756 1757 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1758 1759 if (ST.hasApertureRegs()) { 1760 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1761 // getreg. 1762 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1763 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1764 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1765 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1766 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1767 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1768 unsigned Encoding = 1769 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1770 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1771 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1772 1773 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1774 1775 B.buildInstr(AMDGPU::S_GETREG_B32) 1776 .addDef(GetReg) 1777 .addImm(Encoding); 1778 MRI.setType(GetReg, S32); 1779 1780 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1781 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1782 } 1783 1784 Register QueuePtr = MRI.createGenericVirtualRegister( 1785 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1786 1787 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1788 return Register(); 1789 1790 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1791 // private_segment_aperture_base_hi. 1792 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1793 1794 // TODO: can we be smarter about machine pointer info? 
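  // Summary note (no additional behavior implied): without aperture
  // registers, the 32-bit aperture base is loaded from the implicit queue
  // pointer at the offsets above; either way, the value returned here is the
  // high 32 bits of a flat address in the requested segment, which
  // legalizeAddrSpaceCast merges with the low 32 bits of the source pointer.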
1795 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1796 MachineMemOperand *MMO = MF.getMachineMemOperand( 1797 PtrInfo, 1798 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1799 MachineMemOperand::MOInvariant, 1800 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1801 1802 Register LoadAddr; 1803 1804 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1805 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1806 } 1807 1808 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1809 MachineInstr &MI, MachineRegisterInfo &MRI, 1810 MachineIRBuilder &B) const { 1811 MachineFunction &MF = B.getMF(); 1812 1813 const LLT S32 = LLT::scalar(32); 1814 Register Dst = MI.getOperand(0).getReg(); 1815 Register Src = MI.getOperand(1).getReg(); 1816 1817 LLT DstTy = MRI.getType(Dst); 1818 LLT SrcTy = MRI.getType(Src); 1819 unsigned DestAS = DstTy.getAddressSpace(); 1820 unsigned SrcAS = SrcTy.getAddressSpace(); 1821 1822 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1823 // vector element. 1824 assert(!DstTy.isVector()); 1825 1826 const AMDGPUTargetMachine &TM 1827 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1828 1829 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1830 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1831 return true; 1832 } 1833 1834 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1835 // Truncate. 1836 B.buildExtract(Dst, Src, 0); 1837 MI.eraseFromParent(); 1838 return true; 1839 } 1840 1841 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1842 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1843 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1844 1845 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1846 // another. Merge operands are required to be the same type, but creating an 1847 // extra ptrtoint would be kind of pointless. 1848 auto HighAddr = B.buildConstant( 1849 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1850 B.buildMerge(Dst, {Src, HighAddr}); 1851 MI.eraseFromParent(); 1852 return true; 1853 } 1854 1855 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1856 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1857 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1858 unsigned NullVal = TM.getNullPointerValue(DestAS); 1859 1860 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1861 auto FlatNull = B.buildConstant(SrcTy, 0); 1862 1863 // Extract low 32-bits of the pointer. 1864 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1865 1866 auto CmpRes = 1867 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1868 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1869 1870 MI.eraseFromParent(); 1871 return true; 1872 } 1873 1874 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1875 return false; 1876 1877 if (!ST.hasFlatAddressSpace()) 1878 return false; 1879 1880 auto SegmentNull = 1881 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1882 auto FlatNull = 1883 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1884 1885 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1886 if (!ApertureReg.isValid()) 1887 return false; 1888 1889 auto CmpRes = 1890 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1891 1892 // Coerce the type of the low half of the result so we can use merge_values. 
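  // The sequence built below for a local/private -> flat cast is roughly:
  //   %lo   = G_PTRTOINT %src          ; low 32 bits of the flat address
  //   %hi   = <aperture base>          ; from getSegmentAperture()
  //   %flat = G_MERGE_VALUES %lo, %hi
  //   %dst  = select(%src != <segment null>, %flat, <flat null>)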
1893 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1894 1895 // TODO: Should we allow mismatched types but matching sizes in merges to 1896 // avoid the ptrtoint? 1897 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1898 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1899 1900 MI.eraseFromParent(); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::legalizeFrint( 1905 MachineInstr &MI, MachineRegisterInfo &MRI, 1906 MachineIRBuilder &B) const { 1907 Register Src = MI.getOperand(1).getReg(); 1908 LLT Ty = MRI.getType(Src); 1909 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1910 1911 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1912 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1913 1914 auto C1 = B.buildFConstant(Ty, C1Val); 1915 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1916 1917 // TODO: Should this propagate fast-math-flags? 1918 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1919 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1920 1921 auto C2 = B.buildFConstant(Ty, C2Val); 1922 auto Fabs = B.buildFAbs(Ty, Src); 1923 1924 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1925 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1926 MI.eraseFromParent(); 1927 return true; 1928 } 1929 1930 bool AMDGPULegalizerInfo::legalizeFceil( 1931 MachineInstr &MI, MachineRegisterInfo &MRI, 1932 MachineIRBuilder &B) const { 1933 1934 const LLT S1 = LLT::scalar(1); 1935 const LLT S64 = LLT::scalar(64); 1936 1937 Register Src = MI.getOperand(1).getReg(); 1938 assert(MRI.getType(Src) == S64); 1939 1940 // result = trunc(src) 1941 // if (src > 0.0 && src != result) 1942 // result += 1.0 1943 1944 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1945 1946 const auto Zero = B.buildFConstant(S64, 0.0); 1947 const auto One = B.buildFConstant(S64, 1.0); 1948 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1949 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1950 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1951 auto Add = B.buildSelect(S64, And, One, Zero); 1952 1953 // TODO: Should this propagate fast-math-flags? 
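  // The rounding itself relies on the usual 2^52 trick: for |src| below the
  // threshold checked further down, adding and then subtracting
  // copysign(2^52, src) leaves the value rounded to the nearest integer
  // (ties to even), because the add shifts the fractional bits out of the
  // f64 mantissa. Inputs with |src| > 0x1.fffffffffffffp+51 are already
  // integral and are passed through unchanged by the final select.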
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ?
B.buildSITOFP(S64, Unmerge.getReg(1)) 2054 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2055 2056 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2057 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2058 .addUse(CvtHi.getReg(0)) 2059 .addUse(ThirtyTwo.getReg(0)); 2060 2061 // TODO: Should this propagate fast-math-flags? 2062 B.buildFAdd(Dst, LdExp, CvtLo); 2063 MI.eraseFromParent(); 2064 return true; 2065 } 2066 2067 assert(MRI.getType(Dst) == S32); 2068 2069 auto One = B.buildConstant(S32, 1); 2070 2071 MachineInstrBuilder ShAmt; 2072 if (Signed) { 2073 auto ThirtyOne = B.buildConstant(S32, 31); 2074 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2075 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2076 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2077 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2078 /*HasSideEffects=*/false) 2079 .addUse(Unmerge.getReg(1)); 2080 auto LS2 = B.buildSub(S32, LS, One); 2081 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2082 } else 2083 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2084 auto Norm = B.buildShl(S64, Src, ShAmt); 2085 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2086 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2087 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2088 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2089 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2090 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2091 /*HasSideEffects=*/false) 2092 .addUse(FVal.getReg(0)) 2093 .addUse(Scale.getReg(0)); 2094 MI.eraseFromParent(); 2095 return true; 2096 } 2097 2098 // TODO: Copied from DAG implementation. Verify logic and document how this 2099 // actually works. 2100 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2101 MachineRegisterInfo &MRI, 2102 MachineIRBuilder &B, 2103 bool Signed) const { 2104 2105 Register Dst = MI.getOperand(0).getReg(); 2106 Register Src = MI.getOperand(1).getReg(); 2107 2108 const LLT S64 = LLT::scalar(64); 2109 const LLT S32 = LLT::scalar(32); 2110 2111 const LLT SrcLT = MRI.getType(Src); 2112 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2113 2114 unsigned Flags = MI.getFlags(); 2115 2116 // The basic idea of converting a floating point number into a pair of 32-bit 2117 // integers is illustrated as follows: 2118 // 2119 // tf := trunc(val); 2120 // hif := floor(tf * 2^-32); 2121 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2122 // hi := fptoi(hif); 2123 // lo := fptoi(lof); 2124 // 2125 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2126 MachineInstrBuilder Sign; 2127 if (Signed && SrcLT == S32) { 2128 // However, a 32-bit floating point number has only 23 bits mantissa and 2129 // it's not enough to hold all the significant bits of `lof` if val is 2130 // negative. To avoid the loss of precision, We need to take the absolute 2131 // value after truncating and flip the result back based on the original 2132 // signedness. 
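  // Worked example of the decomposition above (assuming exact f64
  // arithmetic): for val = 2^40 + 7,
  //   tf  = 2^40 + 7
  //   hif = floor(tf * 2^-32) = 256
  //   lof = tf - hif * 2^32   = 7
  // so {lo, hi} = {7, 256} reassembles to the original 64-bit value.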
2133 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2134 Trunc = B.buildFAbs(S32, Trunc, Flags); 2135 } 2136 MachineInstrBuilder K0, K1; 2137 if (SrcLT == S64) { 2138 K0 = B.buildFConstant(S64, 2139 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2140 K1 = B.buildFConstant(S64, 2141 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2142 } else { 2143 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2144 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2145 } 2146 2147 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2148 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2149 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2150 2151 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2152 : B.buildFPTOUI(S32, FloorMul); 2153 auto Lo = B.buildFPTOUI(S32, Fma); 2154 2155 if (Signed && SrcLT == S32) { 2156 // Flip the result based on the signedness, which is either all 0s or 1s. 2157 Sign = B.buildMerge(S64, {Sign, Sign}); 2158 // r := xor({lo, hi}, sign) - sign; 2159 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2160 } else 2161 B.buildMerge(Dst, {Lo, Hi}); 2162 MI.eraseFromParent(); 2163 2164 return true; 2165 } 2166 2167 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2168 MachineInstr &MI) const { 2169 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2170 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2171 2172 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2173 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2174 2175 // With ieee_mode disabled, the instructions have the correct behavior 2176 // already for G_FMINNUM/G_FMAXNUM 2177 if (!MFI->getMode().IEEE) 2178 return !IsIEEEOp; 2179 2180 if (IsIEEEOp) 2181 return true; 2182 2183 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2184 } 2185 2186 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2187 MachineInstr &MI, MachineRegisterInfo &MRI, 2188 MachineIRBuilder &B) const { 2189 // TODO: Should move some of this into LegalizerHelper. 2190 2191 // TODO: Promote dynamic indexing of s16 to s32 2192 2193 // FIXME: Artifact combiner probably should have replaced the truncated 2194 // constant before this, so we shouldn't need 2195 // getIConstantVRegValWithLookThrough. 2196 Optional<ValueAndVReg> MaybeIdxVal = 2197 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2198 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2199 return true; 2200 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2201 2202 Register Dst = MI.getOperand(0).getReg(); 2203 Register Vec = MI.getOperand(1).getReg(); 2204 2205 LLT VecTy = MRI.getType(Vec); 2206 LLT EltTy = VecTy.getElementType(); 2207 assert(EltTy == MRI.getType(Dst)); 2208 2209 if (IdxVal < VecTy.getNumElements()) 2210 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2211 else 2212 B.buildUndef(Dst); 2213 2214 MI.eraseFromParent(); 2215 return true; 2216 } 2217 2218 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2219 MachineInstr &MI, MachineRegisterInfo &MRI, 2220 MachineIRBuilder &B) const { 2221 // TODO: Should move some of this into LegalizerHelper. 2222 2223 // TODO: Promote dynamic indexing of s16 to s32 2224 2225 // FIXME: Artifact combiner probably should have replaced the truncated 2226 // constant before this, so we shouldn't need 2227 // getIConstantVRegValWithLookThrough. 
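  // As in the extract case above: a known-constant index below the vector
  // length becomes a single G_INSERT at bit offset Idx * EltSize, an
  // out-of-range constant index folds to undef, and a non-constant index is
  // left in place to be selected to register indexing.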
2228 Optional<ValueAndVReg> MaybeIdxVal = 2229 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2230 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2231 return true; 2232 2233 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2234 Register Dst = MI.getOperand(0).getReg(); 2235 Register Vec = MI.getOperand(1).getReg(); 2236 Register Ins = MI.getOperand(2).getReg(); 2237 2238 LLT VecTy = MRI.getType(Vec); 2239 LLT EltTy = VecTy.getElementType(); 2240 assert(EltTy == MRI.getType(Ins)); 2241 2242 if (IdxVal < VecTy.getNumElements()) 2243 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2244 else 2245 B.buildUndef(Dst); 2246 2247 MI.eraseFromParent(); 2248 return true; 2249 } 2250 2251 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2252 MachineInstr &MI, MachineRegisterInfo &MRI, 2253 MachineIRBuilder &B) const { 2254 const LLT V2S16 = LLT::fixed_vector(2, 16); 2255 2256 Register Dst = MI.getOperand(0).getReg(); 2257 Register Src0 = MI.getOperand(1).getReg(); 2258 LLT DstTy = MRI.getType(Dst); 2259 LLT SrcTy = MRI.getType(Src0); 2260 2261 if (SrcTy == V2S16 && DstTy == V2S16 && 2262 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2263 return true; 2264 2265 MachineIRBuilder HelperBuilder(MI); 2266 GISelObserverWrapper DummyObserver; 2267 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2268 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2269 } 2270 2271 bool AMDGPULegalizerInfo::legalizeSinCos( 2272 MachineInstr &MI, MachineRegisterInfo &MRI, 2273 MachineIRBuilder &B) const { 2274 2275 Register DstReg = MI.getOperand(0).getReg(); 2276 Register SrcReg = MI.getOperand(1).getReg(); 2277 LLT Ty = MRI.getType(DstReg); 2278 unsigned Flags = MI.getFlags(); 2279 2280 Register TrigVal; 2281 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2282 if (ST.hasTrigReducedRange()) { 2283 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2284 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2285 .addUse(MulVal.getReg(0)) 2286 .setMIFlags(Flags).getReg(0); 2287 } else 2288 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2289 2290 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2291 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2292 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2293 .addUse(TrigVal) 2294 .setMIFlags(Flags); 2295 MI.eraseFromParent(); 2296 return true; 2297 } 2298 2299 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2300 MachineIRBuilder &B, 2301 const GlobalValue *GV, 2302 int64_t Offset, 2303 unsigned GAFlags) const { 2304 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2305 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2306 // to the following code sequence: 2307 // 2308 // For constant address space: 2309 // s_getpc_b64 s[0:1] 2310 // s_add_u32 s0, s0, $symbol 2311 // s_addc_u32 s1, s1, 0 2312 // 2313 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2314 // a fixup or relocation is emitted to replace $symbol with a literal 2315 // constant, which is a pc-relative offset from the encoding of the $symbol 2316 // operand to the global variable. 
2317 // 2318 // For global address space: 2319 // s_getpc_b64 s[0:1] 2320 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2321 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2322 // 2323 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2324 // fixups or relocations are emitted to replace $symbol@*@lo and 2325 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2326 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2327 // operand to the global variable. 2328 // 2329 // What we want here is an offset from the value returned by s_getpc 2330 // (which is the address of the s_add_u32 instruction) to the global 2331 // variable, but since the encoding of $symbol starts 4 bytes after the start 2332 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2333 // small. This requires us to add 4 to the global variable offset in order to 2334 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2335 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2336 // instruction. 2337 2338 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2339 2340 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2341 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2342 2343 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2344 .addDef(PCReg); 2345 2346 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2347 if (GAFlags == SIInstrInfo::MO_NONE) 2348 MIB.addImm(0); 2349 else 2350 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2351 2352 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2353 2354 if (PtrTy.getSizeInBits() == 32) 2355 B.buildExtract(DstReg, PCReg, 0); 2356 return true; 2357 } 2358 2359 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2360 MachineInstr &MI, MachineRegisterInfo &MRI, 2361 MachineIRBuilder &B) const { 2362 Register DstReg = MI.getOperand(0).getReg(); 2363 LLT Ty = MRI.getType(DstReg); 2364 unsigned AS = Ty.getAddressSpace(); 2365 2366 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2367 MachineFunction &MF = B.getMF(); 2368 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2369 2370 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2371 if (!MFI->isModuleEntryFunction() && 2372 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2373 const Function &Fn = MF.getFunction(); 2374 DiagnosticInfoUnsupported BadLDSDecl( 2375 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2376 DS_Warning); 2377 Fn.getContext().diagnose(BadLDSDecl); 2378 2379 // We currently don't have a way to correctly allocate LDS objects that 2380 // aren't directly associated with a kernel. We do force inlining of 2381 // functions that use local objects. However, if these dead functions are 2382 // not eliminated, we don't want a compile time error. Just emit a warning 2383 // and a trap, since there should be no callable path here. 2384 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2385 B.buildUndef(DstReg); 2386 MI.eraseFromParent(); 2387 return true; 2388 } 2389 2390 // TODO: We could emit code to handle the initialization somewhere. 2391 // We ignore the initializer for now and legalize it to allow selection. 2392 // The initializer will anyway get errored out during assembly emission. 
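  // To summarize the cases handled below for an LDS/region global:
  //  * if a constant LDS address should not be used, the operand is tagged
  //    MO_ABS32_LO and the G_GLOBAL_VALUE is left for selection;
  //  * an externally visible, zero-sized (dynamic) LDS array resolves to
  //    groupstaticsize, i.e. the end of the statically allocated LDS block;
  //  * otherwise the global receives a concrete offset from
  //    allocateLDSGlobal.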
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated
      // by the runtime and placed directly after the statically allocated
      // ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        auto Sz =
          B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2452 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2453 B.buildExtract(DstReg, Load, 0); 2454 } else 2455 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2456 2457 MI.eraseFromParent(); 2458 return true; 2459 } 2460 2461 static LLT widenToNextPowerOf2(LLT Ty) { 2462 if (Ty.isVector()) 2463 return Ty.changeElementCount( 2464 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2465 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2466 } 2467 2468 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2469 MachineInstr &MI) const { 2470 MachineIRBuilder &B = Helper.MIRBuilder; 2471 MachineRegisterInfo &MRI = *B.getMRI(); 2472 GISelChangeObserver &Observer = Helper.Observer; 2473 2474 Register PtrReg = MI.getOperand(1).getReg(); 2475 LLT PtrTy = MRI.getType(PtrReg); 2476 unsigned AddrSpace = PtrTy.getAddressSpace(); 2477 2478 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2479 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2480 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2481 Observer.changingInstr(MI); 2482 MI.getOperand(1).setReg(Cast.getReg(0)); 2483 Observer.changedInstr(MI); 2484 return true; 2485 } 2486 2487 if (MI.getOpcode() != AMDGPU::G_LOAD) 2488 return false; 2489 2490 Register ValReg = MI.getOperand(0).getReg(); 2491 LLT ValTy = MRI.getType(ValReg); 2492 2493 MachineMemOperand *MMO = *MI.memoperands_begin(); 2494 const unsigned ValSize = ValTy.getSizeInBits(); 2495 const LLT MemTy = MMO->getMemoryType(); 2496 const Align MemAlign = MMO->getAlign(); 2497 const unsigned MemSize = MemTy.getSizeInBits(); 2498 const unsigned AlignInBits = 8 * MemAlign.value(); 2499 2500 // Widen non-power-of-2 loads to the alignment if needed 2501 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2502 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2503 2504 // This was already the correct extending load result type, so just adjust 2505 // the memory type. 2506 if (WideMemSize == ValSize) { 2507 MachineFunction &MF = B.getMF(); 2508 2509 MachineMemOperand *WideMMO = 2510 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2511 Observer.changingInstr(MI); 2512 MI.setMemRefs(MF, {WideMMO}); 2513 Observer.changedInstr(MI); 2514 return true; 2515 } 2516 2517 // Don't bother handling edge case that should probably never be produced. 2518 if (ValSize > WideMemSize) 2519 return false; 2520 2521 LLT WideTy = widenToNextPowerOf2(ValTy); 2522 2523 Register WideLoad; 2524 if (!WideTy.isVector()) { 2525 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2526 B.buildTrunc(ValReg, WideLoad).getReg(0); 2527 } else { 2528 // Extract the subvector. 2529 2530 if (isRegisterType(ValTy)) { 2531 // If this a case where G_EXTRACT is legal, use it. 2532 // (e.g. <3 x s32> -> <4 x s32>) 2533 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2534 B.buildExtract(ValReg, WideLoad, 0); 2535 } else { 2536 // For cases where the widened type isn't a nice register value, unmerge 2537 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2538 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2539 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2540 B.setInsertPt(B.getMBB(), MI.getIterator()); 2541 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2542 } 2543 } 2544 2545 MI.eraseFromParent(); 2546 return true; 2547 } 2548 2549 return false; 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizeFMad( 2553 MachineInstr &MI, MachineRegisterInfo &MRI, 2554 MachineIRBuilder &B) const { 2555 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2556 assert(Ty.isScalar()); 2557 2558 MachineFunction &MF = B.getMF(); 2559 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2560 2561 // TODO: Always legal with future ftz flag. 2562 // FIXME: Do we need just output? 2563 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2564 return true; 2565 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2566 return true; 2567 2568 MachineIRBuilder HelperBuilder(MI); 2569 GISelObserverWrapper DummyObserver; 2570 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2571 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2572 } 2573 2574 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2575 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2576 Register DstReg = MI.getOperand(0).getReg(); 2577 Register PtrReg = MI.getOperand(1).getReg(); 2578 Register CmpVal = MI.getOperand(2).getReg(); 2579 Register NewVal = MI.getOperand(3).getReg(); 2580 2581 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2582 "this should not have been custom lowered"); 2583 2584 LLT ValTy = MRI.getType(CmpVal); 2585 LLT VecTy = LLT::fixed_vector(2, ValTy); 2586 2587 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2588 2589 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2590 .addDef(DstReg) 2591 .addUse(PtrReg) 2592 .addUse(PackedVal) 2593 .setMemRefs(MI.memoperands()); 2594 2595 MI.eraseFromParent(); 2596 return true; 2597 } 2598 2599 bool AMDGPULegalizerInfo::legalizeFlog( 2600 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2601 Register Dst = MI.getOperand(0).getReg(); 2602 Register Src = MI.getOperand(1).getReg(); 2603 LLT Ty = B.getMRI()->getType(Dst); 2604 unsigned Flags = MI.getFlags(); 2605 2606 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2607 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2608 2609 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2610 MI.eraseFromParent(); 2611 return true; 2612 } 2613 2614 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2615 MachineIRBuilder &B) const { 2616 Register Dst = MI.getOperand(0).getReg(); 2617 Register Src = MI.getOperand(1).getReg(); 2618 unsigned Flags = MI.getFlags(); 2619 LLT Ty = B.getMRI()->getType(Dst); 2620 2621 auto K = B.buildFConstant(Ty, numbers::log2e); 2622 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2623 B.buildFExp2(Dst, Mul, Flags); 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2629 MachineIRBuilder &B) const { 2630 Register Dst = MI.getOperand(0).getReg(); 2631 Register Src0 = MI.getOperand(1).getReg(); 2632 Register Src1 = MI.getOperand(2).getReg(); 2633 unsigned Flags = MI.getFlags(); 2634 LLT Ty = B.getMRI()->getType(Dst); 2635 const LLT S16 = LLT::scalar(16); 2636 const LLT S32 = LLT::scalar(32); 2637 2638 if (Ty == S32) { 2639 auto Log = B.buildFLog2(S32, Src0, Flags); 2640 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2641 .addUse(Log.getReg(0)) 2642 .addUse(Src1) 2643 .setMIFlags(Flags); 2644 B.buildFExp2(Dst, Mul, Flags); 2645 } else if (Ty == S16) { 2646 // There's no f16 fmul_legacy, so we need to convert for it. 2647 auto Log = B.buildFLog2(S16, Src0, Flags); 2648 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2649 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2650 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2651 .addUse(Ext0.getReg(0)) 2652 .addUse(Ext1.getReg(0)) 2653 .setMIFlags(Flags); 2654 2655 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2656 } else 2657 return false; 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 // Find a source register, ignoring any possible source modifiers. 2664 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2665 Register ModSrc = OrigSrc; 2666 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2667 ModSrc = SrcFNeg->getOperand(1).getReg(); 2668 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2669 ModSrc = SrcFAbs->getOperand(1).getReg(); 2670 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2671 ModSrc = SrcFAbs->getOperand(1).getReg(); 2672 return ModSrc; 2673 } 2674 2675 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2676 MachineRegisterInfo &MRI, 2677 MachineIRBuilder &B) const { 2678 2679 const LLT S1 = LLT::scalar(1); 2680 const LLT S64 = LLT::scalar(64); 2681 Register Dst = MI.getOperand(0).getReg(); 2682 Register OrigSrc = MI.getOperand(1).getReg(); 2683 unsigned Flags = MI.getFlags(); 2684 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2685 "this should not have been custom lowered"); 2686 2687 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2688 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2689 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2690 // V_FRACT bug is: 2691 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2692 // 2693 // Convert floor(x) to (x - fract(x)) 2694 2695 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2696 .addUse(OrigSrc) 2697 .setMIFlags(Flags); 2698 2699 // Give source modifier matching some assistance before obscuring a foldable 2700 // pattern. 2701 2702 // TODO: We can avoid the neg on the fract? The input sign to fract 2703 // shouldn't matter? 2704 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2705 2706 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2707 2708 Register Min = MRI.createGenericVirtualRegister(S64); 2709 2710 // We don't need to concern ourselves with the snan handling difference, so 2711 // use the one which will directly select. 
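  // (0x3fefffffffffffff is the largest f64 strictly less than 1.0, so the
  // min() below clamps the buggy V_FRACT result back into [0, 1).)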
2712 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2713 if (MFI->getMode().IEEE) 2714 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2715 else 2716 B.buildFMinNum(Min, Fract, Const, Flags); 2717 2718 Register CorrectedFract = Min; 2719 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2720 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2721 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2722 } 2723 2724 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2725 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2726 2727 MI.eraseFromParent(); 2728 return true; 2729 } 2730 2731 // Turn an illegal packed v2s16 build vector into bit operations. 2732 // TODO: This should probably be a bitcast action in LegalizerHelper. 2733 bool AMDGPULegalizerInfo::legalizeBuildVector( 2734 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2735 Register Dst = MI.getOperand(0).getReg(); 2736 const LLT S32 = LLT::scalar(32); 2737 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2738 2739 Register Src0 = MI.getOperand(1).getReg(); 2740 Register Src1 = MI.getOperand(2).getReg(); 2741 assert(MRI.getType(Src0) == LLT::scalar(16)); 2742 2743 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2744 B.buildBitcast(Dst, Merge); 2745 2746 MI.eraseFromParent(); 2747 return true; 2748 } 2749 2750 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2751 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2752 // case with a single min instruction instead of a compare+select. 2753 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2754 MachineRegisterInfo &MRI, 2755 MachineIRBuilder &B) const { 2756 Register Dst = MI.getOperand(0).getReg(); 2757 Register Src = MI.getOperand(1).getReg(); 2758 LLT DstTy = MRI.getType(Dst); 2759 LLT SrcTy = MRI.getType(Src); 2760 2761 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2762 ? AMDGPU::G_AMDGPU_FFBH_U32 2763 : AMDGPU::G_AMDGPU_FFBL_B32; 2764 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2765 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2766 2767 MI.eraseFromParent(); 2768 return true; 2769 } 2770 2771 // Check that this is a G_XOR x, -1 2772 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2773 if (MI.getOpcode() != TargetOpcode::G_XOR) 2774 return false; 2775 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2776 return ConstVal && *ConstVal == -1; 2777 } 2778 2779 // Return the use branch instruction, otherwise null if the usage is invalid. 2780 static MachineInstr * 2781 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2782 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2783 Register CondDef = MI.getOperand(0).getReg(); 2784 if (!MRI.hasOneNonDBGUse(CondDef)) 2785 return nullptr; 2786 2787 MachineBasicBlock *Parent = MI.getParent(); 2788 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2789 2790 if (isNot(MRI, *UseMI)) { 2791 Register NegatedCond = UseMI->getOperand(0).getReg(); 2792 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2793 return nullptr; 2794 2795 // We're deleting the def of this value, so we need to remove it. 
2796 eraseInstr(*UseMI, MRI); 2797 2798 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2799 Negated = true; 2800 } 2801 2802 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2803 return nullptr; 2804 2805 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2806 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2807 if (Next == Parent->end()) { 2808 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2809 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2810 return nullptr; 2811 UncondBrTarget = &*NextMBB; 2812 } else { 2813 if (Next->getOpcode() != AMDGPU::G_BR) 2814 return nullptr; 2815 Br = &*Next; 2816 UncondBrTarget = Br->getOperand(0).getMBB(); 2817 } 2818 2819 return UseMI; 2820 } 2821 2822 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2823 const ArgDescriptor *Arg, 2824 const TargetRegisterClass *ArgRC, 2825 LLT ArgTy) const { 2826 MCRegister SrcReg = Arg->getRegister(); 2827 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2828 assert(DstReg.isVirtual() && "Virtual register expected"); 2829 2830 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2831 ArgTy); 2832 if (Arg->isMasked()) { 2833 // TODO: Should we try to emit this once in the entry block? 2834 const LLT S32 = LLT::scalar(32); 2835 const unsigned Mask = Arg->getMask(); 2836 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2837 2838 Register AndMaskSrc = LiveIn; 2839 2840 if (Shift != 0) { 2841 auto ShiftAmt = B.buildConstant(S32, Shift); 2842 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2843 } 2844 2845 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2846 } else { 2847 B.buildCopy(DstReg, LiveIn); 2848 } 2849 2850 return true; 2851 } 2852 2853 bool AMDGPULegalizerInfo::loadInputValue( 2854 Register DstReg, MachineIRBuilder &B, 2855 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2856 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2857 const ArgDescriptor *Arg; 2858 const TargetRegisterClass *ArgRC; 2859 LLT ArgTy; 2860 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2861 2862 if (!Arg) { 2863 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2864 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2865 // case the pointer argument may be missing and we use null. 2866 B.buildConstant(DstReg, 0); 2867 return true; 2868 } 2869 2870 // It's undefined behavior if a function marked with the amdgpu-no-* 2871 // attributes uses the corresponding intrinsic. 
2872 B.buildUndef(DstReg); 2873 return true; 2874 } 2875 2876 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2877 return false; // TODO: Handle these 2878 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2879 } 2880 2881 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2882 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2883 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2884 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2885 return false; 2886 2887 MI.eraseFromParent(); 2888 return true; 2889 } 2890 2891 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2892 MachineRegisterInfo &MRI, 2893 MachineIRBuilder &B) const { 2894 Register Dst = MI.getOperand(0).getReg(); 2895 LLT DstTy = MRI.getType(Dst); 2896 LLT S16 = LLT::scalar(16); 2897 LLT S32 = LLT::scalar(32); 2898 LLT S64 = LLT::scalar(64); 2899 2900 if (DstTy == S16) 2901 return legalizeFDIV16(MI, MRI, B); 2902 if (DstTy == S32) 2903 return legalizeFDIV32(MI, MRI, B); 2904 if (DstTy == S64) 2905 return legalizeFDIV64(MI, MRI, B); 2906 2907 return false; 2908 } 2909 2910 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2911 Register DstDivReg, 2912 Register DstRemReg, 2913 Register X, 2914 Register Y) const { 2915 const LLT S1 = LLT::scalar(1); 2916 const LLT S32 = LLT::scalar(32); 2917 2918 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2919 // algorithm used here. 2920 2921 // Initial estimate of inv(y). 2922 auto FloatY = B.buildUITOFP(S32, Y); 2923 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2924 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2925 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2926 auto Z = B.buildFPTOUI(S32, ScaledY); 2927 2928 // One round of UNR. 2929 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2930 auto NegYZ = B.buildMul(S32, NegY, Z); 2931 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2932 2933 // Quotient/remainder estimate. 2934 auto Q = B.buildUMulH(S32, X, Z); 2935 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2936 2937 // First quotient/remainder refinement. 2938 auto One = B.buildConstant(S32, 1); 2939 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2940 if (DstDivReg) 2941 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2942 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2943 2944 // Second quotient/remainder refinement. 
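  // After the single Newton-Raphson step above, the quotient estimate is
  // expected to be short by at most two (see the reference in
  // AMDGPUCodeGenPrepare::expandDivRem32), which is why two conditional
  // corrections are sufficient.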
2945 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2946 if (DstDivReg) 2947 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2948 2949 if (DstRemReg) 2950 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2951 } 2952 2953 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2954 // 2955 // Return lo, hi of result 2956 // 2957 // %cvt.lo = G_UITOFP Val.lo 2958 // %cvt.hi = G_UITOFP Val.hi 2959 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2960 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2961 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2962 // %mul2 = G_FMUL %mul1, 2**(-32) 2963 // %trunc = G_INTRINSIC_TRUNC %mul2 2964 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2965 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2966 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2967 Register Val) { 2968 const LLT S32 = LLT::scalar(32); 2969 auto Unmerge = B.buildUnmerge(S32, Val); 2970 2971 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2972 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2973 2974 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2975 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2976 2977 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2978 auto Mul1 = 2979 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2980 2981 // 2**(-32) 2982 auto Mul2 = 2983 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2984 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2985 2986 // -(2**32) 2987 auto Mad2 = B.buildFMAD(S32, Trunc, 2988 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2989 2990 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2991 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2992 2993 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2994 } 2995 2996 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2997 Register DstDivReg, 2998 Register DstRemReg, 2999 Register Numer, 3000 Register Denom) const { 3001 const LLT S32 = LLT::scalar(32); 3002 const LLT S64 = LLT::scalar(64); 3003 const LLT S1 = LLT::scalar(1); 3004 Register RcpLo, RcpHi; 3005 3006 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3007 3008 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3009 3010 auto Zero64 = B.buildConstant(S64, 0); 3011 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3012 3013 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3014 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3015 3016 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3017 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3018 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3019 3020 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3021 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3022 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3023 3024 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3025 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3026 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3027 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3028 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3029 3030 auto Zero32 = B.buildConstant(S32, 0); 3031 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3032 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3033 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3034 3035 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3036 Register NumerLo = UnmergeNumer.getReg(0); 3037 Register NumerHi = UnmergeNumer.getReg(1); 3038 3039 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3040 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3041 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3042 Register Mul3_Lo = UnmergeMul3.getReg(0); 3043 Register Mul3_Hi = UnmergeMul3.getReg(1); 3044 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3045 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3046 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3047 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3048 3049 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3050 Register DenomLo = UnmergeDenom.getReg(0); 3051 Register DenomHi = UnmergeDenom.getReg(1); 3052 3053 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3054 auto C1 = B.buildSExt(S32, CmpHi); 3055 3056 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3057 auto C2 = B.buildSExt(S32, CmpLo); 3058 3059 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3060 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3061 3062 // TODO: Here and below portions of the code can be enclosed into if/endif. 3063 // Currently control flow is unconditional and we have 4 selects after 3064 // potential endif to substitute PHIs. 3065 3066 // if C3 != 0 ... 3067 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3068 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3069 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3070 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3071 3072 auto One64 = B.buildConstant(S64, 1); 3073 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3074 3075 auto C4 = 3076 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3077 auto C5 = 3078 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3079 auto C6 = B.buildSelect( 3080 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3081 3082 // if (C6 != 0) 3083 auto Add4 = B.buildAdd(S64, Add3, One64); 3084 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3085 3086 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3087 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3088 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3089 3090 // endif C6 3091 // endif C3 3092 3093 if (DstDivReg) { 3094 auto Sel1 = B.buildSelect( 3095 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3096 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3097 Sel1, MulHi3); 3098 } 3099 3100 if (DstRemReg) { 3101 auto Sel2 = B.buildSelect( 3102 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3103 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3104 Sel2, Sub1); 3105 } 3106 } 3107 3108 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3109 MachineRegisterInfo &MRI, 3110 MachineIRBuilder &B) const { 3111 Register DstDivReg, DstRemReg; 3112 switch (MI.getOpcode()) { 3113 default: 3114 llvm_unreachable("Unexpected opcode!"); 3115 case AMDGPU::G_UDIV: { 3116 DstDivReg = MI.getOperand(0).getReg(); 3117 break; 3118 } 3119 case AMDGPU::G_UREM: { 3120 DstRemReg = MI.getOperand(0).getReg(); 3121 break; 3122 } 3123 case AMDGPU::G_UDIVREM: { 3124 DstDivReg = MI.getOperand(0).getReg(); 3125 DstRemReg = MI.getOperand(1).getReg(); 3126 break; 3127 } 3128 } 3129 3130 const LLT S64 = LLT::scalar(64); 3131 const LLT S32 = LLT::scalar(32); 3132 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3133 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3134 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3135 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3136 3137 if (Ty == S32) 3138 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3139 else if (Ty == S64) 3140 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3141 else 3142 return false; 3143 3144 MI.eraseFromParent(); 3145 return true; 3146 } 3147 3148 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3149 MachineRegisterInfo &MRI, 3150 MachineIRBuilder &B) const { 3151 const LLT S64 = LLT::scalar(64); 3152 const LLT S32 = LLT::scalar(32); 3153 3154 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3155 if (Ty != S32 && Ty != S64) 3156 return false; 3157 3158 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3159 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3160 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3161 3162 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3163 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3164 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3165 3166 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3167 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3168 3169 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3170 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3171 3172 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3173 switch (MI.getOpcode()) { 3174 default: 3175 llvm_unreachable("Unexpected opcode!"); 3176 case AMDGPU::G_SDIV: { 3177 DstDivReg = MI.getOperand(0).getReg(); 3178 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3179 break; 3180 } 3181 case AMDGPU::G_SREM: { 3182 DstRemReg = MI.getOperand(0).getReg(); 3183 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3184 break; 3185 } 3186 case AMDGPU::G_SDIVREM: { 3187 DstDivReg = MI.getOperand(0).getReg(); 3188 DstRemReg = MI.getOperand(1).getReg(); 3189 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3190 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3191 break; 3192 } 3193 } 3194 3195 if (Ty == S32) 3196 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3197 else 3198 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3199 3200 if (DstDivReg) { 3201 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3202 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3203 B.buildSub(DstDivReg, SignXor, Sign); 3204 } 3205 3206 if (DstRemReg) { 3207 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3208 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3209 B.buildSub(DstRemReg, SignXor, Sign); 3210 } 3211 3212 MI.eraseFromParent(); 3213 return true; 3214 } 3215 3216 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3217 MachineRegisterInfo &MRI, 3218 MachineIRBuilder &B) const { 3219 Register Res = MI.getOperand(0).getReg(); 3220 Register LHS = MI.getOperand(1).getReg(); 3221 Register RHS = MI.getOperand(2).getReg(); 3222 uint16_t Flags = MI.getFlags(); 3223 LLT ResTy = MRI.getType(Res); 3224 3225 const MachineFunction &MF = B.getMF(); 3226 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3227 MI.getFlag(MachineInstr::FmAfn); 3228 3229 if (!AllowInaccurateRcp) 3230 return false; 3231 3232 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3233 // 1 / x -> RCP(x) 3234 if (CLHS->isExactlyValue(1.0)) { 3235 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3236 .addUse(RHS) 3237 .setMIFlags(Flags); 3238 3239 MI.eraseFromParent(); 3240 return true; 3241 } 3242 3243 // -1 / x -> RCP( FNEG(x) ) 3244 if (CLHS->isExactlyValue(-1.0)) { 3245 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3246 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3247 .addUse(FNeg.getReg(0)) 3248 .setMIFlags(Flags); 3249 3250 MI.eraseFromParent(); 3251 return true; 3252 } 3253 } 3254 3255 // x / y -> x * (1.0 / y) 3256 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3257 .addUse(RHS) 3258 .setMIFlags(Flags); 3259 B.buildFMul(Res, LHS, RCP, Flags); 3260 3261 MI.eraseFromParent(); 3262 return true; 3263 } 3264 3265 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3266 MachineRegisterInfo &MRI, 3267 MachineIRBuilder &B) const { 3268 Register Res = MI.getOperand(0).getReg(); 3269 Register X = MI.getOperand(1).getReg(); 3270 Register Y = MI.getOperand(2).getReg(); 3271 uint16_t Flags = MI.getFlags(); 3272 LLT ResTy = MRI.getType(Res); 3273 3274 const MachineFunction &MF = B.getMF(); 3275 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3276 MI.getFlag(MachineInstr::FmAfn); 3277 3278 if (!AllowInaccurateRcp) 3279 return false; 3280 3281 auto NegY = B.buildFNeg(ResTy, Y); 3282 auto One = B.buildFConstant(ResTy, 1.0); 3283 3284 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3285 .addUse(Y) 3286 .setMIFlags(Flags); 3287 3288 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3289 R = B.buildFMA(ResTy, Tmp0, R, R); 3290 3291 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3292 R = B.buildFMA(ResTy, Tmp1, R, R); 3293 3294 auto Ret = B.buildFMul(ResTy, X, R); 3295 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3296 3297 B.buildFMA(Res, Tmp2, R, Ret); 3298 MI.eraseFromParent(); 3299 return true; 3300 } 3301 3302 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3303 MachineRegisterInfo &MRI, 3304 MachineIRBuilder &B) const { 3305 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3306 return true; 3307 3308 Register Res = MI.getOperand(0).getReg(); 3309 Register LHS = MI.getOperand(1).getReg(); 3310 Register RHS = MI.getOperand(2).getReg(); 3311 3312 uint16_t Flags = MI.getFlags(); 3313 3314 LLT S16 = LLT::scalar(16); 3315 LLT S32 = LLT::scalar(32); 3316 3317 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3318 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3319 3320 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3321 .addUse(RHSExt.getReg(0)) 3322 .setMIFlags(Flags); 3323 3324 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3325 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3326 3327 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3328 .addUse(RDst.getReg(0)) 3329 .addUse(RHS) 3330 .addUse(LHS) 3331 .setMIFlags(Flags); 3332 3333 MI.eraseFromParent(); 3334 return true; 3335 } 3336 3337 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3338 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3339 static void toggleSPDenormMode(bool Enable, 3340 MachineIRBuilder &B, 3341 const GCNSubtarget &ST, 3342 AMDGPU::SIModeRegisterDefaults Mode) { 3343 // Set SP denorm mode to this value. 3344 unsigned SPDenormMode = 3345 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3346 3347 if (ST.hasDenormModeInst()) { 3348 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3349 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3350 3351 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3352 B.buildInstr(AMDGPU::S_DENORM_MODE) 3353 .addImm(NewDenormModeValue); 3354 3355 } else { 3356 // Select FP32 bit field in mode register. 
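    // The FP32 denorm controls occupy bits [5:4] of the MODE register, hence
    // the hwreg-style encoding below with offset 4 and width 2 (WIDTH_M1 = 1).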
3357 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3358 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3359 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3360 3361 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3362 .addImm(SPDenormMode) 3363 .addImm(SPDenormModeBitField); 3364 } 3365 } 3366 3367 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3368 MachineRegisterInfo &MRI, 3369 MachineIRBuilder &B) const { 3370 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3371 return true; 3372 3373 Register Res = MI.getOperand(0).getReg(); 3374 Register LHS = MI.getOperand(1).getReg(); 3375 Register RHS = MI.getOperand(2).getReg(); 3376 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3377 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3378 3379 uint16_t Flags = MI.getFlags(); 3380 3381 LLT S32 = LLT::scalar(32); 3382 LLT S1 = LLT::scalar(1); 3383 3384 auto One = B.buildFConstant(S32, 1.0f); 3385 3386 auto DenominatorScaled = 3387 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3388 .addUse(LHS) 3389 .addUse(RHS) 3390 .addImm(0) 3391 .setMIFlags(Flags); 3392 auto NumeratorScaled = 3393 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3394 .addUse(LHS) 3395 .addUse(RHS) 3396 .addImm(1) 3397 .setMIFlags(Flags); 3398 3399 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3400 .addUse(DenominatorScaled.getReg(0)) 3401 .setMIFlags(Flags); 3402 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3403 3404 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3405 // aren't modeled as reading it. 3406 if (!Mode.allFP32Denormals()) 3407 toggleSPDenormMode(true, B, ST, Mode); 3408 3409 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3410 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3411 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3412 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3413 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3414 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3415 3416 if (!Mode.allFP32Denormals()) 3417 toggleSPDenormMode(false, B, ST, Mode); 3418 3419 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3420 .addUse(Fma4.getReg(0)) 3421 .addUse(Fma1.getReg(0)) 3422 .addUse(Fma3.getReg(0)) 3423 .addUse(NumeratorScaled.getReg(1)) 3424 .setMIFlags(Flags); 3425 3426 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3427 .addUse(Fmas.getReg(0)) 3428 .addUse(RHS) 3429 .addUse(LHS) 3430 .setMIFlags(Flags); 3431 3432 MI.eraseFromParent(); 3433 return true; 3434 } 3435 3436 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3437 MachineRegisterInfo &MRI, 3438 MachineIRBuilder &B) const { 3439 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3440 return true; 3441 3442 Register Res = MI.getOperand(0).getReg(); 3443 Register LHS = MI.getOperand(1).getReg(); 3444 Register RHS = MI.getOperand(2).getReg(); 3445 3446 uint16_t Flags = MI.getFlags(); 3447 3448 LLT S64 = LLT::scalar(64); 3449 LLT S1 = LLT::scalar(1); 3450 3451 auto One = B.buildFConstant(S64, 1.0); 3452 3453 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3454 .addUse(LHS) 3455 .addUse(RHS) 3456 .addImm(0) 3457 .setMIFlags(Flags); 3458 3459 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3460 3461 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3462 .addUse(DivScale0.getReg(0)) 3463 .setMIFlags(Flags); 3464 3465 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3466 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3467 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3468 3469 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3470 .addUse(LHS) 3471 .addUse(RHS) 3472 .addImm(1) 3473 .setMIFlags(Flags); 3474 3475 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3476 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3477 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3478 3479 Register Scale; 3480 if (!ST.hasUsableDivScaleConditionOutput()) { 3481 // Workaround a hardware bug on SI where the condition output from div_scale 3482 // is not usable. 3483 3484 LLT S32 = LLT::scalar(32); 3485 3486 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3487 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3488 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3489 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3490 3491 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3492 Scale1Unmerge.getReg(1)); 3493 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3494 Scale0Unmerge.getReg(1)); 3495 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3496 } else { 3497 Scale = DivScale1.getReg(1); 3498 } 3499 3500 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3501 .addUse(Fma4.getReg(0)) 3502 .addUse(Fma3.getReg(0)) 3503 .addUse(Mul.getReg(0)) 3504 .addUse(Scale) 3505 .setMIFlags(Flags); 3506 3507 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3508 .addUse(Fmas.getReg(0)) 3509 .addUse(RHS) 3510 .addUse(LHS) 3511 .setMIFlags(Flags); 3512 3513 MI.eraseFromParent(); 3514 return true; 3515 } 3516 3517 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3518 MachineRegisterInfo &MRI, 3519 MachineIRBuilder &B) const { 3520 Register Res = MI.getOperand(0).getReg(); 3521 Register LHS = MI.getOperand(2).getReg(); 3522 Register RHS = MI.getOperand(3).getReg(); 3523 uint16_t Flags = MI.getFlags(); 3524 3525 LLT S32 = LLT::scalar(32); 3526 LLT S1 = LLT::scalar(1); 3527 3528 auto Abs = B.buildFAbs(S32, RHS, Flags); 3529 const APFloat C0Val(1.0f); 3530 3531 auto C0 = B.buildConstant(S32, 0x6f800000); 3532 auto C1 = B.buildConstant(S32, 0x2f800000); 3533 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3534 3535 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3536 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3537 3538 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3539 3540 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3541 .addUse(Mul0.getReg(0)) 3542 .setMIFlags(Flags); 3543 3544 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3545 3546 B.buildFMul(Res, Sel, Mul1, Flags); 3547 3548 MI.eraseFromParent(); 3549 return true; 3550 } 3551 3552 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3553 // FIXME: Why do we handle this one but not other removed instructions? 3554 // 3555 // Reciprocal square root. The clamp prevents infinite results, clamping 3556 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3557 // +-max_float. 
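//
// The expansion below emits amdgcn.rsq and clamps the result to +/-max_float
// using fminnum/fmaxnum (or their IEEE variants, matching the function's FP
// mode).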
3558 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3559 MachineRegisterInfo &MRI, 3560 MachineIRBuilder &B) const { 3561 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3562 return true; 3563 3564 Register Dst = MI.getOperand(0).getReg(); 3565 Register Src = MI.getOperand(2).getReg(); 3566 auto Flags = MI.getFlags(); 3567 3568 LLT Ty = MRI.getType(Dst); 3569 3570 const fltSemantics *FltSemantics; 3571 if (Ty == LLT::scalar(32)) 3572 FltSemantics = &APFloat::IEEEsingle(); 3573 else if (Ty == LLT::scalar(64)) 3574 FltSemantics = &APFloat::IEEEdouble(); 3575 else 3576 return false; 3577 3578 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3579 .addUse(Src) 3580 .setMIFlags(Flags); 3581 3582 // We don't need to concern ourselves with the snan handling difference, since 3583 // the rsq quieted (or not) so use the one which will directly select. 3584 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3585 const bool UseIEEE = MFI->getMode().IEEE; 3586 3587 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3588 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3589 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3590 3591 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3592 3593 if (UseIEEE) 3594 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3595 else 3596 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3597 MI.eraseFromParent(); 3598 return true; 3599 } 3600 3601 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3602 switch (IID) { 3603 case Intrinsic::amdgcn_ds_fadd: 3604 return AMDGPU::G_ATOMICRMW_FADD; 3605 case Intrinsic::amdgcn_ds_fmin: 3606 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3607 case Intrinsic::amdgcn_ds_fmax: 3608 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3609 default: 3610 llvm_unreachable("not a DS FP intrinsic"); 3611 } 3612 } 3613 3614 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3615 MachineInstr &MI, 3616 Intrinsic::ID IID) const { 3617 GISelChangeObserver &Observer = Helper.Observer; 3618 Observer.changingInstr(MI); 3619 3620 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3621 3622 // The remaining operands were used to set fields in the MemOperand on 3623 // construction. 3624 for (int I = 6; I > 3; --I) 3625 MI.RemoveOperand(I); 3626 3627 MI.RemoveOperand(1); // Remove the intrinsic ID. 
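  // What remains now matches the generic atomicrmw operand form:
  // result, pointer, value.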
3628 Observer.changedInstr(MI); 3629 return true; 3630 } 3631 3632 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3633 MachineRegisterInfo &MRI, 3634 MachineIRBuilder &B) const { 3635 uint64_t Offset = 3636 ST.getTargetLowering()->getImplicitParameterOffset( 3637 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3638 LLT DstTy = MRI.getType(DstReg); 3639 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3640 3641 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3642 if (!loadInputValue(KernargPtrReg, B, 3643 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3644 return false; 3645 3646 // FIXME: This should be nuw 3647 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3648 return true; 3649 } 3650 3651 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3652 MachineRegisterInfo &MRI, 3653 MachineIRBuilder &B) const { 3654 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3655 if (!MFI->isEntryFunction()) { 3656 return legalizePreloadedArgIntrin(MI, MRI, B, 3657 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3658 } 3659 3660 Register DstReg = MI.getOperand(0).getReg(); 3661 if (!getImplicitArgPtr(DstReg, MRI, B)) 3662 return false; 3663 3664 MI.eraseFromParent(); 3665 return true; 3666 } 3667 3668 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3669 MachineRegisterInfo &MRI, 3670 MachineIRBuilder &B, 3671 unsigned AddrSpace) const { 3672 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3673 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3674 Register Hi32 = Unmerge.getReg(1); 3675 3676 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3677 MI.eraseFromParent(); 3678 return true; 3679 } 3680 3681 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3682 // offset (the offset that is included in bounds checking and swizzling, to be 3683 // split between the instruction's voffset and immoffset fields) and soffset 3684 // (the offset that is excluded from bounds checking and swizzling, to go in 3685 // the instruction's soffset field). This function takes the first kind of 3686 // offset and figures out how to split it between voffset and immoffset. 3687 std::pair<Register, unsigned> 3688 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3689 Register OrigOffset) const { 3690 const unsigned MaxImm = 4095; 3691 Register BaseReg; 3692 unsigned ImmOffset; 3693 const LLT S32 = LLT::scalar(32); 3694 MachineRegisterInfo &MRI = *B.getMRI(); 3695 3696 std::tie(BaseReg, ImmOffset) = 3697 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3698 3699 // If BaseReg is a pointer, convert it to int. 3700 if (MRI.getType(BaseReg).isPointer()) 3701 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3702 3703 // If the immediate value is too big for the immoffset field, put the value 3704 // and -4096 into the immoffset field so that the value that is copied/added 3705 // for the voffset field is a multiple of 4096, and it stands more chance 3706 // of being CSEd with the copy/add for another similar load/store. 3707 // However, do not do that rounding down to a multiple of 4096 if that is a 3708 // negative number, as it appears to be illegal to have a negative offset 3709 // in the vgpr, even if adding the immediate offset makes it positive. 
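  // For example, an ImmOffset of 5000 is split into Overflow = 4096 (folded
  // into the voffset below) and ImmOffset = 904, which fits in the field.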
3710 unsigned Overflow = ImmOffset & ~MaxImm; 3711 ImmOffset -= Overflow; 3712 if ((int32_t)Overflow < 0) { 3713 Overflow += ImmOffset; 3714 ImmOffset = 0; 3715 } 3716 3717 if (Overflow != 0) { 3718 if (!BaseReg) { 3719 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3720 } else { 3721 auto OverflowVal = B.buildConstant(S32, Overflow); 3722 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3723 } 3724 } 3725 3726 if (!BaseReg) 3727 BaseReg = B.buildConstant(S32, 0).getReg(0); 3728 3729 return std::make_pair(BaseReg, ImmOffset); 3730 } 3731 3732 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3733 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3734 Register VOffset, Register SOffset, 3735 unsigned ImmOffset, Register VIndex, 3736 MachineRegisterInfo &MRI) const { 3737 Optional<ValueAndVReg> MaybeVOffsetVal = 3738 getIConstantVRegValWithLookThrough(VOffset, MRI); 3739 Optional<ValueAndVReg> MaybeSOffsetVal = 3740 getIConstantVRegValWithLookThrough(SOffset, MRI); 3741 Optional<ValueAndVReg> MaybeVIndexVal = 3742 getIConstantVRegValWithLookThrough(VIndex, MRI); 3743 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3744 // update the MMO with that offset. The stride is unknown so we can only do 3745 // this if VIndex is constant 0. 3746 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3747 MaybeVIndexVal->Value == 0) { 3748 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3749 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3750 MMO->setOffset(TotalOffset); 3751 } else { 3752 // We don't have a constant combined offset to use in the MMO. Give up. 3753 MMO->setValue((Value *)nullptr); 3754 } 3755 } 3756 3757 /// Handle register layout difference for f16 images for some subtargets. 
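///
/// On subtargets with unpacked D16 VMEM, each 16-bit element must occupy the
/// low half of its own 32-bit register; packed subtargets keep pairs of
/// 16-bit values in a single dword, except where the image-store D16 bug
/// requires widening the data to a dword-aligned vector.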
3758 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3759 MachineRegisterInfo &MRI, 3760 Register Reg, 3761 bool ImageStore) const { 3762 const LLT S16 = LLT::scalar(16); 3763 const LLT S32 = LLT::scalar(32); 3764 LLT StoreVT = MRI.getType(Reg); 3765 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3766 3767 if (ST.hasUnpackedD16VMem()) { 3768 auto Unmerge = B.buildUnmerge(S16, Reg); 3769 3770 SmallVector<Register, 4> WideRegs; 3771 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3772 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3773 3774 int NumElts = StoreVT.getNumElements(); 3775 3776 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3777 .getReg(0); 3778 } 3779 3780 if (ImageStore && ST.hasImageStoreD16Bug()) { 3781 if (StoreVT.getNumElements() == 2) { 3782 SmallVector<Register, 4> PackedRegs; 3783 Reg = B.buildBitcast(S32, Reg).getReg(0); 3784 PackedRegs.push_back(Reg); 3785 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3786 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3787 .getReg(0); 3788 } 3789 3790 if (StoreVT.getNumElements() == 3) { 3791 SmallVector<Register, 4> PackedRegs; 3792 auto Unmerge = B.buildUnmerge(S16, Reg); 3793 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3794 PackedRegs.push_back(Unmerge.getReg(I)); 3795 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3796 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3797 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3798 } 3799 3800 if (StoreVT.getNumElements() == 4) { 3801 SmallVector<Register, 4> PackedRegs; 3802 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3803 auto Unmerge = B.buildUnmerge(S32, Reg); 3804 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3805 PackedRegs.push_back(Unmerge.getReg(I)); 3806 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3807 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3808 .getReg(0); 3809 } 3810 3811 llvm_unreachable("invalid data type"); 3812 } 3813 3814 return Reg; 3815 } 3816 3817 Register AMDGPULegalizerInfo::fixStoreSourceType( 3818 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3819 MachineRegisterInfo *MRI = B.getMRI(); 3820 LLT Ty = MRI->getType(VData); 3821 3822 const LLT S16 = LLT::scalar(16); 3823 3824 // Fixup illegal register types for i8 stores. 3825 if (Ty == LLT::scalar(8) || Ty == S16) { 3826 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3827 return AnyExt; 3828 } 3829 3830 if (Ty.isVector()) { 3831 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3832 if (IsFormat) 3833 return handleD16VData(B, *MRI, VData); 3834 } 3835 } 3836 3837 return VData; 3838 } 3839 3840 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3841 MachineRegisterInfo &MRI, 3842 MachineIRBuilder &B, 3843 bool IsTyped, 3844 bool IsFormat) const { 3845 Register VData = MI.getOperand(1).getReg(); 3846 LLT Ty = MRI.getType(VData); 3847 LLT EltTy = Ty.getScalarType(); 3848 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3849 const LLT S32 = LLT::scalar(32); 3850 3851 VData = fixStoreSourceType(B, VData, IsFormat); 3852 Register RSrc = MI.getOperand(2).getReg(); 3853 3854 MachineMemOperand *MMO = *MI.memoperands_begin(); 3855 const int MemSize = MMO->getSize(); 3856 3857 unsigned ImmOffset; 3858 3859 // The typed intrinsics add an immediate after the registers. 
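  // Operand layout as read below (struct variant): intrinsic id, vdata, rsrc,
  // vindex, voffset, soffset, [format,] aux -- 7 operands, or 8 when typed.
  // The raw variants omit vindex.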
3860 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3861 3862 // The struct intrinsic variants add one additional operand over raw. 3863 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3864 Register VIndex; 3865 int OpOffset = 0; 3866 if (HasVIndex) { 3867 VIndex = MI.getOperand(3).getReg(); 3868 OpOffset = 1; 3869 } else { 3870 VIndex = B.buildConstant(S32, 0).getReg(0); 3871 } 3872 3873 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3874 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3875 3876 unsigned Format = 0; 3877 if (IsTyped) { 3878 Format = MI.getOperand(5 + OpOffset).getImm(); 3879 ++OpOffset; 3880 } 3881 3882 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3883 3884 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3885 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3886 3887 unsigned Opc; 3888 if (IsTyped) { 3889 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3890 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3891 } else if (IsFormat) { 3892 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3893 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3894 } else { 3895 switch (MemSize) { 3896 case 1: 3897 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3898 break; 3899 case 2: 3900 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3901 break; 3902 default: 3903 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3904 break; 3905 } 3906 } 3907 3908 auto MIB = B.buildInstr(Opc) 3909 .addUse(VData) // vdata 3910 .addUse(RSrc) // rsrc 3911 .addUse(VIndex) // vindex 3912 .addUse(VOffset) // voffset 3913 .addUse(SOffset) // soffset 3914 .addImm(ImmOffset); // offset(imm) 3915 3916 if (IsTyped) 3917 MIB.addImm(Format); 3918 3919 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3920 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3921 .addMemOperand(MMO); 3922 3923 MI.eraseFromParent(); 3924 return true; 3925 } 3926 3927 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3928 MachineRegisterInfo &MRI, 3929 MachineIRBuilder &B, 3930 bool IsFormat, 3931 bool IsTyped) const { 3932 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3933 MachineMemOperand *MMO = *MI.memoperands_begin(); 3934 const LLT MemTy = MMO->getMemoryType(); 3935 const LLT S32 = LLT::scalar(32); 3936 3937 Register Dst = MI.getOperand(0).getReg(); 3938 Register RSrc = MI.getOperand(2).getReg(); 3939 3940 // The typed intrinsics add an immediate after the registers. 3941 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3942 3943 // The struct intrinsic variants add one additional operand over raw. 
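  // Operand layout as read below mirrors the store case, with a result def in
  // front: dst, intrinsic id, rsrc, [vindex,] voffset, soffset, [format,] aux.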
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg) // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back down to
    // the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4049 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4050 B.buildInstr(Opc) 4051 .addDef(MI.getOperand(0).getReg()) 4052 .addUse(MI.getOperand(2).getReg()) 4053 .addUse(MI.getOperand(3).getReg()) 4054 .cloneMemRefs(MI); 4055 MI.eraseFromParent(); 4056 return true; 4057 } 4058 4059 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4060 switch (IntrID) { 4061 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4062 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4063 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4064 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4065 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4066 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4067 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4068 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4070 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4071 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4073 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4074 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4075 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4076 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4077 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4078 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4079 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4080 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4081 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4082 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4083 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4085 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4086 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4088 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4089 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4091 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4092 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4094 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4095 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4097 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4098 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4100 case Intrinsic::amdgcn_buffer_atomic_fadd: 4101 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4102 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4103 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4104 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4105 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4106 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4107 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4108 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4110 default: 4111 llvm_unreachable("unhandled atomic opcode"); 4112 } 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4116 MachineIRBuilder &B, 4117 Intrinsic::ID IID) const { 4118 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4119 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4120 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4121 4122 Register Dst; 4123 4124 int OpOffset = 0; 4125 if (HasReturn) { 4126 // A few FP atomics do not support return values. 
4127 Dst = MI.getOperand(0).getReg(); 4128 } else { 4129 OpOffset = -1; 4130 } 4131 4132 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4133 Register CmpVal; 4134 4135 if (IsCmpSwap) { 4136 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4137 ++OpOffset; 4138 } 4139 4140 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4141 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4142 4143 // The struct intrinsic variants add one additional operand over raw. 4144 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4145 Register VIndex; 4146 if (HasVIndex) { 4147 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4148 ++OpOffset; 4149 } else { 4150 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4151 } 4152 4153 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4154 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4155 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4156 4157 MachineMemOperand *MMO = *MI.memoperands_begin(); 4158 4159 unsigned ImmOffset; 4160 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4161 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4162 4163 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4164 4165 if (HasReturn) 4166 MIB.addDef(Dst); 4167 4168 MIB.addUse(VData); // vdata 4169 4170 if (IsCmpSwap) 4171 MIB.addReg(CmpVal); 4172 4173 MIB.addUse(RSrc) // rsrc 4174 .addUse(VIndex) // vindex 4175 .addUse(VOffset) // voffset 4176 .addUse(SOffset) // soffset 4177 .addImm(ImmOffset) // offset(imm) 4178 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4179 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4180 .addMemOperand(MMO); 4181 4182 MI.eraseFromParent(); 4183 return true; 4184 } 4185 4186 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4187 /// vector with s16 typed elements. 4188 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4189 SmallVectorImpl<Register> &PackedAddrs, 4190 unsigned ArgOffset, 4191 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4192 bool IsA16, bool IsG16) { 4193 const LLT S16 = LLT::scalar(16); 4194 const LLT V2S16 = LLT::fixed_vector(2, 16); 4195 auto EndIdx = Intr->VAddrEnd; 4196 4197 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4198 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4199 if (!SrcOp.isReg()) 4200 continue; // _L to _LZ may have eliminated this. 4201 4202 Register AddrReg = SrcOp.getReg(); 4203 4204 if ((I < Intr->GradientStart) || 4205 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4206 (I >= Intr->CoordStart && !IsA16)) { 4207 // Handle any gradient or coordinate operands that should not be packed 4208 if ((I < Intr->GradientStart) && IsA16 && 4209 (B.getMRI()->getType(AddrReg) == S16)) { 4210 // Special handling of bias when A16 is on. Bias is of type half but 4211 // occupies full 32-bit. 4212 PackedAddrs.push_back( 4213 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4214 .getReg(0)); 4215 } else { 4216 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4217 PackedAddrs.push_back(AddrReg); 4218 } 4219 } else { 4220 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4221 // derivatives dx/dh and dx/dv are packed with undef. 
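      // Pack with undef when this is the last address component, when an odd
      // gradient count leaves the final dh or dv derivative unpaired, or when
      // the following operand was already removed by the _L to _LZ
      // optimization.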
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
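  // Rough flow of the rewrite below: look up the MIMG base opcode, classify
  // A16/G16 addressing, fold away constant-zero LOD/mip operands, repack
  // 16-bit addresses into dwords (or a single vector when NSA is not used),
  // and finally fix up the D16/TFE result layout.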
4305 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4306 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4307 4308 MachineRegisterInfo *MRI = B.getMRI(); 4309 const LLT S32 = LLT::scalar(32); 4310 const LLT S16 = LLT::scalar(16); 4311 const LLT V2S16 = LLT::fixed_vector(2, 16); 4312 4313 unsigned DMask = 0; 4314 4315 // Check for 16 bit addresses and pack if true. 4316 LLT GradTy = 4317 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4318 LLT AddrTy = 4319 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4320 const bool IsG16 = GradTy == S16; 4321 const bool IsA16 = AddrTy == S16; 4322 4323 int DMaskLanes = 0; 4324 if (!BaseOpcode->Atomic) { 4325 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4326 if (BaseOpcode->Gather4) { 4327 DMaskLanes = 4; 4328 } else if (DMask != 0) { 4329 DMaskLanes = countPopulation(DMask); 4330 } else if (!IsTFE && !BaseOpcode->Store) { 4331 // If dmask is 0, this is a no-op load. This can be eliminated. 4332 B.buildUndef(MI.getOperand(0)); 4333 MI.eraseFromParent(); 4334 return true; 4335 } 4336 } 4337 4338 Observer.changingInstr(MI); 4339 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4340 4341 unsigned NewOpcode = NumDefs == 0 ? 4342 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4343 4344 // Track that we legalized this 4345 MI.setDesc(B.getTII().get(NewOpcode)); 4346 4347 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4348 // dmask to be at least 1 otherwise the instruction will fail 4349 if (IsTFE && DMask == 0) { 4350 DMask = 0x1; 4351 DMaskLanes = 1; 4352 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4353 } 4354 4355 if (BaseOpcode->Atomic) { 4356 Register VData0 = MI.getOperand(2).getReg(); 4357 LLT Ty = MRI->getType(VData0); 4358 4359 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4360 if (Ty.isVector()) 4361 return false; 4362 4363 if (BaseOpcode->AtomicX2) { 4364 Register VData1 = MI.getOperand(3).getReg(); 4365 // The two values are packed in one register. 4366 LLT PackedTy = LLT::fixed_vector(2, Ty); 4367 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4368 MI.getOperand(2).setReg(Concat.getReg(0)); 4369 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4370 } 4371 } 4372 4373 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4374 4375 // Optimize _L to _LZ when _L is zero 4376 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4377 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4378 const ConstantFP *ConstantLod; 4379 4380 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4381 m_GFCst(ConstantLod))) { 4382 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4383 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4384 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4385 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4386 Intr->Dim); 4387 4388 // The starting indexes should remain in the same place. 
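        // Dropping the LOD operand shrinks the corrected vaddr count by one.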
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers
    // contiguously without introducing moves, then using the non-sequential
    // address encoding is always preferable, since it saves VALU instructions
    // and is usually a wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after
    // register allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
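    // Stores only need their D16 payload repacked here; everything after this
    // block handles load results.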
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
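  // For instance, a TFE load of <2 x s32> data is rewritten to produce one
  // <3 x s32> register: two data dwords plus the trailing status dword.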
4585 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4586 4587 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4588 4589 if (ResultNumRegs == 1) { 4590 assert(!IsTFE); 4591 ResultRegs[0] = NewResultReg; 4592 } else { 4593 // We have to repack into a new vector of some kind. 4594 for (int I = 0; I != NumDataRegs; ++I) 4595 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4596 B.buildUnmerge(ResultRegs, NewResultReg); 4597 4598 // Drop the final TFE element to get the data part. The TFE result is 4599 // directly written to the right place already. 4600 if (IsTFE) 4601 ResultRegs.resize(NumDataRegs); 4602 } 4603 4604 // For an s16 scalar result, we form an s32 result with a truncate regardless 4605 // of packed vs. unpacked. 4606 if (IsD16 && !Ty.isVector()) { 4607 B.buildTrunc(DstReg, ResultRegs[0]); 4608 return true; 4609 } 4610 4611 // Avoid a build/concat_vector of 1 entry. 4612 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4613 B.buildBitcast(DstReg, ResultRegs[0]); 4614 return true; 4615 } 4616 4617 assert(Ty.isVector()); 4618 4619 if (IsD16) { 4620 // For packed D16 results with TFE enabled, all the data components are 4621 // S32. Cast back to the expected type. 4622 // 4623 // TODO: We don't really need to use load s32 elements. We would only need one 4624 // cast for the TFE result if a multiple of v2s16 was used. 4625 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4626 for (Register &Reg : ResultRegs) 4627 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4628 } else if (ST.hasUnpackedD16VMem()) { 4629 for (Register &Reg : ResultRegs) 4630 Reg = B.buildTrunc(S16, Reg).getReg(0); 4631 } 4632 } 4633 4634 auto padWithUndef = [&](LLT Ty, int NumElts) { 4635 if (NumElts == 0) 4636 return; 4637 Register Undef = B.buildUndef(Ty).getReg(0); 4638 for (int I = 0; I != NumElts; ++I) 4639 ResultRegs.push_back(Undef); 4640 }; 4641 4642 // Pad out any elements eliminated due to the dmask. 4643 LLT ResTy = MRI->getType(ResultRegs[0]); 4644 if (!ResTy.isVector()) { 4645 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4646 B.buildBuildVector(DstReg, ResultRegs); 4647 return true; 4648 } 4649 4650 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4651 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4652 4653 // Deal with the one annoying legal case. 4654 const LLT V3S16 = LLT::fixed_vector(3, 16); 4655 if (Ty == V3S16) { 4656 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4657 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4658 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4659 return true; 4660 } 4661 4662 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4663 B.buildConcatVectors(DstReg, ResultRegs); 4664 return true; 4665 } 4666 4667 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4668 LegalizerHelper &Helper, MachineInstr &MI) const { 4669 MachineIRBuilder &B = Helper.MIRBuilder; 4670 GISelChangeObserver &Observer = Helper.Observer; 4671 4672 Register Dst = MI.getOperand(0).getReg(); 4673 LLT Ty = B.getMRI()->getType(Dst); 4674 unsigned Size = Ty.getSizeInBits(); 4675 MachineFunction &MF = B.getMF(); 4676 4677 Observer.changingInstr(MI); 4678 4679 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4680 Ty = getBitcastRegisterType(Ty); 4681 Helper.bitcastDst(MI, Ty, 0); 4682 Dst = MI.getOperand(0).getReg(); 4683 B.setInsertPt(B.getMBB(), MI); 4684 } 4685 4686 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4687 // should be fixed to have a memory operand. Since it's readnone, we're not 4688 // allowed to add one. 4689 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4690 MI.RemoveOperand(1); // Remove intrinsic ID 4691 4692 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4693 // TODO: Should this use datalayout alignment? 4694 const unsigned MemSize = (Size + 7) / 8; 4695 const Align MemAlign(4); 4696 MachineMemOperand *MMO = MF.getMachineMemOperand( 4697 MachinePointerInfo(), 4698 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4699 MachineMemOperand::MOInvariant, 4700 MemSize, MemAlign); 4701 MI.addMemOperand(MF, MMO); 4702 4703 // There are no 96-bit result scalar loads, but widening to 128-bit should 4704 // always be legal. We may need to restore this to a 96-bit result if it turns 4705 // out this needs to be converted to a vector load during RegBankSelect. 4706 if (!isPowerOf2_32(Size)) { 4707 if (Ty.isVector()) 4708 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4709 else 4710 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4711 } 4712 4713 Observer.changedInstr(MI); 4714 return true; 4715 } 4716 4717 // TODO: Move to selection 4718 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4719 MachineRegisterInfo &MRI, 4720 MachineIRBuilder &B) const { 4721 if (!ST.isTrapHandlerEnabled() || 4722 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4723 return legalizeTrapEndpgm(MI, MRI, B); 4724 4725 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4726 switch (*HsaAbiVer) { 4727 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4728 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4729 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4730 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4731 return ST.supportsGetDoorbellID() ? 4732 legalizeTrapHsa(MI, MRI, B) : 4733 legalizeTrapHsaQueuePtr(MI, MRI, B); 4734 } 4735 } 4736 4737 llvm_unreachable("Unknown trap handler"); 4738 } 4739 4740 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4741 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4742 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4743 MI.eraseFromParent(); 4744 return true; 4745 } 4746 4747 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4748 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4749 // Pass queue pointer to trap handler as input, and insert trap instruction 4750 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4751 Register LiveIn = 4752 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4753 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4754 return false; 4755 4756 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4757 B.buildCopy(SGPR01, LiveIn); 4758 B.buildInstr(AMDGPU::S_TRAP) 4759 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4760 .addReg(SGPR01, RegState::Implicit); 4761 4762 MI.eraseFromParent(); 4763 return true; 4764 } 4765 4766 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4767 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4768 B.buildInstr(AMDGPU::S_TRAP) 4769 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4770 MI.eraseFromParent(); 4771 return true; 4772 } 4773 4774 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4775 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4776 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4777 // accordingly 4778 if (!ST.isTrapHandlerEnabled() || 4779 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4780 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4781 "debugtrap handler not supported", 4782 MI.getDebugLoc(), DS_Warning); 4783 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4784 Ctx.diagnose(NoTrap); 4785 } else { 4786 // Insert debug-trap instruction 4787 B.buildInstr(AMDGPU::S_TRAP) 4788 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4789 } 4790 4791 MI.eraseFromParent(); 4792 return true; 4793 } 4794 4795 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4796 MachineIRBuilder &B) const { 4797 MachineRegisterInfo &MRI = *B.getMRI(); 4798 const LLT S16 = LLT::scalar(16); 4799 const LLT S32 = LLT::scalar(32); 4800 4801 Register DstReg = MI.getOperand(0).getReg(); 4802 Register NodePtr = MI.getOperand(2).getReg(); 4803 Register RayExtent = MI.getOperand(3).getReg(); 4804 Register RayOrigin = MI.getOperand(4).getReg(); 4805 Register RayDir = MI.getOperand(5).getReg(); 4806 Register RayInvDir = MI.getOperand(6).getReg(); 4807 Register TDescr = MI.getOperand(7).getReg(); 4808 4809 if (!ST.hasGFX10_AEncoding()) { 4810 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4811 "intrinsic not supported on subtarget", 4812 MI.getDebugLoc()); 4813 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4814 return false; 4815 } 4816 4817 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4818 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4819 const unsigned NumVDataDwords = 4; 4820 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4821 const bool UseNSA = 4822 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4823 const unsigned BaseOpcodes[2][2] = { 4824 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4825 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4826 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4827 int Opcode; 4828 if (UseNSA) { 4829 Opcode = 4830 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4831 NumVDataDwords, NumVAddrDwords); 4832 } else { 4833 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4834 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4835 PowerOf2Ceil(NumVAddrDwords)); 4836 } 4837 assert(Opcode != -1); 4838 4839 SmallVector<Register, 12> Ops; 4840 if (Is64) { 4841 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4842 Ops.push_back(Unmerge.getReg(0)); 4843 Ops.push_back(Unmerge.getReg(1)); 4844 } else { 4845 Ops.push_back(NodePtr); 4846 } 4847 Ops.push_back(RayExtent); 4848 4849 auto packLanes = [&Ops, &S32, &B](Register Src) { 4850 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4851 Ops.push_back(Unmerge.getReg(0)); 4852 Ops.push_back(Unmerge.getReg(1)); 4853 Ops.push_back(Unmerge.getReg(2)); 4854 }; 4855 4856 packLanes(RayOrigin); 4857 if (IsA16) { 4858 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4859 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4860 Register R1 = MRI.createGenericVirtualRegister(S32); 4861 Register R2 = MRI.createGenericVirtualRegister(S32); 4862 Register R3 = MRI.createGenericVirtualRegister(S32); 4863 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4864 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4865 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4866 Ops.push_back(R1); 4867 
Ops.push_back(R2); 4868 Ops.push_back(R3); 4869 } else { 4870 packLanes(RayDir); 4871 packLanes(RayInvDir); 4872 } 4873 4874 if (!UseNSA) { 4875 // Build a single vector containing all the operands so far prepared. 4876 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4877 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4878 Ops.clear(); 4879 Ops.push_back(MergedOps); 4880 } 4881 4882 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4883 .addDef(DstReg) 4884 .addImm(Opcode); 4885 4886 for (Register R : Ops) { 4887 MIB.addUse(R); 4888 } 4889 4890 MIB.addUse(TDescr) 4891 .addImm(IsA16 ? 1 : 0) 4892 .cloneMemRefs(MI); 4893 4894 MI.eraseFromParent(); 4895 return true; 4896 } 4897 4898 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4899 MachineInstr &MI) const { 4900 MachineIRBuilder &B = Helper.MIRBuilder; 4901 MachineRegisterInfo &MRI = *B.getMRI(); 4902 4903 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4904 auto IntrID = MI.getIntrinsicID(); 4905 switch (IntrID) { 4906 case Intrinsic::amdgcn_if: 4907 case Intrinsic::amdgcn_else: { 4908 MachineInstr *Br = nullptr; 4909 MachineBasicBlock *UncondBrTarget = nullptr; 4910 bool Negated = false; 4911 if (MachineInstr *BrCond = 4912 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4913 const SIRegisterInfo *TRI 4914 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4915 4916 Register Def = MI.getOperand(1).getReg(); 4917 Register Use = MI.getOperand(3).getReg(); 4918 4919 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4920 4921 if (Negated) 4922 std::swap(CondBrTarget, UncondBrTarget); 4923 4924 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4925 if (IntrID == Intrinsic::amdgcn_if) { 4926 B.buildInstr(AMDGPU::SI_IF) 4927 .addDef(Def) 4928 .addUse(Use) 4929 .addMBB(UncondBrTarget); 4930 } else { 4931 B.buildInstr(AMDGPU::SI_ELSE) 4932 .addDef(Def) 4933 .addUse(Use) 4934 .addMBB(UncondBrTarget); 4935 } 4936 4937 if (Br) { 4938 Br->getOperand(0).setMBB(CondBrTarget); 4939 } else { 4940 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4941 // since we're swapping branch targets it needs to be reinserted. 
4942 // FIXME: IRTranslator should probably not do this 4943 B.buildBr(*CondBrTarget); 4944 } 4945 4946 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4947 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4948 MI.eraseFromParent(); 4949 BrCond->eraseFromParent(); 4950 return true; 4951 } 4952 4953 return false; 4954 } 4955 case Intrinsic::amdgcn_loop: { 4956 MachineInstr *Br = nullptr; 4957 MachineBasicBlock *UncondBrTarget = nullptr; 4958 bool Negated = false; 4959 if (MachineInstr *BrCond = 4960 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4961 const SIRegisterInfo *TRI 4962 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4963 4964 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4965 Register Reg = MI.getOperand(2).getReg(); 4966 4967 if (Negated) 4968 std::swap(CondBrTarget, UncondBrTarget); 4969 4970 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4971 B.buildInstr(AMDGPU::SI_LOOP) 4972 .addUse(Reg) 4973 .addMBB(UncondBrTarget); 4974 4975 if (Br) 4976 Br->getOperand(0).setMBB(CondBrTarget); 4977 else 4978 B.buildBr(*CondBrTarget); 4979 4980 MI.eraseFromParent(); 4981 BrCond->eraseFromParent(); 4982 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4983 return true; 4984 } 4985 4986 return false; 4987 } 4988 case Intrinsic::amdgcn_kernarg_segment_ptr: 4989 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4990 // This only makes sense to call in a kernel, so just lower to null. 4991 B.buildConstant(MI.getOperand(0).getReg(), 0); 4992 MI.eraseFromParent(); 4993 return true; 4994 } 4995 4996 return legalizePreloadedArgIntrin( 4997 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4998 case Intrinsic::amdgcn_implicitarg_ptr: 4999 return legalizeImplicitArgPtr(MI, MRI, B); 5000 case Intrinsic::amdgcn_workitem_id_x: 5001 return legalizePreloadedArgIntrin(MI, MRI, B, 5002 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 5003 case Intrinsic::amdgcn_workitem_id_y: 5004 return legalizePreloadedArgIntrin(MI, MRI, B, 5005 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 5006 case Intrinsic::amdgcn_workitem_id_z: 5007 return legalizePreloadedArgIntrin(MI, MRI, B, 5008 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 5009 case Intrinsic::amdgcn_workgroup_id_x: 5010 return legalizePreloadedArgIntrin(MI, MRI, B, 5011 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 5012 case Intrinsic::amdgcn_workgroup_id_y: 5013 return legalizePreloadedArgIntrin(MI, MRI, B, 5014 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 5015 case Intrinsic::amdgcn_workgroup_id_z: 5016 return legalizePreloadedArgIntrin(MI, MRI, B, 5017 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5018 case Intrinsic::amdgcn_dispatch_ptr: 5019 return legalizePreloadedArgIntrin(MI, MRI, B, 5020 AMDGPUFunctionArgInfo::DISPATCH_PTR); 5021 case Intrinsic::amdgcn_queue_ptr: 5022 return legalizePreloadedArgIntrin(MI, MRI, B, 5023 AMDGPUFunctionArgInfo::QUEUE_PTR); 5024 case Intrinsic::amdgcn_implicit_buffer_ptr: 5025 return legalizePreloadedArgIntrin( 5026 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 5027 case Intrinsic::amdgcn_dispatch_id: 5028 return legalizePreloadedArgIntrin(MI, MRI, B, 5029 AMDGPUFunctionArgInfo::DISPATCH_ID); 5030 case Intrinsic::amdgcn_fdiv_fast: 5031 return legalizeFDIVFastIntrin(MI, MRI, B); 5032 case Intrinsic::amdgcn_is_shared: 5033 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 5034 case Intrinsic::amdgcn_is_private: 5035 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 5036 case Intrinsic::amdgcn_wavefrontsize: { 5037 
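    // The wavefront size is a compile-time subtarget constant, so simply
    // materialize it.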
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // The boolean arguments distinguish the plain, format, and typed (tbuffer)
  // buffer store/load variants.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All raw and struct buffer atomics share one legalization path; the
  // specific operation is recovered from IntrID in legalizeBufferAtomic.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
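  // Ray tracing BVH intersection has a dedicated expansion; see
  // legalizeBVHIntrinsic above.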
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}