//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
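// Illustrative examples (not exercised directly here): getPow2VectorType
// turns <3 x s16> into <4 x s16>, and getPow2ScalarType turns s48 into s64,
// since Log2_32_Ceil rounds the element count / bit width up to the next
// power of two.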
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
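// For example (illustrative): a <4 x s8> value (32 bits) coerces to s32, and
// a <8 x s16> value (128 bits) coerces to <4 x s32>, i.e. whole 32-bit
// registers.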
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
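// As an illustration of the predicate above: v2s16 (32 bits), s96, and v4s32
// all count as register types, while v3s16 (48 bits) or s40 do not, so the
// rules that use this predicate widen or bitcast them first.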
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
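// Rough example of how these three checks compose: a sufficiently aligned
// <3 x s32> global load is a register type, is not hit by the bitcast
// workaround (32-bit elements), and its 96-bit access is size-legal on
// subtargets with dwordx3 load/stores; on other subtargets the rules below
// split or widen it instead.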
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
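// Illustrative case: a <3 x s16> (48-bit) global load with 64-bit alignment
// would typically be widened here to a 64-bit access, assuming the target
// reports the wider access as fast; a 96-bit access is left alone when the
// subtarget has dwordx3 load/stores.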
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
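  // For instance, under these rules a G_PHI of <3 x s16> is widened to
  // <4 x s16> by the isSmallOddVector rule, and a G_PHI of s8 is widened to
  // s16 by clampScalar (illustrative; the exact path depends on rule order).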
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);
  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();
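  // Illustrative reading of the conversion rules above: a G_SITOFP from an
  // s64 source (to s32 or s64) takes the custom 64-bit integer-to-FP path, an
  // s1 source is lowered rather than handled natively, and s16 results or
  // sources are only kept when the subtarget has 16-bit instructions.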
  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();
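  // The custom action above exists to fix up the zero-input case: the
  // ffbh/ffbl-style hardware result of -1 is converted to the bit width that
  // the generic opcodes expect (see legalizeCTLZ_CTTZ below).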
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
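  // For example, under this predicate a 256-bit global store exceeds the
  // 128-bit store limit and is split, while the same 256-bit load fits the
  // 512-bit load limit and is not; a 96-bit access is split only on
  // subtargets without dwordx3 load/stores.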
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }
    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }
  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
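  // Example of the select rules: a G_SELECT of v2s32 with an s1 or s32
  // condition is legal as listed; a v3s32 select is scalarized by the
  // numElementsNotEven rule, and an s8 select is widened to s16 by
  // clampScalar.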
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
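  // Net effect of the shift rules (illustrative): an s64 shift keeps an s32
  // amount, since only the low bits of the amount are observed, s16 shifts
  // are only kept on subtargets with 16-bit instructions, and everything else
  // is clamped and scalarized.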
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector(or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();
  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
    .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}
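// Note: everything marked custom/customFor/customIf above is dispatched
// through legalizeCustom() below; cases marked lower go through the generic
// LegalizerHelper expansions instead.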
MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 case TargetOpcode::G_CTLZ: 1747 case TargetOpcode::G_CTTZ: 1748 return legalizeCTLZ_CTTZ(MI, MRI, B); 1749 default: 1750 return false; 1751 } 1752 1753 llvm_unreachable("expected switch to return"); 1754 } 1755 1756 Register AMDGPULegalizerInfo::getSegmentAperture( 1757 unsigned AS, 1758 MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1762 const LLT S32 = LLT::scalar(32); 1763 1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1765 1766 if (ST.hasApertureRegs()) { 1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1768 // getreg. 1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1775 unsigned Encoding = 1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1779 1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1781 1782 B.buildInstr(AMDGPU::S_GETREG_B32) 1783 .addDef(GetReg) 1784 .addImm(Encoding); 1785 MRI.setType(GetReg, S32); 1786 1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1789 } 1790 1791 Register QueuePtr = MRI.createGenericVirtualRegister( 1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1793 1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1795 return Register(); 1796 1797 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1798 // private_segment_aperture_base_hi. 1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1800 1801 // TODO: can we be smarter about machine pointer info? 
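  // Illustrative sketch of what this path produces (not literal MIR): for the
  // LOCAL address space, roughly
  //   %addr     = G_PTR_ADD %queue_ptr, 0x40
  //   %aperture = G_LOAD %addr :: (dereferenceable invariant load (s32))
  // i.e. a single scalar load of group_segment_aperture_base_hi out of the
  // amd_queue_t the queue pointer points at (offset 0x44 for PRIVATE).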
1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 PtrInfo, 1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1806 MachineMemOperand::MOInvariant, 1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1808 1809 Register LoadAddr; 1810 1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 MachineFunction &MF = B.getMF(); 1819 1820 const LLT S32 = LLT::scalar(32); 1821 Register Dst = MI.getOperand(0).getReg(); 1822 Register Src = MI.getOperand(1).getReg(); 1823 1824 LLT DstTy = MRI.getType(Dst); 1825 LLT SrcTy = MRI.getType(Src); 1826 unsigned DestAS = DstTy.getAddressSpace(); 1827 unsigned SrcAS = SrcTy.getAddressSpace(); 1828 1829 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1830 // vector element. 1831 assert(!DstTy.isVector()); 1832 1833 const AMDGPUTargetMachine &TM 1834 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1835 1836 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1837 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1838 return true; 1839 } 1840 1841 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1842 // Truncate. 1843 B.buildExtract(Dst, Src, 0); 1844 MI.eraseFromParent(); 1845 return true; 1846 } 1847 1848 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1849 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1850 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1851 1852 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1853 // another. Merge operands are required to be the same type, but creating an 1854 // extra ptrtoint would be kind of pointless. 1855 auto HighAddr = B.buildConstant( 1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1857 B.buildMerge(Dst, {Src, HighAddr}); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1863 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1864 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1865 unsigned NullVal = TM.getNullPointerValue(DestAS); 1866 1867 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1868 auto FlatNull = B.buildConstant(SrcTy, 0); 1869 1870 // Extract low 32-bits of the pointer. 1871 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1872 1873 auto CmpRes = 1874 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1875 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1876 1877 MI.eraseFromParent(); 1878 return true; 1879 } 1880 1881 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1882 return false; 1883 1884 if (!ST.hasFlatAddressSpace()) 1885 return false; 1886 1887 auto SegmentNull = 1888 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1889 auto FlatNull = 1890 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1891 1892 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1893 if (!ApertureReg.isValid()) 1894 return false; 1895 1896 auto CmpRes = 1897 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1898 1899 // Coerce the type of the low half of the result so we can use merge_values. 
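  // Rough shape of the segment-to-flat expansion built below (a sketch, not
  // literal MIR):
  //   %lo   = G_PTRTOINT %src              ; low 32 bits of the flat pointer
  //   %flat = G_MERGE_VALUES %lo, %aperture
  //   %dst  = G_SELECT (%src != segment null), %flat, flat null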
1900 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1901 1902 // TODO: Should we allow mismatched types but matching sizes in merges to 1903 // avoid the ptrtoint? 1904 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1905 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1906 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFrint( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 Register Src = MI.getOperand(1).getReg(); 1915 LLT Ty = MRI.getType(Src); 1916 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1917 1918 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1919 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1920 1921 auto C1 = B.buildFConstant(Ty, C1Val); 1922 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1923 1924 // TODO: Should this propagate fast-math-flags? 1925 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1926 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1927 1928 auto C2 = B.buildFConstant(Ty, C2Val); 1929 auto Fabs = B.buildFAbs(Ty, Src); 1930 1931 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1932 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1933 MI.eraseFromParent(); 1934 return true; 1935 } 1936 1937 bool AMDGPULegalizerInfo::legalizeFceil( 1938 MachineInstr &MI, MachineRegisterInfo &MRI, 1939 MachineIRBuilder &B) const { 1940 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S64 = LLT::scalar(64); 1943 1944 Register Src = MI.getOperand(1).getReg(); 1945 assert(MRI.getType(Src) == S64); 1946 1947 // result = trunc(src) 1948 // if (src > 0.0 && src != result) 1949 // result += 1.0 1950 1951 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1952 1953 const auto Zero = B.buildFConstant(S64, 0.0); 1954 const auto One = B.buildFConstant(S64, 1.0); 1955 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1956 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1957 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1958 auto Add = B.buildSelect(S64, And, One, Zero); 1959 1960 // TODO: Should this propagate fast-math-flags? 
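  // Informal worked example of the sequence above: for src = 2.5, trunc is
  // 2.0, src > 0 and src != trunc, so 1.0 is selected and the result is 3.0.
  // For src = -2.5, trunc is -2.0 and src > 0 fails, so 0.0 is added and the
  // result stays -2.0.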
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ?
B.buildSITOFP(S64, Unmerge.getReg(1)) 2061 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2062 2063 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2064 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2065 .addUse(CvtHi.getReg(0)) 2066 .addUse(ThirtyTwo.getReg(0)); 2067 2068 // TODO: Should this propagate fast-math-flags? 2069 B.buildFAdd(Dst, LdExp, CvtLo); 2070 MI.eraseFromParent(); 2071 return true; 2072 } 2073 2074 assert(MRI.getType(Dst) == S32); 2075 2076 auto One = B.buildConstant(S32, 1); 2077 2078 MachineInstrBuilder ShAmt; 2079 if (Signed) { 2080 auto ThirtyOne = B.buildConstant(S32, 31); 2081 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2082 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2083 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2084 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2085 /*HasSideEffects=*/false) 2086 .addUse(Unmerge.getReg(1)); 2087 auto LS2 = B.buildSub(S32, LS, One); 2088 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2089 } else 2090 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2091 auto Norm = B.buildShl(S64, Src, ShAmt); 2092 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2093 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2094 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2095 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2096 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2097 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2098 /*HasSideEffects=*/false) 2099 .addUse(FVal.getReg(0)) 2100 .addUse(Scale.getReg(0)); 2101 MI.eraseFromParent(); 2102 return true; 2103 } 2104 2105 // TODO: Copied from DAG implementation. Verify logic and document how this 2106 // actually works. 2107 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2108 MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B, 2110 bool Signed) const { 2111 2112 Register Dst = MI.getOperand(0).getReg(); 2113 Register Src = MI.getOperand(1).getReg(); 2114 2115 const LLT S64 = LLT::scalar(64); 2116 const LLT S32 = LLT::scalar(32); 2117 2118 const LLT SrcLT = MRI.getType(Src); 2119 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2120 2121 unsigned Flags = MI.getFlags(); 2122 2123 // The basic idea of converting a floating point number into a pair of 32-bit 2124 // integers is illustrated as follows: 2125 // 2126 // tf := trunc(val); 2127 // hif := floor(tf * 2^-32); 2128 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2129 // hi := fptoi(hif); 2130 // lo := fptoi(lof); 2131 // 2132 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2133 MachineInstrBuilder Sign; 2134 if (Signed && SrcLT == S32) { 2135 // However, a 32-bit floating point number has only 23 bits mantissa and 2136 // it's not enough to hold all the significant bits of `lof` if val is 2137 // negative. To avoid the loss of precision, We need to take the absolute 2138 // value after truncating and flip the result back based on the original 2139 // signedness. 
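    // Informal example: for val = -5.75 the arithmetic shift of the raw bits
    // gives Sign = all ones, |trunc| = 5.0 is split into hi = 0, lo = 5, and
    // the xor-with-sign followed by the subtract of Sign at the end of this
    // function turns the merged 5 back into -5 in the s64 result.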
2140 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2141 Trunc = B.buildFAbs(S32, Trunc, Flags); 2142 } 2143 MachineInstrBuilder K0, K1; 2144 if (SrcLT == S64) { 2145 K0 = B.buildFConstant(S64, 2146 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2147 K1 = B.buildFConstant(S64, 2148 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2149 } else { 2150 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2151 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2152 } 2153 2154 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2155 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2156 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2157 2158 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2159 : B.buildFPTOUI(S32, FloorMul); 2160 auto Lo = B.buildFPTOUI(S32, Fma); 2161 2162 if (Signed && SrcLT == S32) { 2163 // Flip the result based on the signedness, which is either all 0s or 1s. 2164 Sign = B.buildMerge(S64, {Sign, Sign}); 2165 // r := xor({lo, hi}, sign) - sign; 2166 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2167 } else 2168 B.buildMerge(Dst, {Lo, Hi}); 2169 MI.eraseFromParent(); 2170 2171 return true; 2172 } 2173 2174 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2175 MachineInstr &MI) const { 2176 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2177 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2178 2179 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2180 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2181 2182 // With ieee_mode disabled, the instructions have the correct behavior 2183 // already for G_FMINNUM/G_FMAXNUM 2184 if (!MFI->getMode().IEEE) 2185 return !IsIEEEOp; 2186 2187 if (IsIEEEOp) 2188 return true; 2189 2190 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2191 } 2192 2193 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2194 MachineInstr &MI, MachineRegisterInfo &MRI, 2195 MachineIRBuilder &B) const { 2196 // TODO: Should move some of this into LegalizerHelper. 2197 2198 // TODO: Promote dynamic indexing of s16 to s32 2199 2200 // FIXME: Artifact combiner probably should have replaced the truncated 2201 // constant before this, so we shouldn't need 2202 // getIConstantVRegValWithLookThrough. 2203 Optional<ValueAndVReg> MaybeIdxVal = 2204 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2205 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2206 return true; 2207 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2208 2209 Register Dst = MI.getOperand(0).getReg(); 2210 Register Vec = MI.getOperand(1).getReg(); 2211 2212 LLT VecTy = MRI.getType(Vec); 2213 LLT EltTy = VecTy.getElementType(); 2214 assert(EltTy == MRI.getType(Dst)); 2215 2216 if (IdxVal < VecTy.getNumElements()) { 2217 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2218 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2219 } else { 2220 B.buildUndef(Dst); 2221 } 2222 2223 MI.eraseFromParent(); 2224 return true; 2225 } 2226 2227 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2228 MachineInstr &MI, MachineRegisterInfo &MRI, 2229 MachineIRBuilder &B) const { 2230 // TODO: Should move some of this into LegalizerHelper. 2231 2232 // TODO: Promote dynamic indexing of s16 to s32 2233 2234 // FIXME: Artifact combiner probably should have replaced the truncated 2235 // constant before this, so we shouldn't need 2236 // getIConstantVRegValWithLookThrough. 
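  // Sketch of the constant-index path below: unmerge the vector into scalar
  // pieces, overwrite element IdxVal with the inserted value, and rebuild the
  // vector. Dynamic indices are left for register-indexing selection, and an
  // out-of-bounds constant index just produces an implicit_def.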
2237 Optional<ValueAndVReg> MaybeIdxVal = 2238 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2239 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2240 return true; 2241 2242 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2243 Register Dst = MI.getOperand(0).getReg(); 2244 Register Vec = MI.getOperand(1).getReg(); 2245 Register Ins = MI.getOperand(2).getReg(); 2246 2247 LLT VecTy = MRI.getType(Vec); 2248 LLT EltTy = VecTy.getElementType(); 2249 assert(EltTy == MRI.getType(Ins)); 2250 2251 unsigned NumElts = VecTy.getNumElements(); 2252 if (IdxVal < NumElts) { 2253 SmallVector<Register, 8> SrcRegs; 2254 for (unsigned i = 0; i < NumElts; ++i) 2255 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2256 B.buildUnmerge(SrcRegs, Vec); 2257 2258 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2259 B.buildMerge(Dst, SrcRegs); 2260 } else { 2261 B.buildUndef(Dst); 2262 } 2263 2264 MI.eraseFromParent(); 2265 return true; 2266 } 2267 2268 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2269 MachineInstr &MI, MachineRegisterInfo &MRI, 2270 MachineIRBuilder &B) const { 2271 const LLT V2S16 = LLT::fixed_vector(2, 16); 2272 2273 Register Dst = MI.getOperand(0).getReg(); 2274 Register Src0 = MI.getOperand(1).getReg(); 2275 LLT DstTy = MRI.getType(Dst); 2276 LLT SrcTy = MRI.getType(Src0); 2277 2278 if (SrcTy == V2S16 && DstTy == V2S16 && 2279 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2280 return true; 2281 2282 MachineIRBuilder HelperBuilder(MI); 2283 GISelObserverWrapper DummyObserver; 2284 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2285 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2286 } 2287 2288 bool AMDGPULegalizerInfo::legalizeSinCos( 2289 MachineInstr &MI, MachineRegisterInfo &MRI, 2290 MachineIRBuilder &B) const { 2291 2292 Register DstReg = MI.getOperand(0).getReg(); 2293 Register SrcReg = MI.getOperand(1).getReg(); 2294 LLT Ty = MRI.getType(DstReg); 2295 unsigned Flags = MI.getFlags(); 2296 2297 Register TrigVal; 2298 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2299 if (ST.hasTrigReducedRange()) { 2300 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2301 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2302 .addUse(MulVal.getReg(0)) 2303 .setMIFlags(Flags).getReg(0); 2304 } else 2305 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2306 2307 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2308 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2309 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2310 .addUse(TrigVal) 2311 .setMIFlags(Flags); 2312 MI.eraseFromParent(); 2313 return true; 2314 } 2315 2316 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2317 MachineIRBuilder &B, 2318 const GlobalValue *GV, 2319 int64_t Offset, 2320 unsigned GAFlags) const { 2321 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2322 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2323 // to the following code sequence: 2324 // 2325 // For constant address space: 2326 // s_getpc_b64 s[0:1] 2327 // s_add_u32 s0, s0, $symbol 2328 // s_addc_u32 s1, s1, 0 2329 // 2330 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2331 // a fixup or relocation is emitted to replace $symbol with a literal 2332 // constant, which is a pc-relative offset from the encoding of the $symbol 2333 // operand to the global variable. 
2334 // 2335 // For global address space: 2336 // s_getpc_b64 s[0:1] 2337 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2338 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2339 // 2340 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2341 // fixups or relocations are emitted to replace $symbol@*@lo and 2342 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2343 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2344 // operand to the global variable. 2345 // 2346 // What we want here is an offset from the value returned by s_getpc 2347 // (which is the address of the s_add_u32 instruction) to the global 2348 // variable, but since the encoding of $symbol starts 4 bytes after the start 2349 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2350 // small. This requires us to add 4 to the global variable offset in order to 2351 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2352 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2353 // instruction. 2354 2355 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2356 2357 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2358 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2359 2360 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2361 .addDef(PCReg); 2362 2363 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2364 if (GAFlags == SIInstrInfo::MO_NONE) 2365 MIB.addImm(0); 2366 else 2367 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2368 2369 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2370 2371 if (PtrTy.getSizeInBits() == 32) 2372 B.buildExtract(DstReg, PCReg, 0); 2373 return true; 2374 } 2375 2376 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2377 MachineInstr &MI, MachineRegisterInfo &MRI, 2378 MachineIRBuilder &B) const { 2379 Register DstReg = MI.getOperand(0).getReg(); 2380 LLT Ty = MRI.getType(DstReg); 2381 unsigned AS = Ty.getAddressSpace(); 2382 2383 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2384 MachineFunction &MF = B.getMF(); 2385 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2386 2387 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2388 if (!MFI->isModuleEntryFunction() && 2389 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2390 const Function &Fn = MF.getFunction(); 2391 DiagnosticInfoUnsupported BadLDSDecl( 2392 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2393 DS_Warning); 2394 Fn.getContext().diagnose(BadLDSDecl); 2395 2396 // We currently don't have a way to correctly allocate LDS objects that 2397 // aren't directly associated with a kernel. We do force inlining of 2398 // functions that use local objects. However, if these dead functions are 2399 // not eliminated, we don't want a compile time error. Just emit a warning 2400 // and a trap, since there should be no callable path here. 2401 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2402 B.buildUndef(DstReg); 2403 MI.eraseFromParent(); 2404 return true; 2405 } 2406 2407 // TODO: We could emit code to handle the initialization somewhere. 2408 // We ignore the initializer for now and legalize it to allow selection. 2409 // The initializer will anyway get errored out during assembly emission. 
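    // In the common case the LDS global simply becomes a constant offset into
    // the kernel's LDS block (allocateLDSGlobal below); a zero-sized
    // "extern __shared__" style array instead resolves to the static group
    // segment size, so it ends up addressing the dynamic region placed after
    // the static allocations.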
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory whose size is not known at compile time. They will be
      // allocated by the runtime and placed directly after the statically
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
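    // (The GOT entry itself is always a 64-bit constant-address pointer; only
    // the value handed back for a 32-bit address space is narrowed here.)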
2469 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2470 B.buildExtract(DstReg, Load, 0); 2471 } else 2472 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2473 2474 MI.eraseFromParent(); 2475 return true; 2476 } 2477 2478 static LLT widenToNextPowerOf2(LLT Ty) { 2479 if (Ty.isVector()) 2480 return Ty.changeElementCount( 2481 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2482 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2483 } 2484 2485 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2486 MachineInstr &MI) const { 2487 MachineIRBuilder &B = Helper.MIRBuilder; 2488 MachineRegisterInfo &MRI = *B.getMRI(); 2489 GISelChangeObserver &Observer = Helper.Observer; 2490 2491 Register PtrReg = MI.getOperand(1).getReg(); 2492 LLT PtrTy = MRI.getType(PtrReg); 2493 unsigned AddrSpace = PtrTy.getAddressSpace(); 2494 2495 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2496 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2497 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2498 Observer.changingInstr(MI); 2499 MI.getOperand(1).setReg(Cast.getReg(0)); 2500 Observer.changedInstr(MI); 2501 return true; 2502 } 2503 2504 if (MI.getOpcode() != AMDGPU::G_LOAD) 2505 return false; 2506 2507 Register ValReg = MI.getOperand(0).getReg(); 2508 LLT ValTy = MRI.getType(ValReg); 2509 2510 MachineMemOperand *MMO = *MI.memoperands_begin(); 2511 const unsigned ValSize = ValTy.getSizeInBits(); 2512 const LLT MemTy = MMO->getMemoryType(); 2513 const Align MemAlign = MMO->getAlign(); 2514 const unsigned MemSize = MemTy.getSizeInBits(); 2515 const unsigned AlignInBits = 8 * MemAlign.value(); 2516 2517 // Widen non-power-of-2 loads to the alignment if needed 2518 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2519 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2520 2521 // This was already the correct extending load result type, so just adjust 2522 // the memory type. 2523 if (WideMemSize == ValSize) { 2524 MachineFunction &MF = B.getMF(); 2525 2526 MachineMemOperand *WideMMO = 2527 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2528 Observer.changingInstr(MI); 2529 MI.setMemRefs(MF, {WideMMO}); 2530 Observer.changedInstr(MI); 2531 return true; 2532 } 2533 2534 // Don't bother handling edge case that should probably never be produced. 2535 if (ValSize > WideMemSize) 2536 return false; 2537 2538 LLT WideTy = widenToNextPowerOf2(ValTy); 2539 2540 Register WideLoad; 2541 if (!WideTy.isVector()) { 2542 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2543 B.buildTrunc(ValReg, WideLoad).getReg(0); 2544 } else { 2545 // Extract the subvector. 2546 2547 if (isRegisterType(ValTy)) { 2548 // If this a case where G_EXTRACT is legal, use it. 2549 // (e.g. <3 x s32> -> <4 x s32>) 2550 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2551 B.buildExtract(ValReg, WideLoad, 0); 2552 } else { 2553 // For cases where the widened type isn't a nice register value, unmerge 2554 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2555 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2556 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2557 } 2558 } 2559 2560 MI.eraseFromParent(); 2561 return true; 2562 } 2563 2564 return false; 2565 } 2566 2567 bool AMDGPULegalizerInfo::legalizeFMad( 2568 MachineInstr &MI, MachineRegisterInfo &MRI, 2569 MachineIRBuilder &B) const { 2570 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2571 assert(Ty.isScalar()); 2572 2573 MachineFunction &MF = B.getMF(); 2574 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2575 2576 // TODO: Always legal with future ftz flag. 2577 // FIXME: Do we need just output? 2578 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2579 return true; 2580 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2581 return true; 2582 2583 MachineIRBuilder HelperBuilder(MI); 2584 GISelObserverWrapper DummyObserver; 2585 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2586 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2587 } 2588 2589 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2590 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2591 Register DstReg = MI.getOperand(0).getReg(); 2592 Register PtrReg = MI.getOperand(1).getReg(); 2593 Register CmpVal = MI.getOperand(2).getReg(); 2594 Register NewVal = MI.getOperand(3).getReg(); 2595 2596 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2597 "this should not have been custom lowered"); 2598 2599 LLT ValTy = MRI.getType(CmpVal); 2600 LLT VecTy = LLT::fixed_vector(2, ValTy); 2601 2602 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2603 2604 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2605 .addDef(DstReg) 2606 .addUse(PtrReg) 2607 .addUse(PackedVal) 2608 .setMemRefs(MI.memoperands()); 2609 2610 MI.eraseFromParent(); 2611 return true; 2612 } 2613 2614 bool AMDGPULegalizerInfo::legalizeFlog( 2615 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2616 Register Dst = MI.getOperand(0).getReg(); 2617 Register Src = MI.getOperand(1).getReg(); 2618 LLT Ty = B.getMRI()->getType(Dst); 2619 unsigned Flags = MI.getFlags(); 2620 2621 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2622 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2623 2624 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2625 MI.eraseFromParent(); 2626 return true; 2627 } 2628 2629 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2630 MachineIRBuilder &B) const { 2631 Register Dst = MI.getOperand(0).getReg(); 2632 Register Src = MI.getOperand(1).getReg(); 2633 unsigned Flags = MI.getFlags(); 2634 LLT Ty = B.getMRI()->getType(Dst); 2635 2636 auto K = B.buildFConstant(Ty, numbers::log2e); 2637 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2638 B.buildFExp2(Dst, Mul, Flags); 2639 MI.eraseFromParent(); 2640 return true; 2641 } 2642 2643 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2644 MachineIRBuilder &B) const { 2645 Register Dst = MI.getOperand(0).getReg(); 2646 Register Src0 = MI.getOperand(1).getReg(); 2647 Register Src1 = MI.getOperand(2).getReg(); 2648 unsigned Flags = MI.getFlags(); 2649 LLT Ty = B.getMRI()->getType(Dst); 2650 const LLT S16 = LLT::scalar(16); 2651 const LLT S32 = LLT::scalar(32); 2652 2653 if (Ty == S32) { 2654 auto Log = B.buildFLog2(S32, Src0, Flags); 2655 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2656 
.addUse(Log.getReg(0)) 2657 .addUse(Src1) 2658 .setMIFlags(Flags); 2659 B.buildFExp2(Dst, Mul, Flags); 2660 } else if (Ty == S16) { 2661 // There's no f16 fmul_legacy, so we need to convert for it. 2662 auto Log = B.buildFLog2(S16, Src0, Flags); 2663 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2664 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2665 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2666 .addUse(Ext0.getReg(0)) 2667 .addUse(Ext1.getReg(0)) 2668 .setMIFlags(Flags); 2669 2670 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2671 } else 2672 return false; 2673 2674 MI.eraseFromParent(); 2675 return true; 2676 } 2677 2678 // Find a source register, ignoring any possible source modifiers. 2679 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2680 Register ModSrc = OrigSrc; 2681 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2682 ModSrc = SrcFNeg->getOperand(1).getReg(); 2683 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2684 ModSrc = SrcFAbs->getOperand(1).getReg(); 2685 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2686 ModSrc = SrcFAbs->getOperand(1).getReg(); 2687 return ModSrc; 2688 } 2689 2690 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2691 MachineRegisterInfo &MRI, 2692 MachineIRBuilder &B) const { 2693 2694 const LLT S1 = LLT::scalar(1); 2695 const LLT S64 = LLT::scalar(64); 2696 Register Dst = MI.getOperand(0).getReg(); 2697 Register OrigSrc = MI.getOperand(1).getReg(); 2698 unsigned Flags = MI.getFlags(); 2699 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2700 "this should not have been custom lowered"); 2701 2702 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2703 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2704 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2705 // V_FRACT bug is: 2706 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2707 // 2708 // Convert floor(x) to (x - fract(x)) 2709 2710 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2711 .addUse(OrigSrc) 2712 .setMIFlags(Flags); 2713 2714 // Give source modifier matching some assistance before obscuring a foldable 2715 // pattern. 2716 2717 // TODO: We can avoid the neg on the fract? The input sign to fract 2718 // shouldn't matter? 2719 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2720 2721 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2722 2723 Register Min = MRI.createGenericVirtualRegister(S64); 2724 2725 // We don't need to concern ourselves with the snan handling difference, so 2726 // use the one which will directly select. 2727 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2728 if (MFI->getMode().IEEE) 2729 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2730 else 2731 B.buildFMinNum(Min, Fract, Const, Flags); 2732 2733 Register CorrectedFract = Min; 2734 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2735 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2736 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2737 } 2738 2739 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2740 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2741 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 // Turn an illegal packed v2s16 build vector into bit operations. 
2747 // TODO: This should probably be a bitcast action in LegalizerHelper. 2748 bool AMDGPULegalizerInfo::legalizeBuildVector( 2749 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2750 Register Dst = MI.getOperand(0).getReg(); 2751 const LLT S32 = LLT::scalar(32); 2752 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2753 2754 Register Src0 = MI.getOperand(1).getReg(); 2755 Register Src1 = MI.getOperand(2).getReg(); 2756 assert(MRI.getType(Src0) == LLT::scalar(16)); 2757 2758 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2759 B.buildBitcast(Dst, Merge); 2760 2761 MI.eraseFromParent(); 2762 return true; 2763 } 2764 2765 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2766 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2767 // case with a single min instruction instead of a compare+select. 2768 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2769 MachineRegisterInfo &MRI, 2770 MachineIRBuilder &B) const { 2771 Register Dst = MI.getOperand(0).getReg(); 2772 Register Src = MI.getOperand(1).getReg(); 2773 LLT DstTy = MRI.getType(Dst); 2774 LLT SrcTy = MRI.getType(Src); 2775 2776 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2777 ? AMDGPU::G_AMDGPU_FFBH_U32 2778 : AMDGPU::G_AMDGPU_FFBL_B32; 2779 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2780 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2781 2782 MI.eraseFromParent(); 2783 return true; 2784 } 2785 2786 // Check that this is a G_XOR x, -1 2787 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2788 if (MI.getOpcode() != TargetOpcode::G_XOR) 2789 return false; 2790 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2791 return ConstVal && *ConstVal == -1; 2792 } 2793 2794 // Return the use branch instruction, otherwise null if the usage is invalid. 2795 static MachineInstr * 2796 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2797 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2798 Register CondDef = MI.getOperand(0).getReg(); 2799 if (!MRI.hasOneNonDBGUse(CondDef)) 2800 return nullptr; 2801 2802 MachineBasicBlock *Parent = MI.getParent(); 2803 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2804 2805 if (isNot(MRI, *UseMI)) { 2806 Register NegatedCond = UseMI->getOperand(0).getReg(); 2807 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2808 return nullptr; 2809 2810 // We're deleting the def of this value, so we need to remove it. 2811 eraseInstr(*UseMI, MRI); 2812 2813 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2814 Negated = true; 2815 } 2816 2817 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2818 return nullptr; 2819 2820 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2821 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2822 if (Next == Parent->end()) { 2823 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2824 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2825 return nullptr; 2826 UncondBrTarget = &*NextMBB; 2827 } else { 2828 if (Next->getOpcode() != AMDGPU::G_BR) 2829 return nullptr; 2830 Br = &*Next; 2831 UncondBrTarget = Br->getOperand(0).getMBB(); 2832 } 2833 2834 return UseMI; 2835 } 2836 2837 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2838 const ArgDescriptor *Arg, 2839 const TargetRegisterClass *ArgRC, 2840 LLT ArgTy) const { 2841 MCRegister SrcReg = Arg->getRegister(); 2842 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2843 assert(DstReg.isVirtual() && "Virtual register expected"); 2844 2845 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2846 ArgTy); 2847 if (Arg->isMasked()) { 2848 // TODO: Should we try to emit this once in the entry block? 2849 const LLT S32 = LLT::scalar(32); 2850 const unsigned Mask = Arg->getMask(); 2851 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2852 2853 Register AndMaskSrc = LiveIn; 2854 2855 if (Shift != 0) { 2856 auto ShiftAmt = B.buildConstant(S32, Shift); 2857 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2858 } 2859 2860 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2861 } else { 2862 B.buildCopy(DstReg, LiveIn); 2863 } 2864 2865 return true; 2866 } 2867 2868 bool AMDGPULegalizerInfo::loadInputValue( 2869 Register DstReg, MachineIRBuilder &B, 2870 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2871 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2872 const ArgDescriptor *Arg; 2873 const TargetRegisterClass *ArgRC; 2874 LLT ArgTy; 2875 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2876 2877 if (!Arg) { 2878 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2879 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2880 // case the pointer argument may be missing and we use null. 2881 B.buildConstant(DstReg, 0); 2882 return true; 2883 } 2884 2885 // It's undefined behavior if a function marked with the amdgpu-no-* 2886 // attributes uses the corresponding intrinsic. 
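    // (For example, a function carrying "amdgpu-no-workitem-id-x" that still
    // calls llvm.amdgcn.workitem.id.x simply gets an undef value here rather
    // than a diagnostic.)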
2887 B.buildUndef(DstReg); 2888 return true; 2889 } 2890 2891 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2892 return false; // TODO: Handle these 2893 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2894 } 2895 2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2897 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2898 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2899 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2900 return false; 2901 2902 MI.eraseFromParent(); 2903 return true; 2904 } 2905 2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2907 MachineRegisterInfo &MRI, 2908 MachineIRBuilder &B) const { 2909 Register Dst = MI.getOperand(0).getReg(); 2910 LLT DstTy = MRI.getType(Dst); 2911 LLT S16 = LLT::scalar(16); 2912 LLT S32 = LLT::scalar(32); 2913 LLT S64 = LLT::scalar(64); 2914 2915 if (DstTy == S16) 2916 return legalizeFDIV16(MI, MRI, B); 2917 if (DstTy == S32) 2918 return legalizeFDIV32(MI, MRI, B); 2919 if (DstTy == S64) 2920 return legalizeFDIV64(MI, MRI, B); 2921 2922 return false; 2923 } 2924 2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2926 Register DstDivReg, 2927 Register DstRemReg, 2928 Register X, 2929 Register Y) const { 2930 const LLT S1 = LLT::scalar(1); 2931 const LLT S32 = LLT::scalar(32); 2932 2933 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2934 // algorithm used here. 2935 2936 // Initial estimate of inv(y). 2937 auto FloatY = B.buildUITOFP(S32, Y); 2938 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2939 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2940 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2941 auto Z = B.buildFPTOUI(S32, ScaledY); 2942 2943 // One round of UNR. 2944 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2945 auto NegYZ = B.buildMul(S32, NegY, Z); 2946 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2947 2948 // Quotient/remainder estimate. 2949 auto Q = B.buildUMulH(S32, X, Z); 2950 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2951 2952 // First quotient/remainder refinement. 2953 auto One = B.buildConstant(S32, 1); 2954 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2955 if (DstDivReg) 2956 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2957 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2958 2959 // Second quotient/remainder refinement. 
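  // (Same correction as the first refinement: if the running remainder is
  // still >= Y, bump the quotient and subtract Y once more; the estimate left
  // by the single Newton-Raphson step above needs at most these two
  // corrections.)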
2960 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2961 if (DstDivReg) 2962 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2963 2964 if (DstRemReg) 2965 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2966 } 2967 2968 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2969 // 2970 // Return lo, hi of result 2971 // 2972 // %cvt.lo = G_UITOFP Val.lo 2973 // %cvt.hi = G_UITOFP Val.hi 2974 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2975 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2976 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2977 // %mul2 = G_FMUL %mul1, 2**(-32) 2978 // %trunc = G_INTRINSIC_TRUNC %mul2 2979 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2980 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2981 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2982 Register Val) { 2983 const LLT S32 = LLT::scalar(32); 2984 auto Unmerge = B.buildUnmerge(S32, Val); 2985 2986 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2987 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2988 2989 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2990 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2991 2992 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2993 auto Mul1 = 2994 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2995 2996 // 2**(-32) 2997 auto Mul2 = 2998 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2999 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3000 3001 // -(2**32) 3002 auto Mad2 = B.buildFMAD(S32, Trunc, 3003 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3004 3005 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3006 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3007 3008 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3009 } 3010 3011 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3012 Register DstDivReg, 3013 Register DstRemReg, 3014 Register Numer, 3015 Register Denom) const { 3016 const LLT S32 = LLT::scalar(32); 3017 const LLT S64 = LLT::scalar(64); 3018 const LLT S1 = LLT::scalar(1); 3019 Register RcpLo, RcpHi; 3020 3021 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3022 3023 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3024 3025 auto Zero64 = B.buildConstant(S64, 0); 3026 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3027 3028 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3029 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3030 3031 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3032 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3033 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3034 3035 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3036 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3037 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3038 3039 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3040 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3041 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3042 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3043 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3044 3045 auto Zero32 = B.buildConstant(S32, 0); 3046 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3047 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3048 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3049 3050 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3051 Register NumerLo = UnmergeNumer.getReg(0); 3052 Register NumerHi = UnmergeNumer.getReg(1); 3053 3054 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3055 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3056 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3057 Register Mul3_Lo = UnmergeMul3.getReg(0); 3058 Register Mul3_Hi = UnmergeMul3.getReg(1); 3059 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3060 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3061 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3062 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3063 3064 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3065 Register DenomLo = UnmergeDenom.getReg(0); 3066 Register DenomHi = UnmergeDenom.getReg(1); 3067 3068 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3069 auto C1 = B.buildSExt(S32, CmpHi); 3070 3071 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3072 auto C2 = B.buildSExt(S32, CmpLo); 3073 3074 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3075 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3076 3077 // TODO: Here and below portions of the code can be enclosed into if/endif. 3078 // Currently control flow is unconditional and we have 4 selects after 3079 // potential endif to substitute PHIs. 3080 3081 // if C3 != 0 ... 3082 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3083 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3084 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3085 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3086 3087 auto One64 = B.buildConstant(S64, 1); 3088 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3089 3090 auto C4 = 3091 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3092 auto C5 = 3093 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3094 auto C6 = B.buildSelect( 3095 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3096 3097 // if (C6 != 0) 3098 auto Add4 = B.buildAdd(S64, Add3, One64); 3099 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3100 3101 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3102 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3103 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3104 3105 // endif C6 3106 // endif C3 3107 3108 if (DstDivReg) { 3109 auto Sel1 = B.buildSelect( 3110 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3111 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3112 Sel1, MulHi3); 3113 } 3114 3115 if (DstRemReg) { 3116 auto Sel2 = B.buildSelect( 3117 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3118 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3119 Sel2, Sub1); 3120 } 3121 } 3122 3123 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3124 MachineRegisterInfo &MRI, 3125 MachineIRBuilder &B) const { 3126 Register DstDivReg, DstRemReg; 3127 switch (MI.getOpcode()) { 3128 default: 3129 llvm_unreachable("Unexpected opcode!"); 3130 case AMDGPU::G_UDIV: { 3131 DstDivReg = MI.getOperand(0).getReg(); 3132 break; 3133 } 3134 case AMDGPU::G_UREM: { 3135 DstRemReg = MI.getOperand(0).getReg(); 3136 break; 3137 } 3138 case AMDGPU::G_UDIVREM: { 3139 DstDivReg = MI.getOperand(0).getReg(); 3140 DstRemReg = MI.getOperand(1).getReg(); 3141 break; 3142 } 3143 } 3144 3145 const LLT S64 = LLT::scalar(64); 3146 const LLT S32 = LLT::scalar(32); 3147 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3148 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3149 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3150 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3151 3152 if (Ty == S32) 3153 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3154 else if (Ty == S64) 3155 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3156 else 3157 return false; 3158 3159 MI.eraseFromParent(); 3160 return true; 3161 } 3162 3163 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3164 MachineRegisterInfo &MRI, 3165 MachineIRBuilder &B) const { 3166 const LLT S64 = LLT::scalar(64); 3167 const LLT S32 = LLT::scalar(32); 3168 3169 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3170 if (Ty != S32 && Ty != S64) 3171 return false; 3172 3173 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3174 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3175 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3176 3177 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3178 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3179 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3180 3181 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3182 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3183 3184 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3185 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3186 3187 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3188 switch (MI.getOpcode()) { 3189 default: 3190 llvm_unreachable("Unexpected opcode!"); 3191 case AMDGPU::G_SDIV: { 3192 DstDivReg = MI.getOperand(0).getReg(); 3193 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3194 break; 3195 } 3196 case AMDGPU::G_SREM: { 3197 DstRemReg = MI.getOperand(0).getReg(); 3198 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3199 break; 3200 } 3201 case AMDGPU::G_SDIVREM: { 3202 DstDivReg = MI.getOperand(0).getReg(); 3203 DstRemReg = MI.getOperand(1).getReg(); 3204 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3205 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3206 break; 3207 } 3208 } 3209 3210 if (Ty == S32) 3211 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3212 else 3213 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3214 3215 if (DstDivReg) { 3216 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3217 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3218 B.buildSub(DstDivReg, SignXor, Sign); 3219 } 3220 3221 if (DstRemReg) { 3222 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3223 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3224 B.buildSub(DstRemReg, SignXor, Sign); 3225 } 3226 3227 MI.eraseFromParent(); 3228 return true; 3229 } 3230 3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3232 MachineRegisterInfo &MRI, 3233 MachineIRBuilder &B) const { 3234 Register Res = MI.getOperand(0).getReg(); 3235 Register LHS = MI.getOperand(1).getReg(); 3236 Register RHS = MI.getOperand(2).getReg(); 3237 uint16_t Flags = MI.getFlags(); 3238 LLT ResTy = MRI.getType(Res); 3239 3240 const MachineFunction &MF = B.getMF(); 3241 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3242 MI.getFlag(MachineInstr::FmAfn); 3243 3244 if (!AllowInaccurateRcp) 3245 return false; 3246 3247 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3248 // 1 / x -> RCP(x) 3249 if (CLHS->isExactlyValue(1.0)) { 3250 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3251 .addUse(RHS) 3252 .setMIFlags(Flags); 3253 3254 MI.eraseFromParent(); 3255 return true; 3256 } 3257 3258 // -1 / x -> RCP( FNEG(x) ) 3259 if (CLHS->isExactlyValue(-1.0)) { 3260 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
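      // rcp(fneg(x)) == -1/x, so the negation is folded into the operand
      // instead of negating the reciprocal afterwards.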
3261 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3262 .addUse(FNeg.getReg(0)) 3263 .setMIFlags(Flags); 3264 3265 MI.eraseFromParent(); 3266 return true; 3267 } 3268 } 3269 3270 // x / y -> x * (1.0 / y) 3271 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3272 .addUse(RHS) 3273 .setMIFlags(Flags); 3274 B.buildFMul(Res, LHS, RCP, Flags); 3275 3276 MI.eraseFromParent(); 3277 return true; 3278 } 3279 3280 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3281 MachineRegisterInfo &MRI, 3282 MachineIRBuilder &B) const { 3283 Register Res = MI.getOperand(0).getReg(); 3284 Register X = MI.getOperand(1).getReg(); 3285 Register Y = MI.getOperand(2).getReg(); 3286 uint16_t Flags = MI.getFlags(); 3287 LLT ResTy = MRI.getType(Res); 3288 3289 const MachineFunction &MF = B.getMF(); 3290 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3291 MI.getFlag(MachineInstr::FmAfn); 3292 3293 if (!AllowInaccurateRcp) 3294 return false; 3295 3296 auto NegY = B.buildFNeg(ResTy, Y); 3297 auto One = B.buildFConstant(ResTy, 1.0); 3298 3299 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3300 .addUse(Y) 3301 .setMIFlags(Flags); 3302 3303 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3304 R = B.buildFMA(ResTy, Tmp0, R, R); 3305 3306 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3307 R = B.buildFMA(ResTy, Tmp1, R, R); 3308 3309 auto Ret = B.buildFMul(ResTy, X, R); 3310 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3311 3312 B.buildFMA(Res, Tmp2, R, Ret); 3313 MI.eraseFromParent(); 3314 return true; 3315 } 3316 3317 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3318 MachineRegisterInfo &MRI, 3319 MachineIRBuilder &B) const { 3320 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3321 return true; 3322 3323 Register Res = MI.getOperand(0).getReg(); 3324 Register LHS = MI.getOperand(1).getReg(); 3325 Register RHS = MI.getOperand(2).getReg(); 3326 3327 uint16_t Flags = MI.getFlags(); 3328 3329 LLT S16 = LLT::scalar(16); 3330 LLT S32 = LLT::scalar(32); 3331 3332 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3333 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3334 3335 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3336 .addUse(RHSExt.getReg(0)) 3337 .setMIFlags(Flags); 3338 3339 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3340 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3341 3342 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3343 .addUse(RDst.getReg(0)) 3344 .addUse(RHS) 3345 .addUse(LHS) 3346 .setMIFlags(Flags); 3347 3348 MI.eraseFromParent(); 3349 return true; 3350 } 3351 3352 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3353 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3354 static void toggleSPDenormMode(bool Enable, 3355 MachineIRBuilder &B, 3356 const GCNSubtarget &ST, 3357 AMDGPU::SIModeRegisterDefaults Mode) { 3358 // Set SP denorm mode to this value. 3359 unsigned SPDenormMode = 3360 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3361 3362 if (ST.hasDenormModeInst()) { 3363 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3364 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3365 3366 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3367 B.buildInstr(AMDGPU::S_DENORM_MODE) 3368 .addImm(NewDenormModeValue); 3369 3370 } else { 3371 // Select FP32 bit field in mode register. 
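    // This corresponds to hwreg(HW_REG_MODE, 4, 2) in assembly syntax: offset
    // 4, width 2 covers the two FP32 denorm bits of the MODE register (the
    // width is stored as width - 1, hence the 1 below).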
3372 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3373 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3374 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3375 3376 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3377 .addImm(SPDenormMode) 3378 .addImm(SPDenormModeBitField); 3379 } 3380 } 3381 3382 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3383 MachineRegisterInfo &MRI, 3384 MachineIRBuilder &B) const { 3385 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3386 return true; 3387 3388 Register Res = MI.getOperand(0).getReg(); 3389 Register LHS = MI.getOperand(1).getReg(); 3390 Register RHS = MI.getOperand(2).getReg(); 3391 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3392 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3393 3394 uint16_t Flags = MI.getFlags(); 3395 3396 LLT S32 = LLT::scalar(32); 3397 LLT S1 = LLT::scalar(1); 3398 3399 auto One = B.buildFConstant(S32, 1.0f); 3400 3401 auto DenominatorScaled = 3402 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3403 .addUse(LHS) 3404 .addUse(RHS) 3405 .addImm(0) 3406 .setMIFlags(Flags); 3407 auto NumeratorScaled = 3408 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3409 .addUse(LHS) 3410 .addUse(RHS) 3411 .addImm(1) 3412 .setMIFlags(Flags); 3413 3414 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3415 .addUse(DenominatorScaled.getReg(0)) 3416 .setMIFlags(Flags); 3417 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3418 3419 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3420 // aren't modeled as reading it. 3421 if (!Mode.allFP32Denormals()) 3422 toggleSPDenormMode(true, B, ST, Mode); 3423 3424 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3425 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3426 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3427 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3428 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3429 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3430 3431 if (!Mode.allFP32Denormals()) 3432 toggleSPDenormMode(false, B, ST, Mode); 3433 3434 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3435 .addUse(Fma4.getReg(0)) 3436 .addUse(Fma1.getReg(0)) 3437 .addUse(Fma3.getReg(0)) 3438 .addUse(NumeratorScaled.getReg(1)) 3439 .setMIFlags(Flags); 3440 3441 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3442 .addUse(Fmas.getReg(0)) 3443 .addUse(RHS) 3444 .addUse(LHS) 3445 .setMIFlags(Flags); 3446 3447 MI.eraseFromParent(); 3448 return true; 3449 } 3450 3451 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3452 MachineRegisterInfo &MRI, 3453 MachineIRBuilder &B) const { 3454 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3455 return true; 3456 3457 Register Res = MI.getOperand(0).getReg(); 3458 Register LHS = MI.getOperand(1).getReg(); 3459 Register RHS = MI.getOperand(2).getReg(); 3460 3461 uint16_t Flags = MI.getFlags(); 3462 3463 LLT S64 = LLT::scalar(64); 3464 LLT S1 = LLT::scalar(1); 3465 3466 auto One = B.buildFConstant(S64, 1.0); 3467 3468 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3469 .addUse(LHS) 3470 .addUse(RHS) 3471 .addImm(0) 3472 .setMIFlags(Flags); 3473 3474 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3475 3476 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3477 .addUse(DivScale0.getReg(0)) 3478 .setMIFlags(Flags); 3479 3480 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3481 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3482 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3483 3484 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3485 .addUse(LHS) 3486 .addUse(RHS) 3487 .addImm(1) 3488 .setMIFlags(Flags); 3489 3490 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3491 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3492 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3493 3494 Register Scale; 3495 if (!ST.hasUsableDivScaleConditionOutput()) { 3496 // Workaround a hardware bug on SI where the condition output from div_scale 3497 // is not usable. 3498 3499 LLT S32 = LLT::scalar(32); 3500 3501 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3502 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3503 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3504 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3505 3506 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3507 Scale1Unmerge.getReg(1)); 3508 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3509 Scale0Unmerge.getReg(1)); 3510 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3511 } else { 3512 Scale = DivScale1.getReg(1); 3513 } 3514 3515 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3516 .addUse(Fma4.getReg(0)) 3517 .addUse(Fma3.getReg(0)) 3518 .addUse(Mul.getReg(0)) 3519 .addUse(Scale) 3520 .setMIFlags(Flags); 3521 3522 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3523 .addUse(Fmas.getReg(0)) 3524 .addUse(RHS) 3525 .addUse(LHS) 3526 .setMIFlags(Flags); 3527 3528 MI.eraseFromParent(); 3529 return true; 3530 } 3531 3532 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3533 MachineRegisterInfo &MRI, 3534 MachineIRBuilder &B) const { 3535 Register Res = MI.getOperand(0).getReg(); 3536 Register LHS = MI.getOperand(2).getReg(); 3537 Register RHS = MI.getOperand(3).getReg(); 3538 uint16_t Flags = MI.getFlags(); 3539 3540 LLT S32 = LLT::scalar(32); 3541 LLT S1 = LLT::scalar(1); 3542 3543 auto Abs = B.buildFAbs(S32, RHS, Flags); 3544 const APFloat C0Val(1.0f); 3545 3546 auto C0 = B.buildConstant(S32, 0x6f800000); 3547 auto C1 = B.buildConstant(S32, 0x2f800000); 3548 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3549 3550 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3551 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3552 3553 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3554 3555 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3556 .addUse(Mul0.getReg(0)) 3557 .setMIFlags(Flags); 3558 3559 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3560 3561 B.buildFMul(Res, Sel, Mul1, Flags); 3562 3563 MI.eraseFromParent(); 3564 return true; 3565 } 3566 3567 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3568 // FIXME: Why do we handle this one but not other removed instructions? 3569 // 3570 // Reciprocal square root. The clamp prevents infinite results, clamping 3571 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3572 // +-max_float. 
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference here:
  // the rsq output has already been quieted (or not), so just use whichever
  // min/max variant will directly select for this function's FP mode.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
3643 Observer.changedInstr(MI); 3644 return true; 3645 } 3646 3647 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3648 MachineRegisterInfo &MRI, 3649 MachineIRBuilder &B) const { 3650 uint64_t Offset = 3651 ST.getTargetLowering()->getImplicitParameterOffset( 3652 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3653 LLT DstTy = MRI.getType(DstReg); 3654 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3655 3656 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3657 if (!loadInputValue(KernargPtrReg, B, 3658 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3659 return false; 3660 3661 // FIXME: This should be nuw 3662 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3663 return true; 3664 } 3665 3666 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3667 MachineRegisterInfo &MRI, 3668 MachineIRBuilder &B) const { 3669 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3670 if (!MFI->isEntryFunction()) { 3671 return legalizePreloadedArgIntrin(MI, MRI, B, 3672 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3673 } 3674 3675 Register DstReg = MI.getOperand(0).getReg(); 3676 if (!getImplicitArgPtr(DstReg, MRI, B)) 3677 return false; 3678 3679 MI.eraseFromParent(); 3680 return true; 3681 } 3682 3683 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3684 MachineRegisterInfo &MRI, 3685 MachineIRBuilder &B, 3686 unsigned AddrSpace) const { 3687 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3688 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3689 Register Hi32 = Unmerge.getReg(1); 3690 3691 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3692 MI.eraseFromParent(); 3693 return true; 3694 } 3695 3696 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3697 // offset (the offset that is included in bounds checking and swizzling, to be 3698 // split between the instruction's voffset and immoffset fields) and soffset 3699 // (the offset that is excluded from bounds checking and swizzling, to go in 3700 // the instruction's soffset field). This function takes the first kind of 3701 // offset and figures out how to split it between voffset and immoffset. 3702 std::pair<Register, unsigned> 3703 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3704 Register OrigOffset) const { 3705 const unsigned MaxImm = 4095; 3706 Register BaseReg; 3707 unsigned ImmOffset; 3708 const LLT S32 = LLT::scalar(32); 3709 MachineRegisterInfo &MRI = *B.getMRI(); 3710 3711 std::tie(BaseReg, ImmOffset) = 3712 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3713 3714 // If BaseReg is a pointer, convert it to int. 3715 if (MRI.getType(BaseReg).isPointer()) 3716 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3717 3718 // If the immediate value is too big for the immoffset field, put the value 3719 // and -4096 into the immoffset field so that the value that is copied/added 3720 // for the voffset field is a multiple of 4096, and it stands more chance 3721 // of being CSEd with the copy/add for another similar load/store. 3722 // However, do not do that rounding down to a multiple of 4096 if that is a 3723 // negative number, as it appears to be illegal to have a negative offset 3724 // in the vgpr, even if adding the immediate offset makes it positive. 
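  // Illustrative example (not from the source): for ImmOffset = 8196 the split
  // below produces Overflow = 8192 for the voffset add and ImmOffset = 4 for
  // the immediate field; for a negative ImmOffset such as -16, the whole value
  // is pushed into the voffset register and the immediate becomes 0.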
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}

/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
                                          Register VOffset, Register SOffset,
                                          unsigned ImmOffset, Register VIndex,
                                          MachineRegisterInfo &MRI) const {
  Optional<ValueAndVReg> MaybeVOffsetVal =
      getIConstantVRegValWithLookThrough(VOffset, MRI);
  Optional<ValueAndVReg> MaybeSOffsetVal =
      getIConstantVRegValWithLookThrough(SOffset, MRI);
  Optional<ValueAndVReg> MaybeVIndexVal =
      getIConstantVRegValWithLookThrough(VIndex, MRI);
  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
  // update the MMO with that offset. The stride is unknown so we can only do
  // this if VIndex is constant 0.
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
    MMO->setOffset(TotalOffset);
  } else {
    // We don't have a constant combined offset to use in the MMO. Give up.
    MMO->setValue((Value *)nullptr);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
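///
/// On subtargets with unpacked D16 VMEM, each 16-bit element is widened into
/// the low half of its own 32-bit register; on packed subtargets, pairs of
/// 16-bit elements share a dword, with odd element counts padded out with
/// undef (see the cases below).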
3773 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3774 MachineRegisterInfo &MRI, 3775 Register Reg, 3776 bool ImageStore) const { 3777 const LLT S16 = LLT::scalar(16); 3778 const LLT S32 = LLT::scalar(32); 3779 LLT StoreVT = MRI.getType(Reg); 3780 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3781 3782 if (ST.hasUnpackedD16VMem()) { 3783 auto Unmerge = B.buildUnmerge(S16, Reg); 3784 3785 SmallVector<Register, 4> WideRegs; 3786 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3787 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3788 3789 int NumElts = StoreVT.getNumElements(); 3790 3791 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3792 .getReg(0); 3793 } 3794 3795 if (ImageStore && ST.hasImageStoreD16Bug()) { 3796 if (StoreVT.getNumElements() == 2) { 3797 SmallVector<Register, 4> PackedRegs; 3798 Reg = B.buildBitcast(S32, Reg).getReg(0); 3799 PackedRegs.push_back(Reg); 3800 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3801 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3802 .getReg(0); 3803 } 3804 3805 if (StoreVT.getNumElements() == 3) { 3806 SmallVector<Register, 4> PackedRegs; 3807 auto Unmerge = B.buildUnmerge(S16, Reg); 3808 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3809 PackedRegs.push_back(Unmerge.getReg(I)); 3810 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3811 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3812 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3813 } 3814 3815 if (StoreVT.getNumElements() == 4) { 3816 SmallVector<Register, 4> PackedRegs; 3817 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3818 auto Unmerge = B.buildUnmerge(S32, Reg); 3819 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3820 PackedRegs.push_back(Unmerge.getReg(I)); 3821 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3822 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3823 .getReg(0); 3824 } 3825 3826 llvm_unreachable("invalid data type"); 3827 } 3828 3829 if (StoreVT == LLT::fixed_vector(3, S16)) { 3830 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3831 .getReg(0); 3832 } 3833 return Reg; 3834 } 3835 3836 Register AMDGPULegalizerInfo::fixStoreSourceType( 3837 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3838 MachineRegisterInfo *MRI = B.getMRI(); 3839 LLT Ty = MRI->getType(VData); 3840 3841 const LLT S16 = LLT::scalar(16); 3842 3843 // Fixup illegal register types for i8 stores. 
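  // Sub-dword store data is any-extended to a full 32-bit register here; the
  // byte/short store variant chosen later from the MMO's memory size is what
  // narrows the access again.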
3844 if (Ty == LLT::scalar(8) || Ty == S16) { 3845 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3846 return AnyExt; 3847 } 3848 3849 if (Ty.isVector()) { 3850 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3851 if (IsFormat) 3852 return handleD16VData(B, *MRI, VData); 3853 } 3854 } 3855 3856 return VData; 3857 } 3858 3859 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3860 MachineRegisterInfo &MRI, 3861 MachineIRBuilder &B, 3862 bool IsTyped, 3863 bool IsFormat) const { 3864 Register VData = MI.getOperand(1).getReg(); 3865 LLT Ty = MRI.getType(VData); 3866 LLT EltTy = Ty.getScalarType(); 3867 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3868 const LLT S32 = LLT::scalar(32); 3869 3870 VData = fixStoreSourceType(B, VData, IsFormat); 3871 Register RSrc = MI.getOperand(2).getReg(); 3872 3873 MachineMemOperand *MMO = *MI.memoperands_begin(); 3874 const int MemSize = MMO->getSize(); 3875 3876 unsigned ImmOffset; 3877 3878 // The typed intrinsics add an immediate after the registers. 3879 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3880 3881 // The struct intrinsic variants add one additional operand over raw. 3882 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3883 Register VIndex; 3884 int OpOffset = 0; 3885 if (HasVIndex) { 3886 VIndex = MI.getOperand(3).getReg(); 3887 OpOffset = 1; 3888 } else { 3889 VIndex = B.buildConstant(S32, 0).getReg(0); 3890 } 3891 3892 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3893 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3894 3895 unsigned Format = 0; 3896 if (IsTyped) { 3897 Format = MI.getOperand(5 + OpOffset).getImm(); 3898 ++OpOffset; 3899 } 3900 3901 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3902 3903 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3904 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3905 3906 unsigned Opc; 3907 if (IsTyped) { 3908 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3909 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3910 } else if (IsFormat) { 3911 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3912 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3913 } else { 3914 switch (MemSize) { 3915 case 1: 3916 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3917 break; 3918 case 2: 3919 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3920 break; 3921 default: 3922 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3923 break; 3924 } 3925 } 3926 3927 auto MIB = B.buildInstr(Opc) 3928 .addUse(VData) // vdata 3929 .addUse(RSrc) // rsrc 3930 .addUse(VIndex) // vindex 3931 .addUse(VOffset) // voffset 3932 .addUse(SOffset) // soffset 3933 .addImm(ImmOffset); // offset(imm) 3934 3935 if (IsTyped) 3936 MIB.addImm(Format); 3937 3938 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3939 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3940 .addMemOperand(MMO); 3941 3942 MI.eraseFromParent(); 3943 return true; 3944 } 3945 3946 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3947 MachineRegisterInfo &MRI, 3948 MachineIRBuilder &B, 3949 bool IsFormat, 3950 bool IsTyped) const { 3951 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3952 MachineMemOperand *MMO = *MI.memoperands_begin(); 3953 const LLT MemTy = MMO->getMemoryType(); 3954 const LLT S32 = LLT::scalar(32); 3955 3956 Register Dst = MI.getOperand(0).getReg(); 3957 Register RSrc = MI.getOperand(2).getReg(); 3958 3959 // The typed intrinsics add an immediate after the registers. 
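  // Rough operand sketch (assuming the current intrinsic forms): a
  // struct.buffer.load carries dst, intrinsic ID, rsrc, vindex, voffset,
  // soffset and aux, while the raw form drops vindex and the typed variants
  // insert a format immediate; the operand count check below distinguishes
  // these shapes.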
3960 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3961 3962 // The struct intrinsic variants add one additional operand over raw. 3963 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3964 Register VIndex; 3965 int OpOffset = 0; 3966 if (HasVIndex) { 3967 VIndex = MI.getOperand(3).getReg(); 3968 OpOffset = 1; 3969 } else { 3970 VIndex = B.buildConstant(S32, 0).getReg(0); 3971 } 3972 3973 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3974 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3975 3976 unsigned Format = 0; 3977 if (IsTyped) { 3978 Format = MI.getOperand(5 + OpOffset).getImm(); 3979 ++OpOffset; 3980 } 3981 3982 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3983 unsigned ImmOffset; 3984 3985 LLT Ty = MRI.getType(Dst); 3986 LLT EltTy = Ty.getScalarType(); 3987 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3988 const bool Unpacked = ST.hasUnpackedD16VMem(); 3989 3990 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3991 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3992 3993 unsigned Opc; 3994 3995 if (IsTyped) { 3996 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3997 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3998 } else if (IsFormat) { 3999 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4000 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4001 } else { 4002 switch (MemTy.getSizeInBits()) { 4003 case 8: 4004 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4005 break; 4006 case 16: 4007 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4008 break; 4009 default: 4010 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4011 break; 4012 } 4013 } 4014 4015 Register LoadDstReg; 4016 4017 bool IsExtLoad = 4018 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4019 LLT UnpackedTy = Ty.changeElementSize(32); 4020 4021 if (IsExtLoad) 4022 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4023 else if (Unpacked && IsD16 && Ty.isVector()) 4024 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4025 else 4026 LoadDstReg = Dst; 4027 4028 auto MIB = B.buildInstr(Opc) 4029 .addDef(LoadDstReg) // vdata 4030 .addUse(RSrc) // rsrc 4031 .addUse(VIndex) // vindex 4032 .addUse(VOffset) // voffset 4033 .addUse(SOffset) // soffset 4034 .addImm(ImmOffset); // offset(imm) 4035 4036 if (IsTyped) 4037 MIB.addImm(Format); 4038 4039 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4040 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4041 .addMemOperand(MMO); 4042 4043 if (LoadDstReg != Dst) { 4044 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4045 4046 // Widen result for extending loads was widened. 4047 if (IsExtLoad) 4048 B.buildTrunc(Dst, LoadDstReg); 4049 else { 4050 // Repack to original 16-bit vector result 4051 // FIXME: G_TRUNC should work, but legalization currently fails 4052 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4053 SmallVector<Register, 4> Repack; 4054 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4055 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4056 B.buildMerge(Dst, Repack); 4057 } 4058 } 4059 4060 MI.eraseFromParent(); 4061 return true; 4062 } 4063 4064 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4065 MachineIRBuilder &B, 4066 bool IsInc) const { 4067 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4068 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4069 B.buildInstr(Opc) 4070 .addDef(MI.getOperand(0).getReg()) 4071 .addUse(MI.getOperand(2).getReg()) 4072 .addUse(MI.getOperand(3).getReg()) 4073 .cloneMemRefs(MI); 4074 MI.eraseFromParent(); 4075 return true; 4076 } 4077 4078 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4079 switch (IntrID) { 4080 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4081 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4082 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4083 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4084 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4085 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4086 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4087 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4088 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4089 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4090 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4091 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4092 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4093 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4095 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4096 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4097 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4098 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4099 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4100 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4101 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4102 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4103 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4104 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4105 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4106 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4107 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4108 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4110 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4111 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4112 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4113 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4114 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4115 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4116 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4117 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4118 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4119 case Intrinsic::amdgcn_buffer_atomic_fadd: 4120 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4121 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4122 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4123 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4124 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4125 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4126 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4127 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4128 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4129 default: 4130 llvm_unreachable("unhandled atomic opcode"); 4131 } 4132 } 4133 4134 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4135 MachineIRBuilder &B, 4136 Intrinsic::ID IID) const { 4137 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4138 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4139 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4140 4141 Register Dst; 4142 4143 int OpOffset = 0; 4144 if (HasReturn) { 4145 // A few FP atomics do not support return values. 
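    // With a def present, the source operands start at index 2; the no-return
    // forms shift every source index down by one via OpOffset below.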
4146 Dst = MI.getOperand(0).getReg(); 4147 } else { 4148 OpOffset = -1; 4149 } 4150 4151 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4152 Register CmpVal; 4153 4154 if (IsCmpSwap) { 4155 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4156 ++OpOffset; 4157 } 4158 4159 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4160 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4161 4162 // The struct intrinsic variants add one additional operand over raw. 4163 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4164 Register VIndex; 4165 if (HasVIndex) { 4166 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4167 ++OpOffset; 4168 } else { 4169 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4170 } 4171 4172 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4173 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4174 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4175 4176 MachineMemOperand *MMO = *MI.memoperands_begin(); 4177 4178 unsigned ImmOffset; 4179 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4180 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4181 4182 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4183 4184 if (HasReturn) 4185 MIB.addDef(Dst); 4186 4187 MIB.addUse(VData); // vdata 4188 4189 if (IsCmpSwap) 4190 MIB.addReg(CmpVal); 4191 4192 MIB.addUse(RSrc) // rsrc 4193 .addUse(VIndex) // vindex 4194 .addUse(VOffset) // voffset 4195 .addUse(SOffset) // soffset 4196 .addImm(ImmOffset) // offset(imm) 4197 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4198 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4199 .addMemOperand(MMO); 4200 4201 MI.eraseFromParent(); 4202 return true; 4203 } 4204 4205 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4206 /// vector with s16 typed elements. 4207 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4208 SmallVectorImpl<Register> &PackedAddrs, 4209 unsigned ArgOffset, 4210 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4211 bool IsA16, bool IsG16) { 4212 const LLT S16 = LLT::scalar(16); 4213 const LLT V2S16 = LLT::fixed_vector(2, 16); 4214 auto EndIdx = Intr->VAddrEnd; 4215 4216 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4217 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4218 if (!SrcOp.isReg()) 4219 continue; // _L to _LZ may have eliminated this. 4220 4221 Register AddrReg = SrcOp.getReg(); 4222 4223 if ((I < Intr->GradientStart) || 4224 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4225 (I >= Intr->CoordStart && !IsA16)) { 4226 // Handle any gradient or coordinate operands that should not be packed 4227 if ((I < Intr->GradientStart) && IsA16 && 4228 (B.getMRI()->getType(AddrReg) == S16)) { 4229 // Special handling of bias when A16 is on. Bias is of type half but 4230 // occupies full 32-bit. 4231 PackedAddrs.push_back( 4232 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4233 .getReg(0)); 4234 } else { 4235 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4236 PackedAddrs.push_back(AddrReg); 4237 } 4238 } else { 4239 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4240 // derivatives dx/dh and dx/dv are packed with undef. 
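      // Packing sketch: each pair of consecutive 16-bit operands becomes one
      // <2 x s16> dword (e.g. a 2D sample's u and v coordinates land in a
      // single register), while a trailing unpaired operand gets an undef
      // high half.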
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
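  // The dmask immediate selects which of the up to four result/data components
  // the operation actually touches, so its population count (taken below)
  // bounds how many result registers are really defined.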
4324 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4325 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4326 4327 MachineRegisterInfo *MRI = B.getMRI(); 4328 const LLT S32 = LLT::scalar(32); 4329 const LLT S16 = LLT::scalar(16); 4330 const LLT V2S16 = LLT::fixed_vector(2, 16); 4331 4332 unsigned DMask = 0; 4333 4334 // Check for 16 bit addresses and pack if true. 4335 LLT GradTy = 4336 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4337 LLT AddrTy = 4338 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4339 const bool IsG16 = GradTy == S16; 4340 const bool IsA16 = AddrTy == S16; 4341 4342 int DMaskLanes = 0; 4343 if (!BaseOpcode->Atomic) { 4344 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4345 if (BaseOpcode->Gather4) { 4346 DMaskLanes = 4; 4347 } else if (DMask != 0) { 4348 DMaskLanes = countPopulation(DMask); 4349 } else if (!IsTFE && !BaseOpcode->Store) { 4350 // If dmask is 0, this is a no-op load. This can be eliminated. 4351 B.buildUndef(MI.getOperand(0)); 4352 MI.eraseFromParent(); 4353 return true; 4354 } 4355 } 4356 4357 Observer.changingInstr(MI); 4358 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4359 4360 unsigned NewOpcode = NumDefs == 0 ? 4361 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4362 4363 // Track that we legalized this 4364 MI.setDesc(B.getTII().get(NewOpcode)); 4365 4366 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4367 // dmask to be at least 1 otherwise the instruction will fail 4368 if (IsTFE && DMask == 0) { 4369 DMask = 0x1; 4370 DMaskLanes = 1; 4371 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4372 } 4373 4374 if (BaseOpcode->Atomic) { 4375 Register VData0 = MI.getOperand(2).getReg(); 4376 LLT Ty = MRI->getType(VData0); 4377 4378 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4379 if (Ty.isVector()) 4380 return false; 4381 4382 if (BaseOpcode->AtomicX2) { 4383 Register VData1 = MI.getOperand(3).getReg(); 4384 // The two values are packed in one register. 4385 LLT PackedTy = LLT::fixed_vector(2, Ty); 4386 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4387 MI.getOperand(2).setReg(Concat.getReg(0)); 4388 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4389 } 4390 } 4391 4392 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4393 4394 // Optimize _L to _LZ when _L is zero 4395 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4396 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4397 const ConstantFP *ConstantLod; 4398 4399 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4400 m_GFCst(ConstantLod))) { 4401 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4402 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4403 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4404 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4405 Intr->Dim); 4406 4407 // The starting indexes should remain in the same place. 
4408 --CorrectedNumVAddrs; 4409 4410 MI.getOperand(MI.getNumExplicitDefs()) 4411 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4412 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4413 Intr = NewImageDimIntr; 4414 } 4415 } 4416 } 4417 4418 // Optimize _mip away, when 'lod' is zero 4419 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4420 int64_t ConstantLod; 4421 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4422 m_ICst(ConstantLod))) { 4423 if (ConstantLod == 0) { 4424 // TODO: Change intrinsic opcode and remove operand instead or replacing 4425 // it with 0, as the _L to _LZ handling is done above. 4426 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4427 --CorrectedNumVAddrs; 4428 } 4429 } 4430 } 4431 4432 // Rewrite the addressing register layout before doing anything else. 4433 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4434 // 16 bit gradients are supported, but are tied to the A16 control 4435 // so both gradients and addresses must be 16 bit 4436 return false; 4437 } 4438 4439 if (IsA16 && !ST.hasA16()) { 4440 // A16 not supported 4441 return false; 4442 } 4443 4444 if (IsA16 || IsG16) { 4445 if (Intr->NumVAddrs > 1) { 4446 SmallVector<Register, 4> PackedRegs; 4447 4448 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4449 IsG16); 4450 4451 // See also below in the non-a16 branch 4452 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4453 PackedRegs.size() <= ST.getNSAMaxSize(); 4454 4455 if (!UseNSA && PackedRegs.size() > 1) { 4456 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4457 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4458 PackedRegs[0] = Concat.getReg(0); 4459 PackedRegs.resize(1); 4460 } 4461 4462 const unsigned NumPacked = PackedRegs.size(); 4463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4464 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4465 if (!SrcOp.isReg()) { 4466 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4467 continue; 4468 } 4469 4470 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4471 4472 if (I - Intr->VAddrStart < NumPacked) 4473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4474 else 4475 SrcOp.setReg(AMDGPU::NoRegister); 4476 } 4477 } 4478 } else { 4479 // If the register allocator cannot place the address registers contiguously 4480 // without introducing moves, then using the non-sequential address encoding 4481 // is always preferable, since it saves VALU instructions and is usually a 4482 // wash in terms of code size or even better. 4483 // 4484 // However, we currently have no way of hinting to the register allocator 4485 // that MIMG addresses should be placed contiguously when it is possible to 4486 // do so, so force non-NSA for the common 2-address case as a heuristic. 4487 // 4488 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4489 // allocation when possible. 4490 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4491 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4492 4493 if (!UseNSA && Intr->NumVAddrs > 1) 4494 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4495 Intr->NumVAddrs); 4496 } 4497 4498 int Flags = 0; 4499 if (IsA16) 4500 Flags |= 1; 4501 if (IsG16) 4502 Flags |= 2; 4503 MI.addOperand(MachineOperand::CreateImm(Flags)); 4504 4505 if (BaseOpcode->Store) { // No TFE for stores? 
4506 // TODO: Handle dmask trim 4507 Register VData = MI.getOperand(1).getReg(); 4508 LLT Ty = MRI->getType(VData); 4509 if (!Ty.isVector() || Ty.getElementType() != S16) 4510 return true; 4511 4512 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4513 if (RepackedReg != VData) { 4514 MI.getOperand(1).setReg(RepackedReg); 4515 } 4516 4517 return true; 4518 } 4519 4520 Register DstReg = MI.getOperand(0).getReg(); 4521 LLT Ty = MRI->getType(DstReg); 4522 const LLT EltTy = Ty.getScalarType(); 4523 const bool IsD16 = Ty.getScalarType() == S16; 4524 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4525 4526 // Confirm that the return type is large enough for the dmask specified 4527 if (NumElts < DMaskLanes) 4528 return false; 4529 4530 if (NumElts > 4 || DMaskLanes > 4) 4531 return false; 4532 4533 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4534 const LLT AdjustedTy = 4535 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4536 4537 // The raw dword aligned data component of the load. The only legal cases 4538 // where this matters should be when using the packed D16 format, for 4539 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4540 LLT RoundedTy; 4541 4542 // S32 vector to to cover all data, plus TFE result element. 4543 LLT TFETy; 4544 4545 // Register type to use for each loaded component. Will be S32 or V2S16. 4546 LLT RegTy; 4547 4548 if (IsD16 && ST.hasUnpackedD16VMem()) { 4549 RoundedTy = 4550 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4551 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4552 RegTy = S32; 4553 } else { 4554 unsigned EltSize = EltTy.getSizeInBits(); 4555 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4556 unsigned RoundedSize = 32 * RoundedElts; 4557 RoundedTy = LLT::scalarOrVector( 4558 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4559 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4560 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4561 } 4562 4563 // The return type does not need adjustment. 4564 // TODO: Should we change s16 case to s32 or <2 x s16>? 4565 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4566 return true; 4567 4568 Register Dst1Reg; 4569 4570 // Insert after the instruction. 4571 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4572 4573 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4574 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4575 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4576 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4577 4578 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4579 4580 MI.getOperand(0).setReg(NewResultReg); 4581 4582 // In the IR, TFE is supposed to be used with a 2 element struct return 4583 // type. The instruction really returns these two values in one contiguous 4584 // register, with one additional dword beyond the loaded data. Rewrite the 4585 // return type to use a single register result. 4586 4587 if (IsTFE) { 4588 Dst1Reg = MI.getOperand(1).getReg(); 4589 if (MRI->getType(Dst1Reg) != S32) 4590 return false; 4591 4592 // TODO: Make sure the TFE operand bit is set. 4593 MI.RemoveOperand(1); 4594 4595 // Handle the easy case that requires no repack instructions. 4596 if (Ty == S32) { 4597 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4598 return true; 4599 } 4600 } 4601 4602 // Now figure out how to copy the new result register back into the old 4603 // result. 
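  // Worked example (illustrative): on a packed-D16 subtarget, a d16 load of
  // <4 x s16> with TFE uses a <3 x s32> result; the unmerge below splits it
  // into two s32 data dwords plus the TFE status dword, and the data dwords
  // are bitcast back to <2 x s16> before being concatenated into the original
  // destination.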
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load into s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
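  // <3 x s16> is awkward because it is not a multiple of 32 bits: the loaded
  // data covers two dwords, so the result is either trimmed or padded with
  // undef elements to line the element counts back up.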
4673 const LLT V3S16 = LLT::fixed_vector(3, 16); 4674 if (Ty == V3S16) { 4675 if (IsTFE) { 4676 if (ResultRegs.size() == 1) { 4677 NewResultReg = ResultRegs[0]; 4678 } else if (ResultRegs.size() == 2) { 4679 LLT V4S16 = LLT::fixed_vector(4, 16); 4680 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4681 } else { 4682 return false; 4683 } 4684 } 4685 4686 if (MRI->getType(DstReg).getNumElements() < 4687 MRI->getType(NewResultReg).getNumElements()) { 4688 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4689 } else { 4690 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4691 } 4692 return true; 4693 } 4694 4695 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4696 B.buildConcatVectors(DstReg, ResultRegs); 4697 return true; 4698 } 4699 4700 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4701 LegalizerHelper &Helper, MachineInstr &MI) const { 4702 MachineIRBuilder &B = Helper.MIRBuilder; 4703 GISelChangeObserver &Observer = Helper.Observer; 4704 4705 Register Dst = MI.getOperand(0).getReg(); 4706 LLT Ty = B.getMRI()->getType(Dst); 4707 unsigned Size = Ty.getSizeInBits(); 4708 MachineFunction &MF = B.getMF(); 4709 4710 Observer.changingInstr(MI); 4711 4712 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4713 Ty = getBitcastRegisterType(Ty); 4714 Helper.bitcastDst(MI, Ty, 0); 4715 Dst = MI.getOperand(0).getReg(); 4716 B.setInsertPt(B.getMBB(), MI); 4717 } 4718 4719 // FIXME: We don't really need this intermediate instruction. The intrinsic 4720 // should be fixed to have a memory operand. Since it's readnone, we're not 4721 // allowed to add one. 4722 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4723 MI.RemoveOperand(1); // Remove intrinsic ID 4724 4725 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4726 // TODO: Should this use datalayout alignment? 4727 const unsigned MemSize = (Size + 7) / 8; 4728 const Align MemAlign(4); 4729 MachineMemOperand *MMO = MF.getMachineMemOperand( 4730 MachinePointerInfo(), 4731 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4732 MachineMemOperand::MOInvariant, 4733 MemSize, MemAlign); 4734 MI.addMemOperand(MF, MMO); 4735 4736 // There are no 96-bit result scalar loads, but widening to 128-bit should 4737 // always be legal. We may need to restore this to a 96-bit result if it turns 4738 // out this needs to be converted to a vector load during RegBankSelect. 4739 if (!isPowerOf2_32(Size)) { 4740 if (Ty.isVector()) 4741 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4742 else 4743 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4744 } 4745 4746 Observer.changedInstr(MI); 4747 return true; 4748 } 4749 4750 // TODO: Move to selection 4751 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4752 MachineRegisterInfo &MRI, 4753 MachineIRBuilder &B) const { 4754 if (!ST.isTrapHandlerEnabled() || 4755 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4756 return legalizeTrapEndpgm(MI, MRI, B); 4757 4758 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4759 switch (*HsaAbiVer) { 4760 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4761 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4762 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4763 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4764 return ST.supportsGetDoorbellID() ? 
4765 legalizeTrapHsa(MI, MRI, B) : 4766 legalizeTrapHsaQueuePtr(MI, MRI, B); 4767 } 4768 } 4769 4770 llvm_unreachable("Unknown trap handler"); 4771 } 4772 4773 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4774 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4775 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4776 MI.eraseFromParent(); 4777 return true; 4778 } 4779 4780 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4781 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4782 // Pass queue pointer to trap handler as input, and insert trap instruction 4783 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4784 Register LiveIn = 4785 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4786 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4787 return false; 4788 4789 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4790 B.buildCopy(SGPR01, LiveIn); 4791 B.buildInstr(AMDGPU::S_TRAP) 4792 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4793 .addReg(SGPR01, RegState::Implicit); 4794 4795 MI.eraseFromParent(); 4796 return true; 4797 } 4798 4799 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4801 B.buildInstr(AMDGPU::S_TRAP) 4802 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4803 MI.eraseFromParent(); 4804 return true; 4805 } 4806 4807 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4808 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4809 // Is non-HSA path or trap-handler disabled? Then, report a warning 4810 // accordingly 4811 if (!ST.isTrapHandlerEnabled() || 4812 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4813 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4814 "debugtrap handler not supported", 4815 MI.getDebugLoc(), DS_Warning); 4816 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4817 Ctx.diagnose(NoTrap); 4818 } else { 4819 // Insert debug-trap instruction 4820 B.buildInstr(AMDGPU::S_TRAP) 4821 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4822 } 4823 4824 MI.eraseFromParent(); 4825 return true; 4826 } 4827 4828 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4829 MachineIRBuilder &B) const { 4830 MachineRegisterInfo &MRI = *B.getMRI(); 4831 const LLT S16 = LLT::scalar(16); 4832 const LLT S32 = LLT::scalar(32); 4833 4834 Register DstReg = MI.getOperand(0).getReg(); 4835 Register NodePtr = MI.getOperand(2).getReg(); 4836 Register RayExtent = MI.getOperand(3).getReg(); 4837 Register RayOrigin = MI.getOperand(4).getReg(); 4838 Register RayDir = MI.getOperand(5).getReg(); 4839 Register RayInvDir = MI.getOperand(6).getReg(); 4840 Register TDescr = MI.getOperand(7).getReg(); 4841 4842 if (!ST.hasGFX10_AEncoding()) { 4843 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4844 "intrinsic not supported on subtarget", 4845 MI.getDebugLoc()); 4846 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4847 return false; 4848 } 4849 4850 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4851 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4852 const unsigned NumVDataDwords = 4; 4853 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4854 const bool UseNSA = 4855 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4856 const unsigned BaseOpcodes[2][2] = { 4857 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4858 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4859 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4860 int Opcode; 4861 if (UseNSA) { 4862 Opcode = 4863 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4864 NumVDataDwords, NumVAddrDwords); 4865 } else { 4866 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4867 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4868 PowerOf2Ceil(NumVAddrDwords)); 4869 } 4870 assert(Opcode != -1); 4871 4872 SmallVector<Register, 12> Ops; 4873 if (Is64) { 4874 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4875 Ops.push_back(Unmerge.getReg(0)); 4876 Ops.push_back(Unmerge.getReg(1)); 4877 } else { 4878 Ops.push_back(NodePtr); 4879 } 4880 Ops.push_back(RayExtent); 4881 4882 auto packLanes = [&Ops, &S32, &B](Register Src) { 4883 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4884 Ops.push_back(Unmerge.getReg(0)); 4885 Ops.push_back(Unmerge.getReg(1)); 4886 Ops.push_back(Unmerge.getReg(2)); 4887 }; 4888 4889 packLanes(RayOrigin); 4890 if (IsA16) { 4891 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4892 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4893 Register R1 = MRI.createGenericVirtualRegister(S32); 4894 Register R2 = MRI.createGenericVirtualRegister(S32); 4895 Register R3 = MRI.createGenericVirtualRegister(S32); 4896 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4897 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4898 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4899 Ops.push_back(R1); 4900 Ops.push_back(R2); 4901 Ops.push_back(R3); 4902 } else { 4903 packLanes(RayDir); 4904 packLanes(RayInvDir); 4905 } 4906 4907 if (!UseNSA) { 4908 // Build a single vector containing all the operands so far prepared. 4909 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4910 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4911 Ops.clear(); 4912 Ops.push_back(MergedOps); 4913 } 4914 4915 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4916 .addDef(DstReg) 4917 .addImm(Opcode); 4918 4919 for (Register R : Ops) { 4920 MIB.addUse(R); 4921 } 4922 4923 MIB.addUse(TDescr) 4924 .addImm(IsA16 ? 1 : 0) 4925 .cloneMemRefs(MI); 4926 4927 MI.eraseFromParent(); 4928 return true; 4929 } 4930 4931 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4932 MachineInstr &MI) const { 4933 MachineIRBuilder &B = Helper.MIRBuilder; 4934 MachineRegisterInfo &MRI = *B.getMRI(); 4935 4936 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
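  // For the control-flow intrinsics below, the G_BRCOND that consumes the
  // intrinsic's boolean result is rewritten into SI_IF/SI_ELSE/SI_LOOP, which
  // manipulate EXEC and branch directly, so both the intrinsic and the
  // original conditional branch are erased.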
4937 auto IntrID = MI.getIntrinsicID(); 4938 switch (IntrID) { 4939 case Intrinsic::amdgcn_if: 4940 case Intrinsic::amdgcn_else: { 4941 MachineInstr *Br = nullptr; 4942 MachineBasicBlock *UncondBrTarget = nullptr; 4943 bool Negated = false; 4944 if (MachineInstr *BrCond = 4945 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4946 const SIRegisterInfo *TRI 4947 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4948 4949 Register Def = MI.getOperand(1).getReg(); 4950 Register Use = MI.getOperand(3).getReg(); 4951 4952 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4953 4954 if (Negated) 4955 std::swap(CondBrTarget, UncondBrTarget); 4956 4957 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4958 if (IntrID == Intrinsic::amdgcn_if) { 4959 B.buildInstr(AMDGPU::SI_IF) 4960 .addDef(Def) 4961 .addUse(Use) 4962 .addMBB(UncondBrTarget); 4963 } else { 4964 B.buildInstr(AMDGPU::SI_ELSE) 4965 .addDef(Def) 4966 .addUse(Use) 4967 .addMBB(UncondBrTarget); 4968 } 4969 4970 if (Br) { 4971 Br->getOperand(0).setMBB(CondBrTarget); 4972 } else { 4973 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4974 // since we're swapping branch targets it needs to be reinserted. 4975 // FIXME: IRTranslator should probably not do this 4976 B.buildBr(*CondBrTarget); 4977 } 4978 4979 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4980 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4981 MI.eraseFromParent(); 4982 BrCond->eraseFromParent(); 4983 return true; 4984 } 4985 4986 return false; 4987 } 4988 case Intrinsic::amdgcn_loop: { 4989 MachineInstr *Br = nullptr; 4990 MachineBasicBlock *UncondBrTarget = nullptr; 4991 bool Negated = false; 4992 if (MachineInstr *BrCond = 4993 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4994 const SIRegisterInfo *TRI 4995 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4996 4997 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4998 Register Reg = MI.getOperand(2).getReg(); 4999 5000 if (Negated) 5001 std::swap(CondBrTarget, UncondBrTarget); 5002 5003 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5004 B.buildInstr(AMDGPU::SI_LOOP) 5005 .addUse(Reg) 5006 .addMBB(UncondBrTarget); 5007 5008 if (Br) 5009 Br->getOperand(0).setMBB(CondBrTarget); 5010 else 5011 B.buildBr(*CondBrTarget); 5012 5013 MI.eraseFromParent(); 5014 BrCond->eraseFromParent(); 5015 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5016 return true; 5017 } 5018 5019 return false; 5020 } 5021 case Intrinsic::amdgcn_kernarg_segment_ptr: 5022 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5023 // This only makes sense to call in a kernel, so just lower to null. 
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}