//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
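/// (For example, <3 x s8>, at 24 bits, also satisfies this predicate, while
/// <3 x s32> does not because its total size is already a multiple of 32.)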
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32
// bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if
    // they may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
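  // (Illustrative reading of the check below: a plain <4 x s8> access, whose
  // s8 element is not a register element type, is bitcast, while a <2 x s16>
  // access is left alone.)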
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
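    // (For illustration: with the rules below an s8 G_UADDSAT is first widened
    // to s16 and then handled as the legal 16-bit form with the clamp
    // modifier.)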
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
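  // The rules below therefore only use customFor, and the custom action
  // (legalizeFMad) makes the final call per function.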
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite-loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
      // The compare output type differs based on the register bank of the
      // output, so make both s1 and s32 legal.
      //
      // Scalar compares producing output in scc will be promoted to s32, as
      // that is the allocatable register type that will be needed for the
      // copy from
      // scc. This will be promoted during RegBankSelect, and we assume
      // something before that won't try to use s32 result types.
      //
      // Vector compares producing an output in vcc/SGPR will use s1 in VCC
      // reg bank.
      .legalForCartesianProduct(
        {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
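      // (For instance, on subtargets without dwordx3 load/stores, a 96-bit
      // load known to be 128-bit aligned can be widened to a 128-bit load;
      // see shouldWidenLoad above.)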
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(
                    0, LLT::fixed_vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
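  // (This mirrors G_ICMP above: conditions copied from SCC use s32, while
  // divergent conditions remain s1 in the VCC bank.)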
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
TargetOpcode::G_FMAXNUM_IEEE: 1719 return legalizeMinNumMaxNum(Helper, MI); 1720 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1721 return legalizeExtractVectorElt(MI, MRI, B); 1722 case TargetOpcode::G_INSERT_VECTOR_ELT: 1723 return legalizeInsertVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_SHUFFLE_VECTOR: 1725 return legalizeShuffleVector(MI, MRI, B); 1726 case TargetOpcode::G_FSIN: 1727 case TargetOpcode::G_FCOS: 1728 return legalizeSinCos(MI, MRI, B); 1729 case TargetOpcode::G_GLOBAL_VALUE: 1730 return legalizeGlobalValue(MI, MRI, B); 1731 case TargetOpcode::G_LOAD: 1732 case TargetOpcode::G_SEXTLOAD: 1733 case TargetOpcode::G_ZEXTLOAD: 1734 return legalizeLoad(Helper, MI); 1735 case TargetOpcode::G_FMAD: 1736 return legalizeFMad(MI, MRI, B); 1737 case TargetOpcode::G_FDIV: 1738 return legalizeFDIV(MI, MRI, B); 1739 case TargetOpcode::G_UDIV: 1740 case TargetOpcode::G_UREM: 1741 case TargetOpcode::G_UDIVREM: 1742 return legalizeUnsignedDIV_REM(MI, MRI, B); 1743 case TargetOpcode::G_SDIV: 1744 case TargetOpcode::G_SREM: 1745 case TargetOpcode::G_SDIVREM: 1746 return legalizeSignedDIV_REM(MI, MRI, B); 1747 case TargetOpcode::G_ATOMIC_CMPXCHG: 1748 return legalizeAtomicCmpXChg(MI, MRI, B); 1749 case TargetOpcode::G_FLOG: 1750 return legalizeFlog(MI, B, numbers::ln2f); 1751 case TargetOpcode::G_FLOG10: 1752 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1753 case TargetOpcode::G_FEXP: 1754 return legalizeFExp(MI, B); 1755 case TargetOpcode::G_FPOW: 1756 return legalizeFPow(MI, B); 1757 case TargetOpcode::G_FFLOOR: 1758 return legalizeFFloor(MI, MRI, B); 1759 case TargetOpcode::G_BUILD_VECTOR: 1760 return legalizeBuildVector(MI, MRI, B); 1761 case TargetOpcode::G_CTLZ: 1762 case TargetOpcode::G_CTTZ: 1763 return legalizeCTLZ_CTTZ(MI, MRI, B); 1764 default: 1765 return false; 1766 } 1767 1768 llvm_unreachable("expected switch to return"); 1769 } 1770 1771 Register AMDGPULegalizerInfo::getSegmentAperture( 1772 unsigned AS, 1773 MachineRegisterInfo &MRI, 1774 MachineIRBuilder &B) const { 1775 MachineFunction &MF = B.getMF(); 1776 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1777 const LLT S32 = LLT::scalar(32); 1778 1779 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1780 1781 if (ST.hasApertureRegs()) { 1782 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1783 // getreg. 1784 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1785 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1786 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1787 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1788 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1789 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1790 unsigned Encoding = 1791 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1792 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1793 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1794 1795 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1796 1797 B.buildInstr(AMDGPU::S_GETREG_B32) 1798 .addDef(GetReg) 1799 .addImm(Encoding); 1800 MRI.setType(GetReg, S32); 1801 1802 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1803 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1804 } 1805 1806 Register QueuePtr = MRI.createGenericVirtualRegister( 1807 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1808 1809 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1810 return Register(); 1811 1812 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1813 // private_segment_aperture_base_hi. 
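  // Only the aperture's high 32 bits are stored here; legalizeAddrSpaceCast
  // below merges them with the 32-bit segment offset to form the full 64-bit
  // flat address.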
1814 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1815 1816 // TODO: can we be smarter about machine pointer info? 1817 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1818 MachineMemOperand *MMO = MF.getMachineMemOperand( 1819 PtrInfo, 1820 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1821 MachineMemOperand::MOInvariant, 1822 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1823 1824 Register LoadAddr; 1825 1826 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1827 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1828 } 1829 1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1831 MachineInstr &MI, MachineRegisterInfo &MRI, 1832 MachineIRBuilder &B) const { 1833 MachineFunction &MF = B.getMF(); 1834 1835 const LLT S32 = LLT::scalar(32); 1836 Register Dst = MI.getOperand(0).getReg(); 1837 Register Src = MI.getOperand(1).getReg(); 1838 1839 LLT DstTy = MRI.getType(Dst); 1840 LLT SrcTy = MRI.getType(Src); 1841 unsigned DestAS = DstTy.getAddressSpace(); 1842 unsigned SrcAS = SrcTy.getAddressSpace(); 1843 1844 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1845 // vector element. 1846 assert(!DstTy.isVector()); 1847 1848 const AMDGPUTargetMachine &TM 1849 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1850 1851 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1852 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1853 return true; 1854 } 1855 1856 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1857 // Truncate. 1858 B.buildExtract(Dst, Src, 0); 1859 MI.eraseFromParent(); 1860 return true; 1861 } 1862 1863 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1864 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1865 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1866 1867 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1868 // another. Merge operands are required to be the same type, but creating an 1869 // extra ptrtoint would be kind of pointless. 1870 auto HighAddr = B.buildConstant( 1871 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1872 B.buildMerge(Dst, {Src, HighAddr}); 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1878 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1879 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1880 unsigned NullVal = TM.getNullPointerValue(DestAS); 1881 1882 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1883 auto FlatNull = B.buildConstant(SrcTy, 0); 1884 1885 // Extract low 32-bits of the pointer. 1886 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1887 1888 auto CmpRes = 1889 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1890 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1891 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1897 return false; 1898 1899 if (!ST.hasFlatAddressSpace()) 1900 return false; 1901 1902 auto SegmentNull = 1903 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1904 auto FlatNull = 1905 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1906 1907 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1908 if (!ApertureReg.isValid()) 1909 return false; 1910 1911 auto CmpRes = 1912 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1913 1914 // Coerce the type of the low half of the result so we can use merge_values. 
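  // A null segment pointer must also map to the flat null value, hence the
  // compare against SegmentNull above and the select below rather than an
  // unconditional merge.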
1915 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1916 1917 // TODO: Should we allow mismatched types but matching sizes in merges to 1918 // avoid the ptrtoint? 1919 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1920 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1921 1922 MI.eraseFromParent(); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeFrint( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 Register Src = MI.getOperand(1).getReg(); 1930 LLT Ty = MRI.getType(Src); 1931 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1932 1933 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1934 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1935 1936 auto C1 = B.buildFConstant(Ty, C1Val); 1937 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1938 1939 // TODO: Should this propagate fast-math-flags? 1940 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1941 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1942 1943 auto C2 = B.buildFConstant(Ty, C2Val); 1944 auto Fabs = B.buildFAbs(Ty, Src); 1945 1946 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1947 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1948 MI.eraseFromParent(); 1949 return true; 1950 } 1951 1952 bool AMDGPULegalizerInfo::legalizeFceil( 1953 MachineInstr &MI, MachineRegisterInfo &MRI, 1954 MachineIRBuilder &B) const { 1955 1956 const LLT S1 = LLT::scalar(1); 1957 const LLT S64 = LLT::scalar(64); 1958 1959 Register Src = MI.getOperand(1).getReg(); 1960 assert(MRI.getType(Src) == S64); 1961 1962 // result = trunc(src) 1963 // if (src > 0.0 && src != result) 1964 // result += 1.0 1965 1966 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1967 1968 const auto Zero = B.buildFConstant(S64, 0.0); 1969 const auto One = B.buildFConstant(S64, 1.0); 1970 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1971 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1972 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1973 auto Add = B.buildSelect(S64, And, One, Zero); 1974 1975 // TODO: Should this propagate fast-math-flags? 
1976 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1977 return true; 1978 } 1979 1980 bool AMDGPULegalizerInfo::legalizeFrem( 1981 MachineInstr &MI, MachineRegisterInfo &MRI, 1982 MachineIRBuilder &B) const { 1983 Register DstReg = MI.getOperand(0).getReg(); 1984 Register Src0Reg = MI.getOperand(1).getReg(); 1985 Register Src1Reg = MI.getOperand(2).getReg(); 1986 auto Flags = MI.getFlags(); 1987 LLT Ty = MRI.getType(DstReg); 1988 1989 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1990 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1991 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1992 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1993 MI.eraseFromParent(); 1994 return true; 1995 } 1996 1997 static MachineInstrBuilder extractF64Exponent(Register Hi, 1998 MachineIRBuilder &B) { 1999 const unsigned FractBits = 52; 2000 const unsigned ExpBits = 11; 2001 LLT S32 = LLT::scalar(32); 2002 2003 auto Const0 = B.buildConstant(S32, FractBits - 32); 2004 auto Const1 = B.buildConstant(S32, ExpBits); 2005 2006 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2007 .addUse(Hi) 2008 .addUse(Const0.getReg(0)) 2009 .addUse(Const1.getReg(0)); 2010 2011 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2012 } 2013 2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2015 MachineInstr &MI, MachineRegisterInfo &MRI, 2016 MachineIRBuilder &B) const { 2017 const LLT S1 = LLT::scalar(1); 2018 const LLT S32 = LLT::scalar(32); 2019 const LLT S64 = LLT::scalar(64); 2020 2021 Register Src = MI.getOperand(1).getReg(); 2022 assert(MRI.getType(Src) == S64); 2023 2024 // TODO: Should this use extract since the low half is unused? 2025 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2026 Register Hi = Unmerge.getReg(1); 2027 2028 // Extract the upper half, since this is where we will find the sign and 2029 // exponent. 2030 auto Exp = extractF64Exponent(Hi, B); 2031 2032 const unsigned FractBits = 52; 2033 2034 // Extract the sign bit. 2035 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2036 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2037 2038 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2039 2040 const auto Zero32 = B.buildConstant(S32, 0); 2041 2042 // Extend back to 64-bits. 2043 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2044 2045 auto Shr = B.buildAShr(S64, FractMask, Exp); 2046 auto Not = B.buildNot(S64, Shr); 2047 auto Tmp0 = B.buildAnd(S64, Src, Not); 2048 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2049 2050 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2051 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2052 2053 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2054 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2055 MI.eraseFromParent(); 2056 return true; 2057 } 2058 2059 bool AMDGPULegalizerInfo::legalizeITOFP( 2060 MachineInstr &MI, MachineRegisterInfo &MRI, 2061 MachineIRBuilder &B, bool Signed) const { 2062 2063 Register Dst = MI.getOperand(0).getReg(); 2064 Register Src = MI.getOperand(1).getReg(); 2065 2066 const LLT S64 = LLT::scalar(64); 2067 const LLT S32 = LLT::scalar(32); 2068 2069 assert(MRI.getType(Src) == S64); 2070 2071 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2072 auto ThirtyTwo = B.buildConstant(S32, 32); 2073 2074 if (MRI.getType(Dst) == S64) { 2075 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2076 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2077 2078 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2080 .addUse(CvtHi.getReg(0)) 2081 .addUse(ThirtyTwo.getReg(0)); 2082 2083 // TODO: Should this propagate fast-math-flags? 2084 B.buildFAdd(Dst, LdExp, CvtLo); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 assert(MRI.getType(Dst) == S32); 2090 2091 auto One = B.buildConstant(S32, 1); 2092 2093 MachineInstrBuilder ShAmt; 2094 if (Signed) { 2095 auto ThirtyOne = B.buildConstant(S32, 31); 2096 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2097 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2098 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2099 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2100 /*HasSideEffects=*/false) 2101 .addUse(Unmerge.getReg(1)); 2102 auto LS2 = B.buildSub(S32, LS, One); 2103 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2104 } else 2105 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2106 auto Norm = B.buildShl(S64, Src, ShAmt); 2107 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2108 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2109 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2110 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2111 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2112 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2113 /*HasSideEffects=*/false) 2114 .addUse(FVal.getReg(0)) 2115 .addUse(Scale.getReg(0)); 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 // TODO: Copied from DAG implementation. Verify logic and document how this 2121 // actually works. 2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2123 MachineRegisterInfo &MRI, 2124 MachineIRBuilder &B, 2125 bool Signed) const { 2126 2127 Register Dst = MI.getOperand(0).getReg(); 2128 Register Src = MI.getOperand(1).getReg(); 2129 2130 const LLT S64 = LLT::scalar(64); 2131 const LLT S32 = LLT::scalar(32); 2132 2133 const LLT SrcLT = MRI.getType(Src); 2134 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2135 2136 unsigned Flags = MI.getFlags(); 2137 2138 // The basic idea of converting a floating point number into a pair of 32-bit 2139 // integers is illustrated as follows: 2140 // 2141 // tf := trunc(val); 2142 // hif := floor(tf * 2^-32); 2143 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2144 // hi := fptoi(hif); 2145 // lo := fptoi(lof); 2146 // 2147 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2148 MachineInstrBuilder Sign; 2149 if (Signed && SrcLT == S32) { 2150 // However, a 32-bit floating point number has only 23 bits mantissa and 2151 // it's not enough to hold all the significant bits of `lof` if val is 2152 // negative. To avoid the loss of precision, We need to take the absolute 2153 // value after truncating and flip the result back based on the original 2154 // signedness. 
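    // For a negative input, Sign below is all ones (-1); the final
    // (r ^ Sign) - Sign is a two's-complement negate that restores the sign
    // after operating on the absolute value.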
2155 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2156 Trunc = B.buildFAbs(S32, Trunc, Flags); 2157 } 2158 MachineInstrBuilder K0, K1; 2159 if (SrcLT == S64) { 2160 K0 = B.buildFConstant(S64, 2161 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2162 K1 = B.buildFConstant(S64, 2163 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2164 } else { 2165 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2166 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2167 } 2168 2169 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2170 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2171 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2172 2173 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2174 : B.buildFPTOUI(S32, FloorMul); 2175 auto Lo = B.buildFPTOUI(S32, Fma); 2176 2177 if (Signed && SrcLT == S32) { 2178 // Flip the result based on the signedness, which is either all 0s or 1s. 2179 Sign = B.buildMerge(S64, {Sign, Sign}); 2180 // r := xor({lo, hi}, sign) - sign; 2181 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2182 } else 2183 B.buildMerge(Dst, {Lo, Hi}); 2184 MI.eraseFromParent(); 2185 2186 return true; 2187 } 2188 2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2190 MachineInstr &MI) const { 2191 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2192 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2193 2194 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2195 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2196 2197 // With ieee_mode disabled, the instructions have the correct behavior 2198 // already for G_FMINNUM/G_FMAXNUM 2199 if (!MFI->getMode().IEEE) 2200 return !IsIEEEOp; 2201 2202 if (IsIEEEOp) 2203 return true; 2204 2205 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2206 } 2207 2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2209 MachineInstr &MI, MachineRegisterInfo &MRI, 2210 MachineIRBuilder &B) const { 2211 // TODO: Should move some of this into LegalizerHelper. 2212 2213 // TODO: Promote dynamic indexing of s16 to s32 2214 2215 // FIXME: Artifact combiner probably should have replaced the truncated 2216 // constant before this, so we shouldn't need 2217 // getConstantVRegValWithLookThrough. 2218 Optional<ValueAndVReg> MaybeIdxVal = 2219 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2220 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2221 return true; 2222 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2223 2224 Register Dst = MI.getOperand(0).getReg(); 2225 Register Vec = MI.getOperand(1).getReg(); 2226 2227 LLT VecTy = MRI.getType(Vec); 2228 LLT EltTy = VecTy.getElementType(); 2229 assert(EltTy == MRI.getType(Dst)); 2230 2231 if (IdxVal < VecTy.getNumElements()) 2232 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2233 else 2234 B.buildUndef(Dst); 2235 2236 MI.eraseFromParent(); 2237 return true; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2241 MachineInstr &MI, MachineRegisterInfo &MRI, 2242 MachineIRBuilder &B) const { 2243 // TODO: Should move some of this into LegalizerHelper. 2244 2245 // TODO: Promote dynamic indexing of s16 to s32 2246 2247 // FIXME: Artifact combiner probably should have replaced the truncated 2248 // constant before this, so we shouldn't need 2249 // getConstantVRegValWithLookThrough. 
2250 Optional<ValueAndVReg> MaybeIdxVal = 2251 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2252 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2253 return true; 2254 2255 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2256 Register Dst = MI.getOperand(0).getReg(); 2257 Register Vec = MI.getOperand(1).getReg(); 2258 Register Ins = MI.getOperand(2).getReg(); 2259 2260 LLT VecTy = MRI.getType(Vec); 2261 LLT EltTy = VecTy.getElementType(); 2262 assert(EltTy == MRI.getType(Ins)); 2263 2264 if (IdxVal < VecTy.getNumElements()) 2265 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2266 else 2267 B.buildUndef(Dst); 2268 2269 MI.eraseFromParent(); 2270 return true; 2271 } 2272 2273 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2274 MachineInstr &MI, MachineRegisterInfo &MRI, 2275 MachineIRBuilder &B) const { 2276 const LLT V2S16 = LLT::fixed_vector(2, 16); 2277 2278 Register Dst = MI.getOperand(0).getReg(); 2279 Register Src0 = MI.getOperand(1).getReg(); 2280 LLT DstTy = MRI.getType(Dst); 2281 LLT SrcTy = MRI.getType(Src0); 2282 2283 if (SrcTy == V2S16 && DstTy == V2S16 && 2284 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2285 return true; 2286 2287 MachineIRBuilder HelperBuilder(MI); 2288 GISelObserverWrapper DummyObserver; 2289 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2290 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeSinCos( 2294 MachineInstr &MI, MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 2297 Register DstReg = MI.getOperand(0).getReg(); 2298 Register SrcReg = MI.getOperand(1).getReg(); 2299 LLT Ty = MRI.getType(DstReg); 2300 unsigned Flags = MI.getFlags(); 2301 2302 Register TrigVal; 2303 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2304 if (ST.hasTrigReducedRange()) { 2305 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2306 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2307 .addUse(MulVal.getReg(0)) 2308 .setMIFlags(Flags).getReg(0); 2309 } else 2310 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2311 2312 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2313 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2314 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2315 .addUse(TrigVal) 2316 .setMIFlags(Flags); 2317 MI.eraseFromParent(); 2318 return true; 2319 } 2320 2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2322 MachineIRBuilder &B, 2323 const GlobalValue *GV, 2324 int64_t Offset, 2325 unsigned GAFlags) const { 2326 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2327 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2328 // to the following code sequence: 2329 // 2330 // For constant address space: 2331 // s_getpc_b64 s[0:1] 2332 // s_add_u32 s0, s0, $symbol 2333 // s_addc_u32 s1, s1, 0 2334 // 2335 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2336 // a fixup or relocation is emitted to replace $symbol with a literal 2337 // constant, which is a pc-relative offset from the encoding of the $symbol 2338 // operand to the global variable. 
2339 // 2340 // For global address space: 2341 // s_getpc_b64 s[0:1] 2342 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2343 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2344 // 2345 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2346 // fixups or relocations are emitted to replace $symbol@*@lo and 2347 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2348 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2349 // operand to the global variable. 2350 // 2351 // What we want here is an offset from the value returned by s_getpc 2352 // (which is the address of the s_add_u32 instruction) to the global 2353 // variable, but since the encoding of $symbol starts 4 bytes after the start 2354 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2355 // small. This requires us to add 4 to the global variable offset in order to 2356 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2357 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2358 // instruction. 2359 2360 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2361 2362 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2363 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2364 2365 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2366 .addDef(PCReg); 2367 2368 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2369 if (GAFlags == SIInstrInfo::MO_NONE) 2370 MIB.addImm(0); 2371 else 2372 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2373 2374 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2375 2376 if (PtrTy.getSizeInBits() == 32) 2377 B.buildExtract(DstReg, PCReg, 0); 2378 return true; 2379 } 2380 2381 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2382 MachineInstr &MI, MachineRegisterInfo &MRI, 2383 MachineIRBuilder &B) const { 2384 Register DstReg = MI.getOperand(0).getReg(); 2385 LLT Ty = MRI.getType(DstReg); 2386 unsigned AS = Ty.getAddressSpace(); 2387 2388 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2389 MachineFunction &MF = B.getMF(); 2390 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2391 2392 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2393 if (!MFI->isModuleEntryFunction() && 2394 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2395 const Function &Fn = MF.getFunction(); 2396 DiagnosticInfoUnsupported BadLDSDecl( 2397 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2398 DS_Warning); 2399 Fn.getContext().diagnose(BadLDSDecl); 2400 2401 // We currently don't have a way to correctly allocate LDS objects that 2402 // aren't directly associated with a kernel. We do force inlining of 2403 // functions that use local objects. However, if these dead functions are 2404 // not eliminated, we don't want a compile time error. Just emit a warning 2405 // and a trap, since there should be no callable path here. 2406 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2407 B.buildUndef(DstReg); 2408 MI.eraseFromParent(); 2409 return true; 2410 } 2411 2412 // TODO: We could emit code to handle the initialization somewhere. 
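    // LDS globals cannot carry a real (non-undef) initializer; if one is
    // present we fall through to the "unsupported initializer" diagnostic
    // below.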
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
        // zero-sized type in other languages, to declare dynamic shared
        // memory whose size is not known at compile time. Such objects are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
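    // The GOT entry itself is always a 64-bit pointer, so load the full
    // pointer type and extract the low 32 bits.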
2481 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2482 B.buildExtract(DstReg, Load, 0); 2483 } else 2484 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2485 2486 MI.eraseFromParent(); 2487 return true; 2488 } 2489 2490 static LLT widenToNextPowerOf2(LLT Ty) { 2491 if (Ty.isVector()) 2492 return Ty.changeElementCount( 2493 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2494 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2495 } 2496 2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2498 MachineInstr &MI) const { 2499 MachineIRBuilder &B = Helper.MIRBuilder; 2500 MachineRegisterInfo &MRI = *B.getMRI(); 2501 GISelChangeObserver &Observer = Helper.Observer; 2502 2503 Register PtrReg = MI.getOperand(1).getReg(); 2504 LLT PtrTy = MRI.getType(PtrReg); 2505 unsigned AddrSpace = PtrTy.getAddressSpace(); 2506 2507 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2508 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2509 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2510 Observer.changingInstr(MI); 2511 MI.getOperand(1).setReg(Cast.getReg(0)); 2512 Observer.changedInstr(MI); 2513 return true; 2514 } 2515 2516 if (MI.getOpcode() != AMDGPU::G_LOAD) 2517 return false; 2518 2519 Register ValReg = MI.getOperand(0).getReg(); 2520 LLT ValTy = MRI.getType(ValReg); 2521 2522 MachineMemOperand *MMO = *MI.memoperands_begin(); 2523 const unsigned ValSize = ValTy.getSizeInBits(); 2524 const LLT MemTy = MMO->getMemoryType(); 2525 const Align MemAlign = MMO->getAlign(); 2526 const unsigned MemSize = MemTy.getSizeInBits(); 2527 const unsigned AlignInBits = 8 * MemAlign.value(); 2528 2529 // Widen non-power-of-2 loads to the alignment if needed 2530 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2531 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2532 2533 // This was already the correct extending load result type, so just adjust 2534 // the memory type. 2535 if (WideMemSize == ValSize) { 2536 MachineFunction &MF = B.getMF(); 2537 2538 MachineMemOperand *WideMMO = 2539 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2540 Observer.changingInstr(MI); 2541 MI.setMemRefs(MF, {WideMMO}); 2542 Observer.changedInstr(MI); 2543 return true; 2544 } 2545 2546 // Don't bother handling edge case that should probably never be produced. 2547 if (ValSize > WideMemSize) 2548 return false; 2549 2550 LLT WideTy = widenToNextPowerOf2(ValTy); 2551 2552 Register WideLoad; 2553 if (!WideTy.isVector()) { 2554 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2555 B.buildTrunc(ValReg, WideLoad).getReg(0); 2556 } else { 2557 // Extract the subvector. 2558 2559 if (isRegisterType(ValTy)) { 2560 // If this a case where G_EXTRACT is legal, use it. 2561 // (e.g. <3 x s32> -> <4 x s32>) 2562 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2563 B.buildExtract(ValReg, WideLoad, 0); 2564 } else { 2565 // For cases where the widened type isn't a nice register value, unmerge 2566 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2567 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2568 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2569 B.setInsertPt(B.getMBB(), MI.getIterator()); 2570 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2571 } 2572 } 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 return false; 2579 } 2580 2581 bool AMDGPULegalizerInfo::legalizeFMad( 2582 MachineInstr &MI, MachineRegisterInfo &MRI, 2583 MachineIRBuilder &B) const { 2584 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2585 assert(Ty.isScalar()); 2586 2587 MachineFunction &MF = B.getMF(); 2588 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2589 2590 // TODO: Always legal with future ftz flag. 2591 // FIXME: Do we need just output? 2592 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2593 return true; 2594 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2595 return true; 2596 2597 MachineIRBuilder HelperBuilder(MI); 2598 GISelObserverWrapper DummyObserver; 2599 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2600 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2601 } 2602 2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2604 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2605 Register DstReg = MI.getOperand(0).getReg(); 2606 Register PtrReg = MI.getOperand(1).getReg(); 2607 Register CmpVal = MI.getOperand(2).getReg(); 2608 Register NewVal = MI.getOperand(3).getReg(); 2609 2610 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2611 "this should not have been custom lowered"); 2612 2613 LLT ValTy = MRI.getType(CmpVal); 2614 LLT VecTy = LLT::fixed_vector(2, ValTy); 2615 2616 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2617 2618 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2619 .addDef(DstReg) 2620 .addUse(PtrReg) 2621 .addUse(PackedVal) 2622 .setMemRefs(MI.memoperands()); 2623 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 bool AMDGPULegalizerInfo::legalizeFlog( 2629 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2630 Register Dst = MI.getOperand(0).getReg(); 2631 Register Src = MI.getOperand(1).getReg(); 2632 LLT Ty = B.getMRI()->getType(Dst); 2633 unsigned Flags = MI.getFlags(); 2634 2635 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2636 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2637 2638 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2639 MI.eraseFromParent(); 2640 return true; 2641 } 2642 2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2644 MachineIRBuilder &B) const { 2645 Register Dst = MI.getOperand(0).getReg(); 2646 Register Src = MI.getOperand(1).getReg(); 2647 unsigned Flags = MI.getFlags(); 2648 LLT Ty = B.getMRI()->getType(Dst); 2649 2650 auto K = B.buildFConstant(Ty, numbers::log2e); 2651 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2652 B.buildFExp2(Dst, Mul, Flags); 2653 MI.eraseFromParent(); 2654 return true; 2655 } 2656 2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2658 MachineIRBuilder &B) const { 2659 Register Dst = MI.getOperand(0).getReg(); 2660 Register Src0 = MI.getOperand(1).getReg(); 2661 Register Src1 = MI.getOperand(2).getReg(); 2662 unsigned Flags = MI.getFlags(); 2663 LLT Ty = B.getMRI()->getType(Dst); 2664 const LLT S16 = LLT::scalar(16); 2665 const LLT S32 = LLT::scalar(32); 2666 2667 if (Ty == S32) { 2668 auto Log = B.buildFLog2(S32, Src0, Flags); 2669 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2670 .addUse(Log.getReg(0)) 2671 .addUse(Src1) 2672 .setMIFlags(Flags); 2673 B.buildFExp2(Dst, Mul, Flags); 2674 } else if (Ty == S16) { 2675 // There's no f16 fmul_legacy, so we need to convert for it. 2676 auto Log = B.buildFLog2(S16, Src0, Flags); 2677 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2678 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2679 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2680 .addUse(Ext0.getReg(0)) 2681 .addUse(Ext1.getReg(0)) 2682 .setMIFlags(Flags); 2683 2684 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2685 } else 2686 return false; 2687 2688 MI.eraseFromParent(); 2689 return true; 2690 } 2691 2692 // Find a source register, ignoring any possible source modifiers. 2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2694 Register ModSrc = OrigSrc; 2695 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2696 ModSrc = SrcFNeg->getOperand(1).getReg(); 2697 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2698 ModSrc = SrcFAbs->getOperand(1).getReg(); 2699 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2700 ModSrc = SrcFAbs->getOperand(1).getReg(); 2701 return ModSrc; 2702 } 2703 2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2705 MachineRegisterInfo &MRI, 2706 MachineIRBuilder &B) const { 2707 2708 const LLT S1 = LLT::scalar(1); 2709 const LLT S64 = LLT::scalar(64); 2710 Register Dst = MI.getOperand(0).getReg(); 2711 Register OrigSrc = MI.getOperand(1).getReg(); 2712 unsigned Flags = MI.getFlags(); 2713 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2714 "this should not have been custom lowered"); 2715 2716 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2717 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2718 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2719 // V_FRACT bug is: 2720 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2721 // 2722 // Convert floor(x) to (x - fract(x)) 2723 2724 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2725 .addUse(OrigSrc) 2726 .setMIFlags(Flags); 2727 2728 // Give source modifier matching some assistance before obscuring a foldable 2729 // pattern. 2730 2731 // TODO: We can avoid the neg on the fract? The input sign to fract 2732 // shouldn't matter? 2733 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2734 2735 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2736 2737 Register Min = MRI.createGenericVirtualRegister(S64); 2738 2739 // We don't need to concern ourselves with the snan handling difference, so 2740 // use the one which will directly select. 
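  // The IEEE variant is the form that gets selected when ieee_mode is
  // enabled, and the plain form is when it is not; see legalizeMinNumMaxNum
  // above.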
2741 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2742 if (MFI->getMode().IEEE) 2743 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2744 else 2745 B.buildFMinNum(Min, Fract, Const, Flags); 2746 2747 Register CorrectedFract = Min; 2748 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2749 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2750 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2751 } 2752 2753 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2754 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2755 2756 MI.eraseFromParent(); 2757 return true; 2758 } 2759 2760 // Turn an illegal packed v2s16 build vector into bit operations. 2761 // TODO: This should probably be a bitcast action in LegalizerHelper. 2762 bool AMDGPULegalizerInfo::legalizeBuildVector( 2763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2764 Register Dst = MI.getOperand(0).getReg(); 2765 const LLT S32 = LLT::scalar(32); 2766 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2767 2768 Register Src0 = MI.getOperand(1).getReg(); 2769 Register Src1 = MI.getOperand(2).getReg(); 2770 assert(MRI.getType(Src0) == LLT::scalar(16)); 2771 2772 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2773 B.buildBitcast(Dst, Merge); 2774 2775 MI.eraseFromParent(); 2776 return true; 2777 } 2778 2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2781 // case with a single min instruction instead of a compare+select. 2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2783 MachineRegisterInfo &MRI, 2784 MachineIRBuilder &B) const { 2785 Register Dst = MI.getOperand(0).getReg(); 2786 Register Src = MI.getOperand(1).getReg(); 2787 LLT DstTy = MRI.getType(Dst); 2788 LLT SrcTy = MRI.getType(Src); 2789 2790 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2791 ? AMDGPU::G_AMDGPU_FFBH_U32 2792 : AMDGPU::G_AMDGPU_FFBL_B32; 2793 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2794 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2795 2796 MI.eraseFromParent(); 2797 return true; 2798 } 2799 2800 // Check that this is a G_XOR x, -1 2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2802 if (MI.getOpcode() != TargetOpcode::G_XOR) 2803 return false; 2804 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2805 return ConstVal && *ConstVal == -1; 2806 } 2807 2808 // Return the use branch instruction, otherwise null if the usage is invalid. 2809 static MachineInstr * 2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2811 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2812 Register CondDef = MI.getOperand(0).getReg(); 2813 if (!MRI.hasOneNonDBGUse(CondDef)) 2814 return nullptr; 2815 2816 MachineBasicBlock *Parent = MI.getParent(); 2817 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2818 2819 if (isNot(MRI, *UseMI)) { 2820 Register NegatedCond = UseMI->getOperand(0).getReg(); 2821 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2822 return nullptr; 2823 2824 // We're deleting the def of this value, so we need to remove it. 
2825 UseMI->eraseFromParent(); 2826 2827 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2828 Negated = true; 2829 } 2830 2831 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2832 return nullptr; 2833 2834 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2835 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2836 if (Next == Parent->end()) { 2837 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2838 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2839 return nullptr; 2840 UncondBrTarget = &*NextMBB; 2841 } else { 2842 if (Next->getOpcode() != AMDGPU::G_BR) 2843 return nullptr; 2844 Br = &*Next; 2845 UncondBrTarget = Br->getOperand(0).getMBB(); 2846 } 2847 2848 return UseMI; 2849 } 2850 2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2852 const ArgDescriptor *Arg, 2853 const TargetRegisterClass *ArgRC, 2854 LLT ArgTy) const { 2855 MCRegister SrcReg = Arg->getRegister(); 2856 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2857 assert(DstReg.isVirtual() && "Virtual register expected"); 2858 2859 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2860 ArgTy); 2861 if (Arg->isMasked()) { 2862 // TODO: Should we try to emit this once in the entry block? 2863 const LLT S32 = LLT::scalar(32); 2864 const unsigned Mask = Arg->getMask(); 2865 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2866 2867 Register AndMaskSrc = LiveIn; 2868 2869 if (Shift != 0) { 2870 auto ShiftAmt = B.buildConstant(S32, Shift); 2871 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2872 } 2873 2874 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2875 } else { 2876 B.buildCopy(DstReg, LiveIn); 2877 } 2878 2879 return true; 2880 } 2881 2882 bool AMDGPULegalizerInfo::loadInputValue( 2883 Register DstReg, MachineIRBuilder &B, 2884 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2885 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2886 const ArgDescriptor *Arg; 2887 const TargetRegisterClass *ArgRC; 2888 LLT ArgTy; 2889 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2890 2891 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2892 return false; // TODO: Handle these 2893 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2894 } 2895 2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2897 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2898 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2899 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2900 return false; 2901 2902 MI.eraseFromParent(); 2903 return true; 2904 } 2905 2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2907 MachineRegisterInfo &MRI, 2908 MachineIRBuilder &B) const { 2909 Register Dst = MI.getOperand(0).getReg(); 2910 LLT DstTy = MRI.getType(Dst); 2911 LLT S16 = LLT::scalar(16); 2912 LLT S32 = LLT::scalar(32); 2913 LLT S64 = LLT::scalar(64); 2914 2915 if (DstTy == S16) 2916 return legalizeFDIV16(MI, MRI, B); 2917 if (DstTy == S32) 2918 return legalizeFDIV32(MI, MRI, B); 2919 if (DstTy == S64) 2920 return legalizeFDIV64(MI, MRI, B); 2921 2922 return false; 2923 } 2924 2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2926 Register DstDivReg, 2927 Register DstRemReg, 2928 Register X, 2929 Register Y) const { 2930 const LLT S1 = LLT::scalar(1); 2931 const LLT S32 = 
LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build an integer reciprocal sequence around V_RCP_IFLAG_F32.
//
// Returns the lo, hi halves of the result:
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
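  // Two refinement steps on the reciprocal estimate follow, mirroring the
  // "one round of UNR" in the 32-bit path: each computes the high 64 bits of
  // Est * (-Denom * Est) and adds it back into the current estimate.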
3030 3031 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3032 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3033 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3034 3035 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3036 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3037 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 3038 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3039 3040 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3041 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3042 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3043 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3044 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3045 3046 auto Zero32 = B.buildConstant(S32, 0); 3047 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3048 auto Add2_HiC = 3049 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 3050 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 3051 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3052 3053 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3054 Register NumerLo = UnmergeNumer.getReg(0); 3055 Register NumerHi = UnmergeNumer.getReg(1); 3056 3057 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3058 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3059 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3060 Register Mul3_Lo = UnmergeMul3.getReg(0); 3061 Register Mul3_Hi = UnmergeMul3.getReg(1); 3062 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3063 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3064 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3065 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3066 3067 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3068 Register DenomLo = UnmergeDenom.getReg(0); 3069 Register DenomHi = UnmergeDenom.getReg(1); 3070 3071 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3072 auto C1 = B.buildSExt(S32, CmpHi); 3073 3074 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3075 auto C2 = B.buildSExt(S32, CmpLo); 3076 3077 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3078 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3079 3080 // TODO: Here and below portions of the code can be enclosed into if/endif. 3081 // Currently control flow is unconditional and we have 4 selects after 3082 // potential endif to substitute PHIs. 3083 3084 // if C3 != 0 ... 
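  // (These "if"/"endif" markers are annotations only; everything is computed
  // unconditionally and the selects at the end stand in for the PHIs.)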
3085 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3086 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3087 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3088 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3089 3090 auto One64 = B.buildConstant(S64, 1); 3091 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3092 3093 auto C4 = 3094 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3095 auto C5 = 3096 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3097 auto C6 = B.buildSelect( 3098 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3099 3100 // if (C6 != 0) 3101 auto Add4 = B.buildAdd(S64, Add3, One64); 3102 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3103 3104 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3105 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3106 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3107 3108 // endif C6 3109 // endif C3 3110 3111 if (DstDivReg) { 3112 auto Sel1 = B.buildSelect( 3113 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3114 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3115 Sel1, MulHi3); 3116 } 3117 3118 if (DstRemReg) { 3119 auto Sel2 = B.buildSelect( 3120 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3121 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3122 Sel2, Sub1); 3123 } 3124 } 3125 3126 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3127 MachineRegisterInfo &MRI, 3128 MachineIRBuilder &B) const { 3129 Register DstDivReg, DstRemReg; 3130 switch (MI.getOpcode()) { 3131 default: 3132 llvm_unreachable("Unexpected opcode!"); 3133 case AMDGPU::G_UDIV: { 3134 DstDivReg = MI.getOperand(0).getReg(); 3135 break; 3136 } 3137 case AMDGPU::G_UREM: { 3138 DstRemReg = MI.getOperand(0).getReg(); 3139 break; 3140 } 3141 case AMDGPU::G_UDIVREM: { 3142 DstDivReg = MI.getOperand(0).getReg(); 3143 DstRemReg = MI.getOperand(1).getReg(); 3144 break; 3145 } 3146 } 3147 3148 const LLT S64 = LLT::scalar(64); 3149 const LLT S32 = LLT::scalar(32); 3150 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3151 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3152 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3153 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3154 3155 if (Ty == S32) 3156 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3157 else if (Ty == S64) 3158 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3159 else 3160 return false; 3161 3162 MI.eraseFromParent(); 3163 return true; 3164 } 3165 3166 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3167 MachineRegisterInfo &MRI, 3168 MachineIRBuilder &B) const { 3169 const LLT S64 = LLT::scalar(64); 3170 const LLT S32 = LLT::scalar(32); 3171 3172 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3173 if (Ty != S32 && Ty != S64) 3174 return false; 3175 3176 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3177 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3178 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3179 3180 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3181 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3182 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3183 3184 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3185 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3186 3187 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3188 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3189 3190 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3191 switch (MI.getOpcode()) { 3192 default: 3193 llvm_unreachable("Unexpected opcode!"); 3194 case AMDGPU::G_SDIV: { 3195 DstDivReg = MI.getOperand(0).getReg(); 3196 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3197 break; 3198 } 3199 case AMDGPU::G_SREM: { 3200 DstRemReg = MI.getOperand(0).getReg(); 3201 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3202 break; 3203 } 3204 case AMDGPU::G_SDIVREM: { 3205 DstDivReg = MI.getOperand(0).getReg(); 3206 DstRemReg = MI.getOperand(1).getReg(); 3207 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3208 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3209 break; 3210 } 3211 } 3212 3213 if (Ty == S32) 3214 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3215 else 3216 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3217 3218 if (DstDivReg) { 3219 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3220 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3221 B.buildSub(DstDivReg, SignXor, Sign); 3222 } 3223 3224 if (DstRemReg) { 3225 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3226 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3227 B.buildSub(DstRemReg, SignXor, Sign); 3228 } 3229 3230 MI.eraseFromParent(); 3231 return true; 3232 } 3233 3234 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3235 MachineRegisterInfo &MRI, 3236 MachineIRBuilder &B) const { 3237 Register Res = MI.getOperand(0).getReg(); 3238 Register LHS = MI.getOperand(1).getReg(); 3239 Register RHS = MI.getOperand(2).getReg(); 3240 uint16_t Flags = MI.getFlags(); 3241 LLT ResTy = MRI.getType(Res); 3242 3243 const MachineFunction &MF = B.getMF(); 3244 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3245 MI.getFlag(MachineInstr::FmAfn); 3246 3247 if (!AllowInaccurateRcp) 3248 return false; 3249 3250 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3251 // 1 / x -> RCP(x) 3252 if (CLHS->isExactlyValue(1.0)) { 3253 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3254 .addUse(RHS) 3255 .setMIFlags(Flags); 3256 3257 MI.eraseFromParent(); 3258 return true; 3259 } 3260 3261 // -1 / x -> RCP( FNEG(x) ) 3262 if (CLHS->isExactlyValue(-1.0)) { 3263 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3264 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3265 .addUse(FNeg.getReg(0)) 3266 .setMIFlags(Flags); 3267 3268 MI.eraseFromParent(); 3269 return true; 3270 } 3271 } 3272 3273 // x / y -> x * (1.0 / y) 3274 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3275 .addUse(RHS) 3276 .setMIFlags(Flags); 3277 B.buildFMul(Res, LHS, RCP, Flags); 3278 3279 MI.eraseFromParent(); 3280 return true; 3281 } 3282 3283 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3284 MachineRegisterInfo &MRI, 3285 MachineIRBuilder &B) const { 3286 Register Res = MI.getOperand(0).getReg(); 3287 Register X = MI.getOperand(1).getReg(); 3288 Register Y = MI.getOperand(2).getReg(); 3289 uint16_t Flags = MI.getFlags(); 3290 LLT ResTy = MRI.getType(Res); 3291 3292 const MachineFunction &MF = B.getMF(); 3293 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3294 MI.getFlag(MachineInstr::FmAfn); 3295 3296 if (!AllowInaccurateRcp) 3297 return false; 3298 3299 auto NegY = B.buildFNeg(ResTy, Y); 3300 auto One = B.buildFConstant(ResTy, 1.0); 3301 3302 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3303 .addUse(Y) 3304 .setMIFlags(Flags); 3305 3306 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3307 R = B.buildFMA(ResTy, Tmp0, R, R); 3308 3309 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3310 R = B.buildFMA(ResTy, Tmp1, R, R); 3311 3312 auto Ret = B.buildFMul(ResTy, X, R); 3313 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3314 3315 B.buildFMA(Res, Tmp2, R, Ret); 3316 MI.eraseFromParent(); 3317 return true; 3318 } 3319 3320 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3321 MachineRegisterInfo &MRI, 3322 MachineIRBuilder &B) const { 3323 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3324 return true; 3325 3326 Register Res = MI.getOperand(0).getReg(); 3327 Register LHS = MI.getOperand(1).getReg(); 3328 Register RHS = MI.getOperand(2).getReg(); 3329 3330 uint16_t Flags = MI.getFlags(); 3331 3332 LLT S16 = LLT::scalar(16); 3333 LLT S32 = LLT::scalar(32); 3334 3335 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3336 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3337 3338 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3339 .addUse(RHSExt.getReg(0)) 3340 .setMIFlags(Flags); 3341 3342 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3343 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3344 3345 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3346 .addUse(RDst.getReg(0)) 3347 .addUse(RHS) 3348 .addUse(LHS) 3349 .setMIFlags(Flags); 3350 3351 MI.eraseFromParent(); 3352 return true; 3353 } 3354 3355 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3356 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3357 static void toggleSPDenormMode(bool Enable, 3358 MachineIRBuilder &B, 3359 const GCNSubtarget &ST, 3360 AMDGPU::SIModeRegisterDefaults Mode) { 3361 // Set SP denorm mode to this value. 3362 unsigned SPDenormMode = 3363 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3364 3365 if (ST.hasDenormModeInst()) { 3366 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3367 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3368 3369 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3370 B.buildInstr(AMDGPU::S_DENORM_MODE) 3371 .addImm(NewDenormModeValue); 3372 3373 } else { 3374 // Select FP32 bit field in mode register. 
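// Note added for clarity (not in the original source): the field selector
// built below packs the hardware register ID together with a bit offset and a
// width-minus-one value. The constants used here select a 2-bit field at bit
// offset 4 of the MODE register, which is where the FP32 (single-precision)
// denorm control bits live, leaving the FP64/FP16 bits above it untouched.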
3375 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3376 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3377 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3378 3379 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3380 .addImm(SPDenormMode) 3381 .addImm(SPDenormModeBitField); 3382 } 3383 } 3384 3385 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3386 MachineRegisterInfo &MRI, 3387 MachineIRBuilder &B) const { 3388 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3389 return true; 3390 3391 Register Res = MI.getOperand(0).getReg(); 3392 Register LHS = MI.getOperand(1).getReg(); 3393 Register RHS = MI.getOperand(2).getReg(); 3394 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3395 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3396 3397 uint16_t Flags = MI.getFlags(); 3398 3399 LLT S32 = LLT::scalar(32); 3400 LLT S1 = LLT::scalar(1); 3401 3402 auto One = B.buildFConstant(S32, 1.0f); 3403 3404 auto DenominatorScaled = 3405 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3406 .addUse(LHS) 3407 .addUse(RHS) 3408 .addImm(0) 3409 .setMIFlags(Flags); 3410 auto NumeratorScaled = 3411 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3412 .addUse(LHS) 3413 .addUse(RHS) 3414 .addImm(1) 3415 .setMIFlags(Flags); 3416 3417 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3418 .addUse(DenominatorScaled.getReg(0)) 3419 .setMIFlags(Flags); 3420 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3421 3422 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3423 // aren't modeled as reading it. 3424 if (!Mode.allFP32Denormals()) 3425 toggleSPDenormMode(true, B, ST, Mode); 3426 3427 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3428 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3429 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3430 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3431 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3432 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3433 3434 if (!Mode.allFP32Denormals()) 3435 toggleSPDenormMode(false, B, ST, Mode); 3436 3437 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3438 .addUse(Fma4.getReg(0)) 3439 .addUse(Fma1.getReg(0)) 3440 .addUse(Fma3.getReg(0)) 3441 .addUse(NumeratorScaled.getReg(1)) 3442 .setMIFlags(Flags); 3443 3444 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3445 .addUse(Fmas.getReg(0)) 3446 .addUse(RHS) 3447 .addUse(LHS) 3448 .setMIFlags(Flags); 3449 3450 MI.eraseFromParent(); 3451 return true; 3452 } 3453 3454 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3455 MachineRegisterInfo &MRI, 3456 MachineIRBuilder &B) const { 3457 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3458 return true; 3459 3460 Register Res = MI.getOperand(0).getReg(); 3461 Register LHS = MI.getOperand(1).getReg(); 3462 Register RHS = MI.getOperand(2).getReg(); 3463 3464 uint16_t Flags = MI.getFlags(); 3465 3466 LLT S64 = LLT::scalar(64); 3467 LLT S1 = LLT::scalar(1); 3468 3469 auto One = B.buildFConstant(S64, 1.0); 3470 3471 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3472 .addUse(LHS) 3473 .addUse(RHS) 3474 .addImm(0) 3475 .setMIFlags(Flags); 3476 3477 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3478 3479 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3480 .addUse(DivScale0.getReg(0)) 3481 .setMIFlags(Flags); 3482 3483 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3484 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3485 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3486 3487 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3488 .addUse(LHS) 3489 .addUse(RHS) 3490 .addImm(1) 3491 .setMIFlags(Flags); 3492 3493 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3494 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3495 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3496 3497 Register Scale; 3498 if (!ST.hasUsableDivScaleConditionOutput()) { 3499 // Workaround a hardware bug on SI where the condition output from div_scale 3500 // is not usable. 3501 3502 LLT S32 = LLT::scalar(32); 3503 3504 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3505 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3506 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3507 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3508 3509 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3510 Scale1Unmerge.getReg(1)); 3511 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3512 Scale0Unmerge.getReg(1)); 3513 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3514 } else { 3515 Scale = DivScale1.getReg(1); 3516 } 3517 3518 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3519 .addUse(Fma4.getReg(0)) 3520 .addUse(Fma3.getReg(0)) 3521 .addUse(Mul.getReg(0)) 3522 .addUse(Scale) 3523 .setMIFlags(Flags); 3524 3525 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3526 .addUse(Fmas.getReg(0)) 3527 .addUse(RHS) 3528 .addUse(LHS) 3529 .setMIFlags(Flags); 3530 3531 MI.eraseFromParent(); 3532 return true; 3533 } 3534 3535 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3536 MachineRegisterInfo &MRI, 3537 MachineIRBuilder &B) const { 3538 Register Res = MI.getOperand(0).getReg(); 3539 Register LHS = MI.getOperand(2).getReg(); 3540 Register RHS = MI.getOperand(3).getReg(); 3541 uint16_t Flags = MI.getFlags(); 3542 3543 LLT S32 = LLT::scalar(32); 3544 LLT S1 = LLT::scalar(1); 3545 3546 auto Abs = B.buildFAbs(S32, RHS, Flags); 3547 const APFloat C0Val(1.0f); 3548 3549 auto C0 = B.buildConstant(S32, 0x6f800000); 3550 auto C1 = B.buildConstant(S32, 0x2f800000); 3551 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3552 3553 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3554 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3555 3556 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3557 3558 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3559 .addUse(Mul0.getReg(0)) 3560 .setMIFlags(Flags); 3561 3562 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3563 3564 B.buildFMul(Res, Sel, Mul1, Flags); 3565 3566 MI.eraseFromParent(); 3567 return true; 3568 } 3569 3570 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3571 // FIXME: Why do we handle this one but not other removed instructions? 3572 // 3573 // Reciprocal square root. The clamp prevents infinite results, clamping 3574 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3575 // +-max_float. 
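// Illustrative example (added): with this expansion an input of +0.0, whose
// reciprocal square root would be +inf, is clamped to +max_float by the
// fminnum below, and the symmetric fmaxnum against -max_float handles the
// negative-infinity case.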
3576 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3577 MachineRegisterInfo &MRI,
3578 MachineIRBuilder &B) const {
3579 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3580 return true;
3581
3582 Register Dst = MI.getOperand(0).getReg();
3583 Register Src = MI.getOperand(2).getReg();
3584 auto Flags = MI.getFlags();
3585
3586 LLT Ty = MRI.getType(Dst);
3587
3588 const fltSemantics *FltSemantics;
3589 if (Ty == LLT::scalar(32))
3590 FltSemantics = &APFloat::IEEEsingle();
3591 else if (Ty == LLT::scalar(64))
3592 FltSemantics = &APFloat::IEEEdouble();
3593 else
3594 return false;
3595
3596 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3597 .addUse(Src)
3598 .setMIFlags(Flags);
3599
3600 // We don't need to concern ourselves with the snan handling difference, since
3601 // the rsq result is already quieted (or not); use the variant which will directly select.
3602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3603 const bool UseIEEE = MFI->getMode().IEEE;
3604
3605 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3606 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3607 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3608
3609 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3610
3611 if (UseIEEE)
3612 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3613 else
3614 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3615 MI.eraseFromParent();
3616 return true;
3617 }
3618
3619 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3620 switch (IID) {
3621 case Intrinsic::amdgcn_ds_fadd:
3622 return AMDGPU::G_ATOMICRMW_FADD;
3623 case Intrinsic::amdgcn_ds_fmin:
3624 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3625 case Intrinsic::amdgcn_ds_fmax:
3626 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3627 default:
3628 llvm_unreachable("not a DS FP intrinsic");
3629 }
3630 }
3631
3632 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3633 MachineInstr &MI,
3634 Intrinsic::ID IID) const {
3635 GISelChangeObserver &Observer = Helper.Observer;
3636 Observer.changingInstr(MI);
3637
3638 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3639
3640 // The remaining operands were used to set fields in the MemOperand on
3641 // construction.
3642 for (int I = 6; I > 3; --I)
3643 MI.RemoveOperand(I);
3644
3645 MI.RemoveOperand(1); // Remove the intrinsic ID.
3646 Observer.changedInstr(MI); 3647 return true; 3648 } 3649 3650 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3651 MachineRegisterInfo &MRI, 3652 MachineIRBuilder &B) const { 3653 uint64_t Offset = 3654 ST.getTargetLowering()->getImplicitParameterOffset( 3655 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3656 LLT DstTy = MRI.getType(DstReg); 3657 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3658 3659 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3660 if (!loadInputValue(KernargPtrReg, B, 3661 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3662 return false; 3663 3664 // FIXME: This should be nuw 3665 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3666 return true; 3667 } 3668 3669 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3670 MachineRegisterInfo &MRI, 3671 MachineIRBuilder &B) const { 3672 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3673 if (!MFI->isEntryFunction()) { 3674 return legalizePreloadedArgIntrin(MI, MRI, B, 3675 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3676 } 3677 3678 Register DstReg = MI.getOperand(0).getReg(); 3679 if (!getImplicitArgPtr(DstReg, MRI, B)) 3680 return false; 3681 3682 MI.eraseFromParent(); 3683 return true; 3684 } 3685 3686 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3687 MachineRegisterInfo &MRI, 3688 MachineIRBuilder &B, 3689 unsigned AddrSpace) const { 3690 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3691 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3692 Register Hi32 = Unmerge.getReg(1); 3693 3694 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3695 MI.eraseFromParent(); 3696 return true; 3697 } 3698 3699 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3700 // offset (the offset that is included in bounds checking and swizzling, to be 3701 // split between the instruction's voffset and immoffset fields) and soffset 3702 // (the offset that is excluded from bounds checking and swizzling, to go in 3703 // the instruction's soffset field). This function takes the first kind of 3704 // offset and figures out how to split it between voffset and immoffset. 3705 std::pair<Register, unsigned> 3706 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3707 Register OrigOffset) const { 3708 const unsigned MaxImm = 4095; 3709 Register BaseReg; 3710 unsigned ImmOffset; 3711 const LLT S32 = LLT::scalar(32); 3712 MachineRegisterInfo &MRI = *B.getMRI(); 3713 3714 std::tie(BaseReg, ImmOffset) = 3715 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3716 3717 // If BaseReg is a pointer, convert it to int. 3718 if (MRI.getType(BaseReg).isPointer()) 3719 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3720 3721 // If the immediate value is too big for the immoffset field, put the value 3722 // and -4096 into the immoffset field so that the value that is copied/added 3723 // for the voffset field is a multiple of 4096, and it stands more chance 3724 // of being CSEd with the copy/add for another similar load/store. 3725 // However, do not do that rounding down to a multiple of 4096 if that is a 3726 // negative number, as it appears to be illegal to have a negative offset 3727 // in the vgpr, even if adding the immediate offset makes it positive. 
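// Worked example (added for illustration): with MaxImm = 4095, a constant
// offset of 4200 splits into Overflow = 4096, which is added into the voffset
// register, and ImmOffset = 104. For an offset such as 0xFFFFF100 the
// rounded-down value would be negative, so the entire offset is instead moved
// into the voffset register and ImmOffset is reset to 0.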
3728 unsigned Overflow = ImmOffset & ~MaxImm; 3729 ImmOffset -= Overflow; 3730 if ((int32_t)Overflow < 0) { 3731 Overflow += ImmOffset; 3732 ImmOffset = 0; 3733 } 3734 3735 if (Overflow != 0) { 3736 if (!BaseReg) { 3737 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3738 } else { 3739 auto OverflowVal = B.buildConstant(S32, Overflow); 3740 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3741 } 3742 } 3743 3744 if (!BaseReg) 3745 BaseReg = B.buildConstant(S32, 0).getReg(0); 3746 3747 return std::make_pair(BaseReg, ImmOffset); 3748 } 3749 3750 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3751 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3752 Register VOffset, Register SOffset, 3753 unsigned ImmOffset, Register VIndex, 3754 MachineRegisterInfo &MRI) const { 3755 Optional<ValueAndVReg> MaybeVOffsetVal = 3756 getConstantVRegValWithLookThrough(VOffset, MRI); 3757 Optional<ValueAndVReg> MaybeSOffsetVal = 3758 getConstantVRegValWithLookThrough(SOffset, MRI); 3759 Optional<ValueAndVReg> MaybeVIndexVal = 3760 getConstantVRegValWithLookThrough(VIndex, MRI); 3761 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3762 // update the MMO with that offset. The stride is unknown so we can only do 3763 // this if VIndex is constant 0. 3764 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3765 MaybeVIndexVal->Value == 0) { 3766 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3767 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3768 MMO->setOffset(TotalOffset); 3769 } else { 3770 // We don't have a constant combined offset to use in the MMO. Give up. 3771 MMO->setValue((Value *)nullptr); 3772 } 3773 } 3774 3775 /// Handle register layout difference for f16 images for some subtargets. 
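///
/// Summary added for clarity (derived from the code below): with unpacked D16
/// VMEM each s16 element is any-extended into its own s32 register; on
/// subtargets with the image-store D16 bug the packed value is padded with
/// undef elements and bitcast to a dword-sized vector; otherwise the packed
/// <N x s16> value is returned unchanged.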
3776 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3777 MachineRegisterInfo &MRI, 3778 Register Reg, 3779 bool ImageStore) const { 3780 const LLT S16 = LLT::scalar(16); 3781 const LLT S32 = LLT::scalar(32); 3782 LLT StoreVT = MRI.getType(Reg); 3783 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3784 3785 if (ST.hasUnpackedD16VMem()) { 3786 auto Unmerge = B.buildUnmerge(S16, Reg); 3787 3788 SmallVector<Register, 4> WideRegs; 3789 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3790 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3791 3792 int NumElts = StoreVT.getNumElements(); 3793 3794 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3795 .getReg(0); 3796 } 3797 3798 if (ImageStore && ST.hasImageStoreD16Bug()) { 3799 if (StoreVT.getNumElements() == 2) { 3800 SmallVector<Register, 4> PackedRegs; 3801 Reg = B.buildBitcast(S32, Reg).getReg(0); 3802 PackedRegs.push_back(Reg); 3803 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3804 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3805 .getReg(0); 3806 } 3807 3808 if (StoreVT.getNumElements() == 3) { 3809 SmallVector<Register, 4> PackedRegs; 3810 auto Unmerge = B.buildUnmerge(S16, Reg); 3811 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3812 PackedRegs.push_back(Unmerge.getReg(I)); 3813 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3814 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3815 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3816 } 3817 3818 if (StoreVT.getNumElements() == 4) { 3819 SmallVector<Register, 4> PackedRegs; 3820 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3821 auto Unmerge = B.buildUnmerge(S32, Reg); 3822 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3823 PackedRegs.push_back(Unmerge.getReg(I)); 3824 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3825 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3826 .getReg(0); 3827 } 3828 3829 llvm_unreachable("invalid data type"); 3830 } 3831 3832 return Reg; 3833 } 3834 3835 Register AMDGPULegalizerInfo::fixStoreSourceType( 3836 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3837 MachineRegisterInfo *MRI = B.getMRI(); 3838 LLT Ty = MRI->getType(VData); 3839 3840 const LLT S16 = LLT::scalar(16); 3841 3842 // Fixup illegal register types for i8 stores. 3843 if (Ty == LLT::scalar(8) || Ty == S16) { 3844 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3845 return AnyExt; 3846 } 3847 3848 if (Ty.isVector()) { 3849 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3850 if (IsFormat) 3851 return handleD16VData(B, *MRI, VData); 3852 } 3853 } 3854 3855 return VData; 3856 } 3857 3858 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3859 MachineRegisterInfo &MRI, 3860 MachineIRBuilder &B, 3861 bool IsTyped, 3862 bool IsFormat) const { 3863 Register VData = MI.getOperand(1).getReg(); 3864 LLT Ty = MRI.getType(VData); 3865 LLT EltTy = Ty.getScalarType(); 3866 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3867 const LLT S32 = LLT::scalar(32); 3868 3869 VData = fixStoreSourceType(B, VData, IsFormat); 3870 Register RSrc = MI.getOperand(2).getReg(); 3871 3872 MachineMemOperand *MMO = *MI.memoperands_begin(); 3873 const int MemSize = MMO->getSize(); 3874 3875 unsigned ImmOffset; 3876 3877 // The typed intrinsics add an immediate after the registers. 
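// For reference (added note): a struct (vindex) form of the plain buffer
// store has 7 operands in total: intrinsic id, vdata, rsrc, vindex, voffset,
// soffset and the cachepolicy/aux immediate; the typed (tbuffer) form also
// carries a format immediate, giving 8. Raw forms have one operand fewer,
// which is how HasVIndex is derived below.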
3878 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3879 3880 // The struct intrinsic variants add one additional operand over raw. 3881 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3882 Register VIndex; 3883 int OpOffset = 0; 3884 if (HasVIndex) { 3885 VIndex = MI.getOperand(3).getReg(); 3886 OpOffset = 1; 3887 } else { 3888 VIndex = B.buildConstant(S32, 0).getReg(0); 3889 } 3890 3891 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3892 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3893 3894 unsigned Format = 0; 3895 if (IsTyped) { 3896 Format = MI.getOperand(5 + OpOffset).getImm(); 3897 ++OpOffset; 3898 } 3899 3900 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3901 3902 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3903 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3904 3905 unsigned Opc; 3906 if (IsTyped) { 3907 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3908 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3909 } else if (IsFormat) { 3910 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3911 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3912 } else { 3913 switch (MemSize) { 3914 case 1: 3915 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3916 break; 3917 case 2: 3918 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3919 break; 3920 default: 3921 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3922 break; 3923 } 3924 } 3925 3926 auto MIB = B.buildInstr(Opc) 3927 .addUse(VData) // vdata 3928 .addUse(RSrc) // rsrc 3929 .addUse(VIndex) // vindex 3930 .addUse(VOffset) // voffset 3931 .addUse(SOffset) // soffset 3932 .addImm(ImmOffset); // offset(imm) 3933 3934 if (IsTyped) 3935 MIB.addImm(Format); 3936 3937 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3938 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3939 .addMemOperand(MMO); 3940 3941 MI.eraseFromParent(); 3942 return true; 3943 } 3944 3945 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3946 MachineRegisterInfo &MRI, 3947 MachineIRBuilder &B, 3948 bool IsFormat, 3949 bool IsTyped) const { 3950 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3951 MachineMemOperand *MMO = *MI.memoperands_begin(); 3952 const LLT MemTy = MMO->getMemoryType(); 3953 const LLT S32 = LLT::scalar(32); 3954 3955 Register Dst = MI.getOperand(0).getReg(); 3956 Register RSrc = MI.getOperand(2).getReg(); 3957 3958 // The typed intrinsics add an immediate after the registers. 3959 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3960 3961 // The struct intrinsic variants add one additional operand over raw. 
3962 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3963 Register VIndex; 3964 int OpOffset = 0; 3965 if (HasVIndex) { 3966 VIndex = MI.getOperand(3).getReg(); 3967 OpOffset = 1; 3968 } else { 3969 VIndex = B.buildConstant(S32, 0).getReg(0); 3970 } 3971 3972 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3973 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3974 3975 unsigned Format = 0; 3976 if (IsTyped) { 3977 Format = MI.getOperand(5 + OpOffset).getImm(); 3978 ++OpOffset; 3979 } 3980 3981 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3982 unsigned ImmOffset; 3983 3984 LLT Ty = MRI.getType(Dst); 3985 LLT EltTy = Ty.getScalarType(); 3986 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3987 const bool Unpacked = ST.hasUnpackedD16VMem(); 3988 3989 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3990 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3991 3992 unsigned Opc; 3993 3994 if (IsTyped) { 3995 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3996 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3997 } else if (IsFormat) { 3998 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3999 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4000 } else { 4001 switch (MemTy.getSizeInBits()) { 4002 case 8: 4003 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4004 break; 4005 case 16: 4006 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4007 break; 4008 default: 4009 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4010 break; 4011 } 4012 } 4013 4014 Register LoadDstReg; 4015 4016 bool IsExtLoad = 4017 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4018 LLT UnpackedTy = Ty.changeElementSize(32); 4019 4020 if (IsExtLoad) 4021 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4022 else if (Unpacked && IsD16 && Ty.isVector()) 4023 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4024 else 4025 LoadDstReg = Dst; 4026 4027 auto MIB = B.buildInstr(Opc) 4028 .addDef(LoadDstReg) // vdata 4029 .addUse(RSrc) // rsrc 4030 .addUse(VIndex) // vindex 4031 .addUse(VOffset) // voffset 4032 .addUse(SOffset) // soffset 4033 .addImm(ImmOffset); // offset(imm) 4034 4035 if (IsTyped) 4036 MIB.addImm(Format); 4037 4038 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4039 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4040 .addMemOperand(MMO); 4041 4042 if (LoadDstReg != Dst) { 4043 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4044 4045 // Widen result for extending loads was widened. 4046 if (IsExtLoad) 4047 B.buildTrunc(Dst, LoadDstReg); 4048 else { 4049 // Repack to original 16-bit vector result 4050 // FIXME: G_TRUNC should work, but legalization currently fails 4051 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4052 SmallVector<Register, 4> Repack; 4053 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4054 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4055 B.buildMerge(Dst, Repack); 4056 } 4057 } 4058 4059 MI.eraseFromParent(); 4060 return true; 4061 } 4062 4063 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4064 MachineIRBuilder &B, 4065 bool IsInc) const { 4066 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4067 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4068 B.buildInstr(Opc) 4069 .addDef(MI.getOperand(0).getReg()) 4070 .addUse(MI.getOperand(2).getReg()) 4071 .addUse(MI.getOperand(3).getReg()) 4072 .cloneMemRefs(MI); 4073 MI.eraseFromParent(); 4074 return true; 4075 } 4076 4077 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4078 switch (IntrID) { 4079 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4080 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4081 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4082 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4083 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4085 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4086 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4088 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4089 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4091 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4092 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4094 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4095 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4097 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4098 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4100 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4101 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4102 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4103 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4104 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4105 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4106 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4107 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4108 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4109 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4110 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4111 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4112 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4113 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4115 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4116 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4118 case Intrinsic::amdgcn_buffer_atomic_fadd: 4119 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4120 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4121 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4122 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4123 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4125 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4126 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4127 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4128 default: 4129 llvm_unreachable("unhandled atomic opcode"); 4130 } 4131 } 4132 4133 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4134 MachineIRBuilder &B, 4135 Intrinsic::ID IID) const { 4136 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4137 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4138 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4139 4140 Register Dst; 4141 4142 int OpOffset = 0; 4143 if (HasReturn) { 4144 // A few FP atomics do not support return values. 
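// (Added note: when the atomic has no return def, every source operand index
// used below shifts down by one, which is what OpOffset = -1 in the else
// branch accounts for.)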
4145 Dst = MI.getOperand(0).getReg(); 4146 } else { 4147 OpOffset = -1; 4148 } 4149 4150 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4151 Register CmpVal; 4152 4153 if (IsCmpSwap) { 4154 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4155 ++OpOffset; 4156 } 4157 4158 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4159 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4160 4161 // The struct intrinsic variants add one additional operand over raw. 4162 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4163 Register VIndex; 4164 if (HasVIndex) { 4165 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4166 ++OpOffset; 4167 } else { 4168 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4169 } 4170 4171 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4172 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4173 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4174 4175 MachineMemOperand *MMO = *MI.memoperands_begin(); 4176 4177 unsigned ImmOffset; 4178 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4179 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4180 4181 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4182 4183 if (HasReturn) 4184 MIB.addDef(Dst); 4185 4186 MIB.addUse(VData); // vdata 4187 4188 if (IsCmpSwap) 4189 MIB.addReg(CmpVal); 4190 4191 MIB.addUse(RSrc) // rsrc 4192 .addUse(VIndex) // vindex 4193 .addUse(VOffset) // voffset 4194 .addUse(SOffset) // soffset 4195 .addImm(ImmOffset) // offset(imm) 4196 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4197 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4198 .addMemOperand(MMO); 4199 4200 MI.eraseFromParent(); 4201 return true; 4202 } 4203 4204 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4205 /// vector with s16 typed elements. 4206 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4207 SmallVectorImpl<Register> &PackedAddrs, 4208 unsigned ArgOffset, 4209 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4210 bool IsA16, bool IsG16) { 4211 const LLT S16 = LLT::scalar(16); 4212 const LLT V2S16 = LLT::fixed_vector(2, 16); 4213 auto EndIdx = Intr->VAddrEnd; 4214 4215 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4216 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4217 if (!SrcOp.isReg()) 4218 continue; // _L to _LZ may have eliminated this. 4219 4220 Register AddrReg = SrcOp.getReg(); 4221 4222 if ((I < Intr->GradientStart) || 4223 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4224 (I >= Intr->CoordStart && !IsA16)) { 4225 // Handle any gradient or coordinate operands that should not be packed 4226 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4227 PackedAddrs.push_back(AddrReg); 4228 } else { 4229 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4230 // derivatives dx/dh and dx/dv are packed with undef. 
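// Illustrative example (added): for a 2-D sample.d the vaddr sequence is
// (ds/dh, dt/dh, ds/dv, dt/dv, s, t), which the code below packs into three
// <2 x s16> registers: (ds/dh, dt/dh), (ds/dv, dt/dv) and (s, t). In the 1-D
// case each lone derivative is instead paired with undef, as described above.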
4231 if (((I + 1) >= EndIdx) ||
4232 ((Intr->NumGradients / 2) % 2 == 1 &&
4233 (I == static_cast<unsigned>(Intr->GradientStart +
4234 (Intr->NumGradients / 2) - 1) ||
4235 I == static_cast<unsigned>(Intr->GradientStart +
4236 Intr->NumGradients - 1))) ||
4237 // Check for _L to _LZ optimization
4238 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4239 PackedAddrs.push_back(
4240 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4241 .getReg(0));
4242 } else {
4243 PackedAddrs.push_back(
4244 B.buildBuildVector(
4245 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4246 .getReg(0));
4247 ++I;
4248 }
4249 }
4250 }
4251 }
4252
4253 /// Convert from separate vaddr components to a single vector address register,
4254 /// and replace the remaining operands with $noreg.
4255 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4256 int DimIdx, int NumVAddrs) {
4257 const LLT S32 = LLT::scalar(32);
4258
4259 SmallVector<Register, 8> AddrRegs;
4260 for (int I = 0; I != NumVAddrs; ++I) {
4261 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4262 if (SrcOp.isReg()) {
4263 AddrRegs.push_back(SrcOp.getReg());
4264 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4265 }
4266 }
4267
4268 int NumAddrRegs = AddrRegs.size();
4269 if (NumAddrRegs != 1) {
4270 // Above 8 elements round up to next power of 2 (i.e. 16).
4271 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4272 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4273 auto Undef = B.buildUndef(S32);
4274 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4275 NumAddrRegs = RoundedNumRegs;
4276 }
4277
4278 auto VAddr =
4279 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4280 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4281 }
4282
4283 for (int I = 1; I != NumVAddrs; ++I) {
4284 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4285 if (SrcOp.isReg())
4286 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4287 }
4288 }
4289
4290 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4291 ///
4292 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4293 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4294 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4295 /// registers.
4296 ///
4297 /// We don't want to directly select image instructions just yet, but also want
4298 /// to expose all register repacking to the legalizer/combiners. We also don't
4299 /// want a selected instruction entering RegBankSelect. In order to avoid
4300 /// defining a multitude of intermediate image instructions, directly hack on
4301 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4302 /// now unnecessary arguments with $noreg.
4303 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4304 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4305 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4306
4307 const unsigned NumDefs = MI.getNumExplicitDefs();
4308 const unsigned ArgOffset = NumDefs + 1;
4309 bool IsTFE = NumDefs == 2;
4310 // We are only processing the operands of d16 image operations on subtargets
4311 // that use the unpacked register layout, or need to repack the TFE result.
4312
4313 // TODO: Do we need to guard against already legalized intrinsics?
4314 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4315 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4316 4317 MachineRegisterInfo *MRI = B.getMRI(); 4318 const LLT S32 = LLT::scalar(32); 4319 const LLT S16 = LLT::scalar(16); 4320 const LLT V2S16 = LLT::fixed_vector(2, 16); 4321 4322 unsigned DMask = 0; 4323 4324 // Check for 16 bit addresses and pack if true. 4325 LLT GradTy = 4326 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4327 LLT AddrTy = 4328 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4329 const bool IsG16 = GradTy == S16; 4330 const bool IsA16 = AddrTy == S16; 4331 4332 int DMaskLanes = 0; 4333 if (!BaseOpcode->Atomic) { 4334 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4335 if (BaseOpcode->Gather4) { 4336 DMaskLanes = 4; 4337 } else if (DMask != 0) { 4338 DMaskLanes = countPopulation(DMask); 4339 } else if (!IsTFE && !BaseOpcode->Store) { 4340 // If dmask is 0, this is a no-op load. This can be eliminated. 4341 B.buildUndef(MI.getOperand(0)); 4342 MI.eraseFromParent(); 4343 return true; 4344 } 4345 } 4346 4347 Observer.changingInstr(MI); 4348 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4349 4350 unsigned NewOpcode = NumDefs == 0 ? 4351 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4352 4353 // Track that we legalized this 4354 MI.setDesc(B.getTII().get(NewOpcode)); 4355 4356 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4357 // dmask to be at least 1 otherwise the instruction will fail 4358 if (IsTFE && DMask == 0) { 4359 DMask = 0x1; 4360 DMaskLanes = 1; 4361 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4362 } 4363 4364 if (BaseOpcode->Atomic) { 4365 Register VData0 = MI.getOperand(2).getReg(); 4366 LLT Ty = MRI->getType(VData0); 4367 4368 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4369 if (Ty.isVector()) 4370 return false; 4371 4372 if (BaseOpcode->AtomicX2) { 4373 Register VData1 = MI.getOperand(3).getReg(); 4374 // The two values are packed in one register. 4375 LLT PackedTy = LLT::fixed_vector(2, Ty); 4376 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4377 MI.getOperand(2).setReg(Concat.getReg(0)); 4378 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4379 } 4380 } 4381 4382 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4383 4384 // Optimize _L to _LZ when _L is zero 4385 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4386 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4387 const ConstantFP *ConstantLod; 4388 4389 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4390 m_GFCst(ConstantLod))) { 4391 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4392 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4393 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4394 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4395 Intr->Dim); 4396 4397 // The starting indexes should remain in the same place. 
4398 --CorrectedNumVAddrs; 4399 4400 MI.getOperand(MI.getNumExplicitDefs()) 4401 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4402 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4403 Intr = NewImageDimIntr; 4404 } 4405 } 4406 } 4407 4408 // Optimize _mip away, when 'lod' is zero 4409 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4410 int64_t ConstantLod; 4411 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4412 m_ICst(ConstantLod))) { 4413 if (ConstantLod == 0) { 4414 // TODO: Change intrinsic opcode and remove operand instead or replacing 4415 // it with 0, as the _L to _LZ handling is done above. 4416 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4417 --CorrectedNumVAddrs; 4418 } 4419 } 4420 } 4421 4422 // Rewrite the addressing register layout before doing anything else. 4423 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4424 // 16 bit gradients are supported, but are tied to the A16 control 4425 // so both gradients and addresses must be 16 bit 4426 return false; 4427 } 4428 4429 if (IsA16 && !ST.hasA16()) { 4430 // A16 not supported 4431 return false; 4432 } 4433 4434 if (IsA16 || IsG16) { 4435 if (Intr->NumVAddrs > 1) { 4436 SmallVector<Register, 4> PackedRegs; 4437 4438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4439 IsG16); 4440 4441 // See also below in the non-a16 branch 4442 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4443 PackedRegs.size() <= ST.getNSAMaxSize(); 4444 4445 if (!UseNSA && PackedRegs.size() > 1) { 4446 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4447 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4448 PackedRegs[0] = Concat.getReg(0); 4449 PackedRegs.resize(1); 4450 } 4451 4452 const unsigned NumPacked = PackedRegs.size(); 4453 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4454 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4455 if (!SrcOp.isReg()) { 4456 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4457 continue; 4458 } 4459 4460 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4461 4462 if (I - Intr->VAddrStart < NumPacked) 4463 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4464 else 4465 SrcOp.setReg(AMDGPU::NoRegister); 4466 } 4467 } 4468 } else { 4469 // If the register allocator cannot place the address registers contiguously 4470 // without introducing moves, then using the non-sequential address encoding 4471 // is always preferable, since it saves VALU instructions and is usually a 4472 // wash in terms of code size or even better. 4473 // 4474 // However, we currently have no way of hinting to the register allocator 4475 // that MIMG addresses should be placed contiguously when it is possible to 4476 // do so, so force non-NSA for the common 2-address case as a heuristic. 4477 // 4478 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4479 // allocation when possible. 4480 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4481 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4482 4483 if (!UseNSA && Intr->NumVAddrs > 1) 4484 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4485 Intr->NumVAddrs); 4486 } 4487 4488 int Flags = 0; 4489 if (IsA16) 4490 Flags |= 1; 4491 if (IsG16) 4492 Flags |= 2; 4493 MI.addOperand(MachineOperand::CreateImm(Flags)); 4494 4495 if (BaseOpcode->Store) { // No TFE for stores? 
4496 // TODO: Handle dmask trim
4497 Register VData = MI.getOperand(1).getReg();
4498 LLT Ty = MRI->getType(VData);
4499 if (!Ty.isVector() || Ty.getElementType() != S16)
4500 return true;
4501
4502 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4503 if (RepackedReg != VData) {
4504 MI.getOperand(1).setReg(RepackedReg);
4505 }
4506
4507 return true;
4508 }
4509
4510 Register DstReg = MI.getOperand(0).getReg();
4511 LLT Ty = MRI->getType(DstReg);
4512 const LLT EltTy = Ty.getScalarType();
4513 const bool IsD16 = Ty.getScalarType() == S16;
4514 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4515
4516 // Confirm that the return type is large enough for the dmask specified.
4517 if (NumElts < DMaskLanes)
4518 return false;
4519
4520 if (NumElts > 4 || DMaskLanes > 4)
4521 return false;
4522
4523 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4524 const LLT AdjustedTy =
4525 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4526
4527 // The raw dword-aligned data component of the load. The only legal cases
4528 // where this matters should be when using the packed D16 format, for
4529 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4530 LLT RoundedTy;
4531
4532 // S32 vector to cover all data, plus the TFE result element.
4533 LLT TFETy;
4534
4535 // Register type to use for each loaded component. Will be S32 or V2S16.
4536 LLT RegTy;
4537
4538 if (IsD16 && ST.hasUnpackedD16VMem()) {
4539 RoundedTy =
4540 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4541 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4542 RegTy = S32;
4543 } else {
4544 unsigned EltSize = EltTy.getSizeInBits();
4545 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4546 unsigned RoundedSize = 32 * RoundedElts;
4547 RoundedTy = LLT::scalarOrVector(
4548 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4549 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4550 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4551 }
4552
4553 // The return type does not need adjustment.
4554 // TODO: Should we change s16 case to s32 or <2 x s16>?
4555 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4556 return true;
4557
4558 Register Dst1Reg;
4559
4560 // Insert after the instruction.
4561 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4562
4563 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4564 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4565 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4566 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4567
4568 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4569
4570 MI.getOperand(0).setReg(NewResultReg);
4571
4572 // In the IR, TFE is supposed to be used with a 2-element struct return
4573 // type. The instruction really returns these two values in one contiguous
4574 // register, with one additional dword beyond the loaded data. Rewrite the
4575 // return type to use a single register result.
4576
4577 if (IsTFE) {
4578 Dst1Reg = MI.getOperand(1).getReg();
4579 if (MRI->getType(Dst1Reg) != S32)
4580 return false;
4581
4582 // TODO: Make sure the TFE operand bit is set.
4583 MI.RemoveOperand(1);
4584
4585 // Handle the easy case that requires no repack instructions.
4586 if (Ty == S32) {
4587 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4588 return true;
4589 }
4590 }
4591
4592 // Now figure out how to copy the new result register back into the old
4593 // result.
4594 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4595 4596 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4597 4598 if (ResultNumRegs == 1) { 4599 assert(!IsTFE); 4600 ResultRegs[0] = NewResultReg; 4601 } else { 4602 // We have to repack into a new vector of some kind. 4603 for (int I = 0; I != NumDataRegs; ++I) 4604 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4605 B.buildUnmerge(ResultRegs, NewResultReg); 4606 4607 // Drop the final TFE element to get the data part. The TFE result is 4608 // directly written to the right place already. 4609 if (IsTFE) 4610 ResultRegs.resize(NumDataRegs); 4611 } 4612 4613 // For an s16 scalar result, we form an s32 result with a truncate regardless 4614 // of packed vs. unpacked. 4615 if (IsD16 && !Ty.isVector()) { 4616 B.buildTrunc(DstReg, ResultRegs[0]); 4617 return true; 4618 } 4619 4620 // Avoid a build/concat_vector of 1 entry. 4621 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4622 B.buildBitcast(DstReg, ResultRegs[0]); 4623 return true; 4624 } 4625 4626 assert(Ty.isVector()); 4627 4628 if (IsD16) { 4629 // For packed D16 results with TFE enabled, all the data components are 4630 // S32. Cast back to the expected type. 4631 // 4632 // TODO: We don't really need to use load s32 elements. We would only need one 4633 // cast for the TFE result if a multiple of v2s16 was used. 4634 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4635 for (Register &Reg : ResultRegs) 4636 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4637 } else if (ST.hasUnpackedD16VMem()) { 4638 for (Register &Reg : ResultRegs) 4639 Reg = B.buildTrunc(S16, Reg).getReg(0); 4640 } 4641 } 4642 4643 auto padWithUndef = [&](LLT Ty, int NumElts) { 4644 if (NumElts == 0) 4645 return; 4646 Register Undef = B.buildUndef(Ty).getReg(0); 4647 for (int I = 0; I != NumElts; ++I) 4648 ResultRegs.push_back(Undef); 4649 }; 4650 4651 // Pad out any elements eliminated due to the dmask. 4652 LLT ResTy = MRI->getType(ResultRegs[0]); 4653 if (!ResTy.isVector()) { 4654 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4655 B.buildBuildVector(DstReg, ResultRegs); 4656 return true; 4657 } 4658 4659 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4660 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4661 4662 // Deal with the one annoying legal case. 4663 const LLT V3S16 = LLT::fixed_vector(3, 16); 4664 if (Ty == V3S16) { 4665 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4666 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4667 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4668 return true; 4669 } 4670 4671 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4672 B.buildConcatVectors(DstReg, ResultRegs); 4673 return true; 4674 } 4675 4676 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4677 LegalizerHelper &Helper, MachineInstr &MI) const { 4678 MachineIRBuilder &B = Helper.MIRBuilder; 4679 GISelChangeObserver &Observer = Helper.Observer; 4680 4681 Register Dst = MI.getOperand(0).getReg(); 4682 LLT Ty = B.getMRI()->getType(Dst); 4683 unsigned Size = Ty.getSizeInBits(); 4684 MachineFunction &MF = B.getMF(); 4685 4686 Observer.changingInstr(MI); 4687 4688 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4689 Ty = getBitcastRegisterType(Ty); 4690 Helper.bitcastDst(MI, Ty, 0); 4691 Dst = MI.getOperand(0).getReg(); 4692 B.setInsertPt(B.getMBB(), MI); 4693 } 4694 4695 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4696 // should be fixed to have a memory operand. Since it's readnone, we're not 4697 // allowed to add one. 4698 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4699 MI.RemoveOperand(1); // Remove intrinsic ID 4700 4701 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4702 // TODO: Should this use datalayout alignment? 4703 const unsigned MemSize = (Size + 7) / 8; 4704 const Align MemAlign(4); 4705 MachineMemOperand *MMO = MF.getMachineMemOperand( 4706 MachinePointerInfo(), 4707 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4708 MachineMemOperand::MOInvariant, 4709 MemSize, MemAlign); 4710 MI.addMemOperand(MF, MMO); 4711 4712 // There are no 96-bit result scalar loads, but widening to 128-bit should 4713 // always be legal. We may need to restore this to a 96-bit result if it turns 4714 // out this needs to be converted to a vector load during RegBankSelect. 4715 if (!isPowerOf2_32(Size)) { 4716 if (Ty.isVector()) 4717 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4718 else 4719 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4720 } 4721 4722 Observer.changedInstr(MI); 4723 return true; 4724 } 4725 4726 // TODO: Move to selection 4727 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4728 MachineRegisterInfo &MRI, 4729 MachineIRBuilder &B) const { 4730 if (!ST.isTrapHandlerEnabled() || 4731 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4732 return legalizeTrapEndpgm(MI, MRI, B); 4733 4734 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4735 switch (*HsaAbiVer) { 4736 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4737 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4738 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4739 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4740 return ST.supportsGetDoorbellID() ? 4741 legalizeTrapHsa(MI, MRI, B) : 4742 legalizeTrapHsaQueuePtr(MI, MRI, B); 4743 } 4744 } 4745 4746 llvm_unreachable("Unknown trap handler"); 4747 } 4748 4749 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4750 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4751 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4752 MI.eraseFromParent(); 4753 return true; 4754 } 4755 4756 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4757 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4758 // Pass queue pointer to trap handler as input, and insert trap instruction 4759 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4760 Register LiveIn = 4761 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4762 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4763 return false; 4764 4765 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4766 B.buildCopy(SGPR01, LiveIn); 4767 B.buildInstr(AMDGPU::S_TRAP) 4768 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4769 .addReg(SGPR01, RegState::Implicit); 4770 4771 MI.eraseFromParent(); 4772 return true; 4773 } 4774 4775 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4776 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4777 B.buildInstr(AMDGPU::S_TRAP) 4778 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4779 MI.eraseFromParent(); 4780 return true; 4781 } 4782 4783 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4784 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4785 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4786 // accordingly 4787 if (!ST.isTrapHandlerEnabled() || 4788 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4789 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4790 "debugtrap handler not supported", 4791 MI.getDebugLoc(), DS_Warning); 4792 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4793 Ctx.diagnose(NoTrap); 4794 } else { 4795 // Insert debug-trap instruction 4796 B.buildInstr(AMDGPU::S_TRAP) 4797 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4798 } 4799 4800 MI.eraseFromParent(); 4801 return true; 4802 } 4803 4804 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4805 MachineIRBuilder &B) const { 4806 MachineRegisterInfo &MRI = *B.getMRI(); 4807 const LLT S16 = LLT::scalar(16); 4808 const LLT S32 = LLT::scalar(32); 4809 4810 Register DstReg = MI.getOperand(0).getReg(); 4811 Register NodePtr = MI.getOperand(2).getReg(); 4812 Register RayExtent = MI.getOperand(3).getReg(); 4813 Register RayOrigin = MI.getOperand(4).getReg(); 4814 Register RayDir = MI.getOperand(5).getReg(); 4815 Register RayInvDir = MI.getOperand(6).getReg(); 4816 Register TDescr = MI.getOperand(7).getReg(); 4817 4818 if (!ST.hasGFX10_AEncoding()) { 4819 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4820 "intrinsic not supported on subtarget", 4821 MI.getDebugLoc()); 4822 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4823 return false; 4824 } 4825 4826 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4827 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4828 const unsigned NumVDataDwords = 4; 4829 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4830 const bool UseNSA = 4831 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4832 const unsigned BaseOpcodes[2][2] = { 4833 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4834 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4835 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4836 int Opcode; 4837 if (UseNSA) { 4838 Opcode = 4839 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4840 NumVDataDwords, NumVAddrDwords); 4841 } else { 4842 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4843 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4844 PowerOf2Ceil(NumVAddrDwords)); 4845 } 4846 assert(Opcode != -1); 4847 4848 SmallVector<Register, 12> Ops; 4849 if (Is64) { 4850 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4851 Ops.push_back(Unmerge.getReg(0)); 4852 Ops.push_back(Unmerge.getReg(1)); 4853 } else { 4854 Ops.push_back(NodePtr); 4855 } 4856 Ops.push_back(RayExtent); 4857 4858 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4859 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4860 Ops.push_back(Unmerge.getReg(0)); 4861 Ops.push_back(Unmerge.getReg(1)); 4862 Ops.push_back(Unmerge.getReg(2)); 4863 }; 4864 4865 packLanes(RayOrigin); 4866 if (IsA16) { 4867 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4868 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4869 Register R1 = MRI.createGenericVirtualRegister(S32); 4870 Register R2 = MRI.createGenericVirtualRegister(S32); 4871 Register R3 = MRI.createGenericVirtualRegister(S32); 4872 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4873 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4874 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4875 
Ops.push_back(R1); 4876 Ops.push_back(R2); 4877 Ops.push_back(R3); 4878 } else { 4879 packLanes(RayDir); 4880 packLanes(RayInvDir); 4881 } 4882 4883 if (!UseNSA) { 4884 // Build a single vector containing all the operands so far prepared. 4885 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4886 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4887 Ops.clear(); 4888 Ops.push_back(MergedOps); 4889 } 4890 4891 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4892 .addDef(DstReg) 4893 .addImm(Opcode); 4894 4895 for (Register R : Ops) { 4896 MIB.addUse(R); 4897 } 4898 4899 MIB.addUse(TDescr) 4900 .addImm(IsA16 ? 1 : 0) 4901 .cloneMemRefs(MI); 4902 4903 MI.eraseFromParent(); 4904 return true; 4905 } 4906 4907 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4908 MachineInstr &MI) const { 4909 MachineIRBuilder &B = Helper.MIRBuilder; 4910 MachineRegisterInfo &MRI = *B.getMRI(); 4911 4912 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4913 auto IntrID = MI.getIntrinsicID(); 4914 switch (IntrID) { 4915 case Intrinsic::amdgcn_if: 4916 case Intrinsic::amdgcn_else: { 4917 MachineInstr *Br = nullptr; 4918 MachineBasicBlock *UncondBrTarget = nullptr; 4919 bool Negated = false; 4920 if (MachineInstr *BrCond = 4921 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4922 const SIRegisterInfo *TRI 4923 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4924 4925 Register Def = MI.getOperand(1).getReg(); 4926 Register Use = MI.getOperand(3).getReg(); 4927 4928 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4929 4930 if (Negated) 4931 std::swap(CondBrTarget, UncondBrTarget); 4932 4933 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4934 if (IntrID == Intrinsic::amdgcn_if) { 4935 B.buildInstr(AMDGPU::SI_IF) 4936 .addDef(Def) 4937 .addUse(Use) 4938 .addMBB(UncondBrTarget); 4939 } else { 4940 B.buildInstr(AMDGPU::SI_ELSE) 4941 .addDef(Def) 4942 .addUse(Use) 4943 .addMBB(UncondBrTarget); 4944 } 4945 4946 if (Br) { 4947 Br->getOperand(0).setMBB(CondBrTarget); 4948 } else { 4949 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4950 // since we're swapping branch targets it needs to be reinserted. 
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
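    // The wavefront size is a fixed subtarget property (64, or 32 on wave32
    // subtargets), so the intrinsic simply folds to a constant here.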
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
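  // Remaining intrinsics: image-dimension intrinsics are handled via the
  // ImageDimIntrinsicInfo lookup below; anything not matched there is treated
  // as already legal.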
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}