1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46 static constexpr unsigned MaxRegisterSize = 1024;
47
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60 }
61
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
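/// For example, <3 x s16> (48 bits) does not fill whole 32-bit registers and is
/// widened to <4 x s16> (64 bits), while <3 x s32> is already a multiple of 32
/// bits and is left alone, and <3 x s1> is scalarized instead.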
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176 }
177
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183 }
184
185 static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188
189 static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192 }
193
194 static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199 }
200
201 static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209 }
210
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217 }
218
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227 }
228
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237 }
238
239 // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265 }
266
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289 #if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293 #endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331 }
332
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will work around this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353 }
354
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359 }
360
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377 }
378
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 unsigned AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414 }
415
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424 }
425
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 =
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 
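// For example, on this path an s8 G_UADDSAT is widened to s16 and selected
// using the clamp modifier noted below, while the signed saturating ops are
// always expanded by the generic lowering.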
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
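// e.g. a G_DYN_STACKALLOC whose size is uniform (SGPR) can be handled
// directly, while a divergent (VGPR) size first needs a wave-wide maximum of
// the requested sizes, which is introduced later rather than here.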
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
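// The custom action below defers to legalizeFMad(), which checks the
// function's denormal mode for the given type before deciding whether G_FMAD
// can be kept or must be expanded.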
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerIf(typeIs(1, S1)) 817 .customFor({{S32, S64}, {S64, S64}}); 818 if (ST.has16BitInsts()) 819 IToFP.legalFor({{S16, S16}}); 820 IToFP.clampScalar(1, S32, S64) 821 .minScalar(0, S32) 822 .scalarize(0) 823 .widenScalarToNextPow2(1); 824 825 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 826 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 827 .customFor({{S64, S32}, {S64, S64}}) 828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 829 if (ST.has16BitInsts()) 830 FPToI.legalFor({{S16, S16}}); 831 else 832 FPToI.minScalar(1, S32); 833 834 FPToI.minScalar(0, S32) 835 .widenScalarToNextPow2(0, 32) 836 .scalarize(0) 837 .lower(); 838 839 // Lower roundeven into G_FRINT 840 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 841 .scalarize(0) 842 .lower(); 843 844 if (ST.has16BitInsts()) { 845 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 846 .legalFor({S16, S32, S64}) 847 .clampScalar(0, S16, S64) 848 .scalarize(0); 849 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 850 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 851 .legalFor({S32, S64}) 852 .clampScalar(0, S32, S64) 853 .scalarize(0); 854 } else { 855 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 856 .legalFor({S32}) 857 .customFor({S64}) 858 .clampScalar(0, S32, S64) 859 .scalarize(0); 860 } 861 862 getActionDefinitionsBuilder(G_PTR_ADD) 863 .legalIf(all(isPointer(0), sameSize(0, 1))) 864 .scalarize(0) 865 .scalarSameSizeAs(1, 0); 866 867 getActionDefinitionsBuilder(G_PTRMASK) 868 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 869 .scalarSameSizeAs(1, 0) 870 .scalarize(0); 871 872 auto &CmpBuilder = 873 getActionDefinitionsBuilder(G_ICMP) 874 // The compare output type differs based on the register bank of the output, 875 // so make both s1 and s32 legal. 876 // 877 // Scalar compares producing output in scc will be promoted to s32, as that 878 // is the allocatable register type that will be needed for the copy from 879 // scc. 
This will be promoted during RegBankSelect, and we assume something 880 // before that won't try to use s32 result types. 881 // 882 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 883 // bank. 884 .legalForCartesianProduct( 885 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 886 .legalForCartesianProduct( 887 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 888 if (ST.has16BitInsts()) { 889 CmpBuilder.legalFor({{S1, S16}}); 890 } 891 892 CmpBuilder 893 .widenScalarToNextPow2(1) 894 .clampScalar(1, S32, S64) 895 .scalarize(0) 896 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 897 898 getActionDefinitionsBuilder(G_FCMP) 899 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 900 .widenScalarToNextPow2(1) 901 .clampScalar(1, S32, S64) 902 .scalarize(0); 903 904 // FIXME: fpow has a selection pattern that should move to custom lowering. 905 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 906 if (ST.has16BitInsts()) 907 Exp2Ops.legalFor({S32, S16}); 908 else 909 Exp2Ops.legalFor({S32}); 910 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 911 Exp2Ops.scalarize(0); 912 913 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 914 if (ST.has16BitInsts()) 915 ExpOps.customFor({{S32}, {S16}}); 916 else 917 ExpOps.customFor({S32}); 918 ExpOps.clampScalar(0, MinScalarFPTy, S32) 919 .scalarize(0); 920 921 getActionDefinitionsBuilder(G_FPOWI) 922 .clampScalar(0, MinScalarFPTy, S32) 923 .lower(); 924 925 // The 64-bit versions produce 32-bit results, but only on the SALU. 926 getActionDefinitionsBuilder(G_CTPOP) 927 .legalFor({{S32, S32}, {S32, S64}}) 928 .clampScalar(0, S32, S32) 929 .clampScalar(1, S32, S64) 930 .scalarize(0) 931 .widenScalarToNextPow2(0, 32) 932 .widenScalarToNextPow2(1, 32); 933 934 // The hardware instructions return a different result on 0 than the generic 935 // instructions expect. The hardware produces -1, but these produce the 936 // bitwidth. 937 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 938 .scalarize(0) 939 .clampScalar(0, S32, S32) 940 .clampScalar(1, S32, S64) 941 .widenScalarToNextPow2(0, 32) 942 .widenScalarToNextPow2(1, 32) 943 .custom(); 944 945 // The 64-bit versions produce 32-bit results, but only on the SALU. 946 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 947 .legalFor({{S32, S32}, {S32, S64}}) 948 .clampScalar(0, S32, S32) 949 .clampScalar(1, S32, S64) 950 .scalarize(0) 951 .widenScalarToNextPow2(0, 32) 952 .widenScalarToNextPow2(1, 32); 953 954 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 955 // RegBankSelect. 956 getActionDefinitionsBuilder(G_BITREVERSE) 957 .legalFor({S32, S64}) 958 .clampScalar(0, S32, S64) 959 .scalarize(0) 960 .widenScalarToNextPow2(0); 961 962 if (ST.has16BitInsts()) { 963 getActionDefinitionsBuilder(G_BSWAP) 964 .legalFor({S16, S32, V2S16}) 965 .clampMaxNumElements(0, S16, 2) 966 // FIXME: Fixing non-power-of-2 before clamp is workaround for 967 // narrowScalar limitation. 
968 .widenScalarToNextPow2(0) 969 .clampScalar(0, S16, S32) 970 .scalarize(0); 971 972 if (ST.hasVOP3PInsts()) { 973 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 974 .legalFor({S32, S16, V2S16}) 975 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 976 .clampMaxNumElements(0, S16, 2) 977 .minScalar(0, S16) 978 .widenScalarToNextPow2(0) 979 .scalarize(0) 980 .lower(); 981 } else { 982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 983 .legalFor({S32, S16}) 984 .widenScalarToNextPow2(0) 985 .minScalar(0, S16) 986 .scalarize(0) 987 .lower(); 988 } 989 } else { 990 // TODO: Should have same legality without v_perm_b32 991 getActionDefinitionsBuilder(G_BSWAP) 992 .legalFor({S32}) 993 .lowerIf(scalarNarrowerThan(0, 32)) 994 // FIXME: Fixing non-power-of-2 before clamp is workaround for 995 // narrowScalar limitation. 996 .widenScalarToNextPow2(0) 997 .maxScalar(0, S32) 998 .scalarize(0) 999 .lower(); 1000 1001 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1002 .legalFor({S32}) 1003 .minScalar(0, S32) 1004 .widenScalarToNextPow2(0) 1005 .scalarize(0) 1006 .lower(); 1007 } 1008 1009 getActionDefinitionsBuilder(G_INTTOPTR) 1010 // List the common cases 1011 .legalForCartesianProduct(AddrSpaces64, {S64}) 1012 .legalForCartesianProduct(AddrSpaces32, {S32}) 1013 .scalarize(0) 1014 // Accept any address space as long as the size matches 1015 .legalIf(sameSize(0, 1)) 1016 .widenScalarIf(smallerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }) 1020 .narrowScalarIf(largerThan(1, 0), 1021 [](const LegalityQuery &Query) { 1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1023 }); 1024 1025 getActionDefinitionsBuilder(G_PTRTOINT) 1026 // List the common cases 1027 .legalForCartesianProduct(AddrSpaces64, {S64}) 1028 .legalForCartesianProduct(AddrSpaces32, {S32}) 1029 .scalarize(0) 1030 // Accept any address space as long as the size matches 1031 .legalIf(sameSize(0, 1)) 1032 .widenScalarIf(smallerThan(0, 1), 1033 [](const LegalityQuery &Query) { 1034 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1035 }) 1036 .narrowScalarIf( 1037 largerThan(0, 1), 1038 [](const LegalityQuery &Query) { 1039 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1040 }); 1041 1042 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1043 .scalarize(0) 1044 .custom(); 1045 1046 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1047 bool IsLoad) -> bool { 1048 const LLT DstTy = Query.Types[0]; 1049 1050 // Split vector extloads. 1051 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1052 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1053 1054 if (MemSize < DstTy.getSizeInBits()) 1055 MemSize = std::max(MemSize, AlignBits); 1056 1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1058 return true; 1059 1060 const LLT PtrTy = Query.Types[1]; 1061 unsigned AS = PtrTy.getAddressSpace(); 1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1063 return true; 1064 1065 // Catch weird sized loads that don't evenly divide into the access sizes 1066 // TODO: May be able to widen depending on alignment etc. 1067 unsigned NumRegs = (MemSize + 31) / 32; 1068 if (NumRegs == 3) { 1069 if (!ST.hasDwordx3LoadStores()) 1070 return true; 1071 } else { 1072 // If the alignment allows, these should have been widened. 
1073 if (!isPowerOf2_32(NumRegs)) 1074 return true; 1075 } 1076 1077 if (AlignBits < MemSize) { 1078 const SITargetLowering *TLI = ST.getTargetLowering(); 1079 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1080 Align(AlignBits / 8)); 1081 } 1082 1083 return false; 1084 }; 1085 1086 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1087 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1088 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1089 1090 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1091 // LDS 1092 // TODO: Unsupported flat for SI. 1093 1094 for (unsigned Op : {G_LOAD, G_STORE}) { 1095 const bool IsStore = Op == G_STORE; 1096 1097 auto &Actions = getActionDefinitionsBuilder(Op); 1098 // Explicitly list some common cases. 1099 // TODO: Does this help compile time at all? 1100 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1101 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1102 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1103 {S64, GlobalPtr, S64, GlobalAlign32}, 1104 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1105 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1106 {S32, GlobalPtr, S8, GlobalAlign8}, 1107 {S32, GlobalPtr, S16, GlobalAlign16}, 1108 1109 {S32, LocalPtr, S32, 32}, 1110 {S64, LocalPtr, S64, 32}, 1111 {V2S32, LocalPtr, V2S32, 32}, 1112 {S32, LocalPtr, S8, 8}, 1113 {S32, LocalPtr, S16, 16}, 1114 {V2S16, LocalPtr, S32, 32}, 1115 1116 {S32, PrivatePtr, S32, 32}, 1117 {S32, PrivatePtr, S8, 8}, 1118 {S32, PrivatePtr, S16, 16}, 1119 {V2S16, PrivatePtr, S32, 32}, 1120 1121 {S32, ConstantPtr, S32, GlobalAlign32}, 1122 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1123 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1124 {S64, ConstantPtr, S64, GlobalAlign32}, 1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1126 Actions.legalIf( 1127 [=](const LegalityQuery &Query) -> bool { 1128 return isLoadStoreLegal(ST, Query); 1129 }); 1130 1131 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1132 // 64-bits. 1133 // 1134 // TODO: Should generalize bitcast action into coerce, which will also cover 1135 // inserting addrspacecasts. 1136 Actions.customIf(typeIs(1, Constant32Ptr)); 1137 1138 // Turn any illegal element vectors into something easier to deal 1139 // with. These will ultimately produce 32-bit scalar shifts to extract the 1140 // parts anyway. 1141 // 1142 // For odd 16-bit element vectors, prefer to split those into pieces with 1143 // 16-bit vector parts. 1144 Actions.bitcastIf( 1145 [=](const LegalityQuery &Query) -> bool { 1146 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1147 Query.MMODescrs[0].MemoryTy); 1148 }, bitcastToRegisterType(0)); 1149 1150 if (!IsStore) { 1151 // Widen suitably aligned loads by loading extra bytes. The standard 1152 // legalization actions can't properly express widening memory operands. 
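// Illustrative case: an s96 global load with known 128-bit alignment, on a
// subtarget without 96-bit loads, can be widened here to a 128-bit load;
// shouldWidenLoad() only fires when the rounded-up access is fast and the
// extra bytes are covered by the alignment.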
1153 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1154 return shouldWidenLoad(ST, Query, G_LOAD); 1155 }); 1156 } 1157 1158 // FIXME: load/store narrowing should be moved to lower action 1159 Actions 1160 .narrowScalarIf( 1161 [=](const LegalityQuery &Query) -> bool { 1162 return !Query.Types[0].isVector() && 1163 needToSplitMemOp(Query, Op == G_LOAD); 1164 }, 1165 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1166 const LLT DstTy = Query.Types[0]; 1167 const LLT PtrTy = Query.Types[1]; 1168 1169 const unsigned DstSize = DstTy.getSizeInBits(); 1170 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1171 1172 // Split extloads. 1173 if (DstSize > MemSize) 1174 return std::make_pair(0, LLT::scalar(MemSize)); 1175 1176 if (!isPowerOf2_32(DstSize)) { 1177 // We're probably decomposing an odd sized store. Try to split 1178 // to the widest type. TODO: Account for alignment. As-is it 1179 // should be OK, since the new parts will be further legalized. 1180 unsigned FloorSize = PowerOf2Floor(DstSize); 1181 return std::make_pair(0, LLT::scalar(FloorSize)); 1182 } 1183 1184 if (DstSize > 32 && (DstSize % 32 != 0)) { 1185 // FIXME: Need a way to specify non-extload of larger size if 1186 // suitably aligned. 1187 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1188 } 1189 1190 unsigned MaxSize = maxSizeForAddrSpace(ST, 1191 PtrTy.getAddressSpace(), 1192 Op == G_LOAD); 1193 if (MemSize > MaxSize) 1194 return std::make_pair(0, LLT::scalar(MaxSize)); 1195 1196 unsigned Align = Query.MMODescrs[0].AlignInBits; 1197 return std::make_pair(0, LLT::scalar(Align)); 1198 }) 1199 .fewerElementsIf( 1200 [=](const LegalityQuery &Query) -> bool { 1201 return Query.Types[0].isVector() && 1202 needToSplitMemOp(Query, Op == G_LOAD); 1203 }, 1204 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1205 const LLT DstTy = Query.Types[0]; 1206 const LLT PtrTy = Query.Types[1]; 1207 1208 LLT EltTy = DstTy.getElementType(); 1209 unsigned MaxSize = maxSizeForAddrSpace(ST, 1210 PtrTy.getAddressSpace(), 1211 Op == G_LOAD); 1212 1213 // FIXME: Handle widened to power of 2 results better. This ends 1214 // up scalarizing. 1215 // FIXME: 3 element stores scalarized on SI 1216 1217 // Split if it's too large for the address space. 1218 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1219 if (MemSize > MaxSize) { 1220 unsigned NumElts = DstTy.getNumElements(); 1221 unsigned EltSize = EltTy.getSizeInBits(); 1222 1223 if (MaxSize % EltSize == 0) { 1224 return std::make_pair( 1225 0, LLT::scalarOrVector( 1226 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1227 } 1228 1229 unsigned NumPieces = MemSize / MaxSize; 1230 1231 // FIXME: Refine when odd breakdowns handled 1232 // The scalars will need to be re-legalized. 1233 if (NumPieces == 1 || NumPieces >= NumElts || 1234 NumElts % NumPieces != 0) 1235 return std::make_pair(0, EltTy); 1236 1237 return std::make_pair( 1238 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1239 } 1240 1241 // FIXME: We could probably handle weird extending loads better. 1242 if (DstTy.getSizeInBits() > MemSize) 1243 return std::make_pair(0, EltTy); 1244 1245 unsigned EltSize = EltTy.getSizeInBits(); 1246 unsigned DstSize = DstTy.getSizeInBits(); 1247 if (!isPowerOf2_32(DstSize)) { 1248 // We're probably decomposing an odd sized store. Try to split 1249 // to the widest type. TODO: Account for alignment. As-is it 1250 // should be OK, since the new parts will be further legalized. 
1251 unsigned FloorSize = PowerOf2Floor(DstSize); 1252 return std::make_pair( 1253 0, LLT::scalarOrVector( 1254 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1255 } 1256 1257 // Need to split because of alignment. 1258 unsigned Align = Query.MMODescrs[0].AlignInBits; 1259 if (EltSize > Align && 1260 (EltSize / Align < DstTy.getNumElements())) { 1261 return std::make_pair( 1262 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1263 } 1264 1265 // May need relegalization for the scalars. 1266 return std::make_pair(0, EltTy); 1267 }) 1268 .minScalar(0, S32) 1269 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1270 .widenScalarToNextPow2(0) 1271 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1272 .lower(); 1273 } 1274 1275 // FIXME: Unaligned accesses not lowered. 1276 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1277 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1278 {S32, GlobalPtr, S16, 2 * 8}, 1279 {S32, LocalPtr, S8, 8}, 1280 {S32, LocalPtr, S16, 16}, 1281 {S32, PrivatePtr, S8, 8}, 1282 {S32, PrivatePtr, S16, 16}, 1283 {S32, ConstantPtr, S8, 8}, 1284 {S32, ConstantPtr, S16, 2 * 8}}) 1285 .legalIf( 1286 [=](const LegalityQuery &Query) -> bool { 1287 return isLoadStoreLegal(ST, Query); 1288 }); 1289 1290 if (ST.hasFlatAddressSpace()) { 1291 ExtLoads.legalForTypesWithMemDesc( 1292 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1293 } 1294 1295 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1296 // 64-bits. 1297 // 1298 // TODO: Should generalize bitcast action into coerce, which will also cover 1299 // inserting addrspacecasts. 1300 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1301 1302 ExtLoads.clampScalar(0, S32, S32) 1303 .widenScalarToNextPow2(0) 1304 .lower(); 1305 1306 auto &Atomics = getActionDefinitionsBuilder( 1307 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1308 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1309 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1310 G_ATOMICRMW_UMIN}) 1311 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1312 {S64, GlobalPtr}, {S64, LocalPtr}, 1313 {S32, RegionPtr}, {S64, RegionPtr}}); 1314 if (ST.hasFlatAddressSpace()) { 1315 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1316 } 1317 1318 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1319 if (ST.hasLDSFPAtomics()) { 1320 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1321 if (ST.hasGFX90AInsts()) 1322 Atomic.legalFor({{S64, LocalPtr}}); 1323 } 1324 if (ST.hasAtomicFaddInsts()) 1325 Atomic.legalFor({{S32, GlobalPtr}}); 1326 1327 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1328 // demarshalling 1329 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1330 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1331 {S32, FlatPtr}, {S64, FlatPtr}}) 1332 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1333 {S32, RegionPtr}, {S64, RegionPtr}}); 1334 // TODO: Pointer types, any 32-bit or 64-bit vector 1335 1336 // Condition should be s32 for scalar, s1 for vector. 
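// i.e. both {result, s1} and {result, s32} condition forms are listed as legal
// below; RegBankSelect later decides whether the condition lives in VCC (s1)
// or comes from an SCC-producing scalar compare (s32).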
1337 getActionDefinitionsBuilder(G_SELECT) 1338 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1339 LocalPtr, FlatPtr, PrivatePtr, 1340 LLT::fixed_vector(2, LocalPtr), 1341 LLT::fixed_vector(2, PrivatePtr)}, 1342 {S1, S32}) 1343 .clampScalar(0, S16, S64) 1344 .scalarize(1) 1345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1346 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1347 .clampMaxNumElements(0, S32, 2) 1348 .clampMaxNumElements(0, LocalPtr, 2) 1349 .clampMaxNumElements(0, PrivatePtr, 2) 1350 .scalarize(0) 1351 .widenScalarToNextPow2(0) 1352 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1353 1354 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1355 // be more flexible with the shift amount type. 1356 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1357 .legalFor({{S32, S32}, {S64, S32}}); 1358 if (ST.has16BitInsts()) { 1359 if (ST.hasVOP3PInsts()) { 1360 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1361 .clampMaxNumElements(0, S16, 2); 1362 } else 1363 Shifts.legalFor({{S16, S16}}); 1364 1365 // TODO: Support 16-bit shift amounts for all types 1366 Shifts.widenScalarIf( 1367 [=](const LegalityQuery &Query) { 1368 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1369 // 32-bit amount. 1370 const LLT ValTy = Query.Types[0]; 1371 const LLT AmountTy = Query.Types[1]; 1372 return ValTy.getSizeInBits() <= 16 && 1373 AmountTy.getSizeInBits() < 16; 1374 }, changeTo(1, S16)); 1375 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1376 Shifts.clampScalar(1, S32, S32); 1377 Shifts.clampScalar(0, S16, S64); 1378 Shifts.widenScalarToNextPow2(0, 16); 1379 1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1381 .minScalar(0, S16) 1382 .scalarize(0) 1383 .lower(); 1384 } else { 1385 // Make sure we legalize the shift amount type first, as the general 1386 // expansion for the shifted type will produce much worse code if it hasn't 1387 // been truncated already. 1388 Shifts.clampScalar(1, S32, S32); 1389 Shifts.clampScalar(0, S32, S64); 1390 Shifts.widenScalarToNextPow2(0, 32); 1391 1392 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1393 .minScalar(0, S32) 1394 .scalarize(0) 1395 .lower(); 1396 } 1397 Shifts.scalarize(0); 1398 1399 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1400 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1401 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1402 unsigned IdxTypeIdx = 2; 1403 1404 getActionDefinitionsBuilder(Op) 1405 .customIf([=](const LegalityQuery &Query) { 1406 const LLT EltTy = Query.Types[EltTypeIdx]; 1407 const LLT VecTy = Query.Types[VecTypeIdx]; 1408 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1409 const unsigned EltSize = EltTy.getSizeInBits(); 1410 return (EltSize == 32 || EltSize == 64) && 1411 VecTy.getSizeInBits() % 32 == 0 && 1412 VecTy.getSizeInBits() <= MaxRegisterSize && 1413 IdxTy.getSizeInBits() == 32; 1414 }) 1415 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1416 bitcastToVectorElement32(VecTypeIdx)) 1417 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1418 .bitcastIf( 1419 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1420 [=](const LegalityQuery &Query) { 1421 // For > 64-bit element types, try to turn this into a 64-bit 1422 // element vector since we may be able to do better indexing 1423 // if this is scalar. If not, fall back to 32. 
1424 const LLT EltTy = Query.Types[EltTypeIdx];
1425 const LLT VecTy = Query.Types[VecTypeIdx];
1426 const unsigned DstEltSize = EltTy.getSizeInBits();
1427 const unsigned VecSize = VecTy.getSizeInBits();
1428
1429 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430 return std::make_pair(
1431 VecTypeIdx,
1432 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1433 })
1434 .clampScalar(EltTypeIdx, S32, S64)
1435 .clampScalar(VecTypeIdx, S32, S64)
1436 .clampScalar(IdxTypeIdx, S32, S32)
1437 .clampMaxNumElements(VecTypeIdx, S32, 32)
1438 // TODO: Clamp elements for 64-bit vectors?
1439 // It should only be necessary with variable indexes.
1440 // As a last resort, lower to the stack
1441 .lower();
1442 }
1443
1444 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445 .unsupportedIf([=](const LegalityQuery &Query) {
1446 const LLT &EltTy = Query.Types[1].getElementType();
1447 return Query.Types[0] != EltTy;
1448 });
1449
1450 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1453
1454 // FIXME: Doesn't handle extract of illegal sizes.
1455 getActionDefinitionsBuilder(Op)
1456 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457 // FIXME: Multiples of 16 should not be legal.
1458 .legalIf([=](const LegalityQuery &Query) {
1459 const LLT BigTy = Query.Types[BigTyIdx];
1460 const LLT LitTy = Query.Types[LitTyIdx];
1461 return (BigTy.getSizeInBits() % 32 == 0) &&
1462 (LitTy.getSizeInBits() % 16 == 0);
1463 })
1464 .widenScalarIf(
1465 [=](const LegalityQuery &Query) {
1466 const LLT BigTy = Query.Types[BigTyIdx];
1467 return (BigTy.getScalarSizeInBits() < 16);
1468 },
1469 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470 .widenScalarIf(
1471 [=](const LegalityQuery &Query) {
1472 const LLT LitTy = Query.Types[LitTyIdx];
1473 return (LitTy.getScalarSizeInBits() < 16);
1474 },
1475 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477 .widenScalarToNextPow2(BigTyIdx, 32);
1478
1479 }
1480
1481 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482 .legalForCartesianProduct(AllS32Vectors, {S32})
1483 .legalForCartesianProduct(AllS64Vectors, {S64})
1484 .clampNumElements(0, V16S32, V32S32)
1485 .clampNumElements(0, V2S64, V16S64)
1486 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1487
1488 if (ST.hasScalarPackInsts()) {
1489 BuildVector
1490 // FIXME: Should probably widen s1 vectors straight to s32
1491 .minScalarOrElt(0, S16)
1492 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493 .minScalar(1, S32);
1494
1495 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496 .legalFor({V2S16, S32})
1497 .lower();
1498 BuildVector.minScalarOrElt(0, S32);
1499 } else {
1500 BuildVector.customFor({V2S16, S16});
1501 BuildVector.minScalarOrElt(0, S32);
1502
1503 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504 .customFor({V2S16, S32})
1505 .lower();
1506 }
1507
1508 BuildVector.legalIf(isRegisterType(0));
1509
1510 // FIXME: Clamp maximum size
1511 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513 .clampMaxNumElements(0, S32, 32)
1514 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515 .clampMaxNumElements(0, S16, 64);
1516
1517 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1518 // pre-legalize.
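// With packed 16-bit instructions a v2s16 shuffle of v2s16 inputs is kept and
// handled by the custom lowering (legalizeShuffleVector below); all other
// shuffles go through the generic lowering.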
1519 if (ST.hasVOP3PInsts()) { 1520 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1521 .customFor({V2S16, V2S16}) 1522 .lower(); 1523 } else 1524 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1525 1526 // Merge/Unmerge 1527 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1528 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1529 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1530 1531 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1532 const LLT Ty = Query.Types[TypeIdx]; 1533 if (Ty.isVector()) { 1534 const LLT &EltTy = Ty.getElementType(); 1535 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1536 return true; 1537 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1538 return true; 1539 } 1540 return false; 1541 }; 1542 1543 auto &Builder = getActionDefinitionsBuilder(Op) 1544 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1545 .lowerFor({{S16, V2S16}}) 1546 .lowerIf([=](const LegalityQuery &Query) { 1547 const LLT BigTy = Query.Types[BigTyIdx]; 1548 return BigTy.getSizeInBits() == 32; 1549 }) 1550 // Try to widen to s16 first for small types. 1551 // TODO: Only do this on targets with legal s16 shifts 1552 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1554 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1555 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1556 elementTypeIs(1, S16)), 1557 changeTo(1, V2S16)) 1558 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1559 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1560 // valid. 1561 .clampScalar(LitTyIdx, S32, S512) 1562 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1563 // Break up vectors with weird elements into scalars 1564 .fewerElementsIf( 1565 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1566 scalarize(0)) 1567 .fewerElementsIf( 1568 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1569 scalarize(1)) 1570 .clampScalar(BigTyIdx, S32, MaxScalar); 1571 1572 if (Op == G_MERGE_VALUES) { 1573 Builder.widenScalarIf( 1574 // TODO: Use 16-bit shifts if legal for 8-bit values? 1575 [=](const LegalityQuery &Query) { 1576 const LLT Ty = Query.Types[LitTyIdx]; 1577 return Ty.getSizeInBits() < 32; 1578 }, 1579 changeTo(LitTyIdx, S32)); 1580 } 1581 1582 Builder.widenScalarIf( 1583 [=](const LegalityQuery &Query) { 1584 const LLT Ty = Query.Types[BigTyIdx]; 1585 return !isPowerOf2_32(Ty.getSizeInBits()) && 1586 Ty.getSizeInBits() % 16 != 0; 1587 }, 1588 [=](const LegalityQuery &Query) { 1589 // Pick the next power of 2, or a multiple of 64 over 128. 1590 // Whichever is smaller. 1591 const LLT &Ty = Query.Types[BigTyIdx]; 1592 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1593 if (NewSizeInBits >= 256) { 1594 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1595 if (RoundedTo < NewSizeInBits) 1596 NewSizeInBits = RoundedTo; 1597 } 1598 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1599 }) 1600 // Any vectors left are the wrong size. Scalarize them. 1601 .scalarize(0) 1602 .scalarize(1); 1603 } 1604 1605 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1606 // RegBankSelect. 
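// e.g. a G_SEXT_INREG of an s64 is fine as a single scalar bitfield-extract on
// the SALU, but there is no 64-bit VALU equivalent, so divergent cases are
// split into 32-bit pieces during RegBankSelect.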
1607 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608 .legalFor({{S32}, {S64}});
1609
1610 if (ST.hasVOP3PInsts()) {
1611 SextInReg.lowerFor({{V2S16}})
1612 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613 // get more vector shift opportunities, since we'll get those when
1614 // expanded.
1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616 } else if (ST.has16BitInsts()) {
1617 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618 } else {
1619 // Prefer to promote to s32 before lowering if we don't have 16-bit
1620 // shifts. This avoids a lot of intermediate truncate and extend operations.
1621 SextInReg.lowerFor({{S32}, {S64}});
1622 }
1623
1624 SextInReg
1625 .scalarize(0)
1626 .clampScalar(0, S32, S64)
1627 .lower();
1628
1629 // TODO: Only try to form v2s16 with legal packed instructions.
1630 getActionDefinitionsBuilder(G_FSHR)
1631 .legalFor({{S32, S32}})
1632 .lowerFor({{V2S16, V2S16}})
1633 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634 .scalarize(0)
1635 .lower();
1636
1637 if (ST.hasVOP3PInsts()) {
1638 getActionDefinitionsBuilder(G_FSHL)
1639 .lowerFor({{V2S16, V2S16}})
1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641 .scalarize(0)
1642 .lower();
1643 } else {
1644 getActionDefinitionsBuilder(G_FSHL)
1645 .scalarize(0)
1646 .lower();
1647 }
1648
1649 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650 .legalFor({S64});
1651
1652 getActionDefinitionsBuilder(G_FENCE)
1653 .alwaysLegal();
1654
1655 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656 .scalarize(0)
1657 .minScalar(0, S32)
1658 .lower();
1659
1660 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661 .legalFor({{S32, S32}, {S64, S32}})
1662 .clampScalar(1, S32, S32)
1663 .clampScalar(0, S32, S64)
1664 .widenScalarToNextPow2(0)
1665 .scalarize(0);
1666
1667 getActionDefinitionsBuilder({
1668 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669 G_FCOPYSIGN,
1670
1671 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672 G_ATOMICRMW_NAND,
1673 G_ATOMICRMW_FSUB,
1674 G_READ_REGISTER,
1675 G_WRITE_REGISTER,
1676
1677 G_SADDO, G_SSUBO,
1678
1679 // TODO: Implement
1680 G_FMINIMUM, G_FMAXIMUM}).lower();
1681
1682 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685 .unsupported();
1686
1687 getLegacyLegalizerInfo().computeTables();
1688 verify(*ST.getInstrInfo());
1689 }
1690
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692 MachineInstr &MI) const {
1693 MachineIRBuilder &B = Helper.MIRBuilder;
1694 MachineRegisterInfo &MRI = *B.getMRI();
1695
1696 switch (MI.getOpcode()) {
1697 case TargetOpcode::G_ADDRSPACE_CAST:
1698 return legalizeAddrSpaceCast(MI, MRI, B);
1699 case TargetOpcode::G_FRINT:
1700 return legalizeFrint(MI, MRI, B);
1701 case TargetOpcode::G_FCEIL:
1702 return legalizeFceil(MI, MRI, B);
1703 case TargetOpcode::G_FREM:
1704 return legalizeFrem(MI, MRI, B);
1705 case TargetOpcode::G_INTRINSIC_TRUNC:
1706 return legalizeIntrinsicTrunc(MI, MRI, B);
1707 case TargetOpcode::G_SITOFP:
1708 return legalizeITOFP(MI, MRI, B, true);
1709 case TargetOpcode::G_UITOFP:
1710 return legalizeITOFP(MI, MRI, B, false);
1711 case TargetOpcode::G_FPTOSI:
1712 return legalizeFPTOI(MI, MRI, B, true);
1713 case TargetOpcode::G_FPTOUI:
1714 return legalizeFPTOI(MI, MRI, B, false);
1715 case TargetOpcode::G_FMINNUM:
1716 case TargetOpcode::G_FMAXNUM:
1717 case TargetOpcode::G_FMINNUM_IEEE:
1718 case
TargetOpcode::G_FMAXNUM_IEEE: 1719 return legalizeMinNumMaxNum(Helper, MI); 1720 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1721 return legalizeExtractVectorElt(MI, MRI, B); 1722 case TargetOpcode::G_INSERT_VECTOR_ELT: 1723 return legalizeInsertVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_SHUFFLE_VECTOR: 1725 return legalizeShuffleVector(MI, MRI, B); 1726 case TargetOpcode::G_FSIN: 1727 case TargetOpcode::G_FCOS: 1728 return legalizeSinCos(MI, MRI, B); 1729 case TargetOpcode::G_GLOBAL_VALUE: 1730 return legalizeGlobalValue(MI, MRI, B); 1731 case TargetOpcode::G_LOAD: 1732 case TargetOpcode::G_SEXTLOAD: 1733 case TargetOpcode::G_ZEXTLOAD: 1734 return legalizeLoad(Helper, MI); 1735 case TargetOpcode::G_FMAD: 1736 return legalizeFMad(MI, MRI, B); 1737 case TargetOpcode::G_FDIV: 1738 return legalizeFDIV(MI, MRI, B); 1739 case TargetOpcode::G_UDIV: 1740 case TargetOpcode::G_UREM: 1741 case TargetOpcode::G_UDIVREM: 1742 return legalizeUnsignedDIV_REM(MI, MRI, B); 1743 case TargetOpcode::G_SDIV: 1744 case TargetOpcode::G_SREM: 1745 case TargetOpcode::G_SDIVREM: 1746 return legalizeSignedDIV_REM(MI, MRI, B); 1747 case TargetOpcode::G_ATOMIC_CMPXCHG: 1748 return legalizeAtomicCmpXChg(MI, MRI, B); 1749 case TargetOpcode::G_FLOG: 1750 return legalizeFlog(MI, B, numbers::ln2f); 1751 case TargetOpcode::G_FLOG10: 1752 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1753 case TargetOpcode::G_FEXP: 1754 return legalizeFExp(MI, B); 1755 case TargetOpcode::G_FPOW: 1756 return legalizeFPow(MI, B); 1757 case TargetOpcode::G_FFLOOR: 1758 return legalizeFFloor(MI, MRI, B); 1759 case TargetOpcode::G_BUILD_VECTOR: 1760 return legalizeBuildVector(MI, MRI, B); 1761 case TargetOpcode::G_CTLZ: 1762 case TargetOpcode::G_CTTZ: 1763 return legalizeCTLZ_CTTZ(MI, MRI, B); 1764 default: 1765 return false; 1766 } 1767 1768 llvm_unreachable("expected switch to return"); 1769 } 1770 1771 Register AMDGPULegalizerInfo::getSegmentAperture( 1772 unsigned AS, 1773 MachineRegisterInfo &MRI, 1774 MachineIRBuilder &B) const { 1775 MachineFunction &MF = B.getMF(); 1776 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1777 const LLT S32 = LLT::scalar(32); 1778 1779 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1780 1781 if (ST.hasApertureRegs()) { 1782 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1783 // getreg. 1784 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1785 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1786 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1787 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1788 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1789 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1790 unsigned Encoding = 1791 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1792 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1793 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1794 1795 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1796 1797 B.buildInstr(AMDGPU::S_GETREG_B32) 1798 .addDef(GetReg) 1799 .addImm(Encoding); 1800 MRI.setType(GetReg, S32); 1801 1802 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1803 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1804 } 1805 1806 Register QueuePtr = MRI.createGenericVirtualRegister( 1807 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1808 1809 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1810 return Register(); 1811 1812 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1813 // private_segment_aperture_base_hi. 
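  // Rough sketch of how the loaded aperture value is used (see
  // legalizeAddrSpaceCast below); the concrete value is provided by the
  // runtime and is only illustrative here:
  //
  //   aperture  = *(uint32_t *)(queue_ptr + 0x40 /* or 0x44 */);
  //   flat_addr = ((uint64_t)aperture << 32) | (uint32_t)segment_offset;
  //
  // i.e. the aperture becomes the high 32 bits of the flat pointer and the
  // original LDS/private offset supplies the low 32 bits.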
1814 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1815 1816 // TODO: can we be smarter about machine pointer info? 1817 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1818 MachineMemOperand *MMO = MF.getMachineMemOperand( 1819 PtrInfo, 1820 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1821 MachineMemOperand::MOInvariant, 1822 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1823 1824 Register LoadAddr; 1825 1826 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1827 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1828 } 1829 1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1831 MachineInstr &MI, MachineRegisterInfo &MRI, 1832 MachineIRBuilder &B) const { 1833 MachineFunction &MF = B.getMF(); 1834 1835 const LLT S32 = LLT::scalar(32); 1836 Register Dst = MI.getOperand(0).getReg(); 1837 Register Src = MI.getOperand(1).getReg(); 1838 1839 LLT DstTy = MRI.getType(Dst); 1840 LLT SrcTy = MRI.getType(Src); 1841 unsigned DestAS = DstTy.getAddressSpace(); 1842 unsigned SrcAS = SrcTy.getAddressSpace(); 1843 1844 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1845 // vector element. 1846 assert(!DstTy.isVector()); 1847 1848 const AMDGPUTargetMachine &TM 1849 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1850 1851 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1852 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1853 return true; 1854 } 1855 1856 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1857 // Truncate. 1858 B.buildExtract(Dst, Src, 0); 1859 MI.eraseFromParent(); 1860 return true; 1861 } 1862 1863 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1864 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1865 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1866 1867 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1868 // another. Merge operands are required to be the same type, but creating an 1869 // extra ptrtoint would be kind of pointless. 1870 auto HighAddr = B.buildConstant( 1871 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1872 B.buildMerge(Dst, {Src, HighAddr}); 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1878 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1879 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1880 unsigned NullVal = TM.getNullPointerValue(DestAS); 1881 1882 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1883 auto FlatNull = B.buildConstant(SrcTy, 0); 1884 1885 // Extract low 32-bits of the pointer. 1886 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1887 1888 auto CmpRes = 1889 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1890 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1891 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1897 return false; 1898 1899 if (!ST.hasFlatAddressSpace()) 1900 return false; 1901 1902 auto SegmentNull = 1903 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1904 auto FlatNull = 1905 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1906 1907 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1908 if (!ApertureReg.isValid()) 1909 return false; 1910 1911 auto CmpRes = 1912 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1913 1914 // Coerce the type of the low half of the result so we can use merge_values. 
1915 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1916 1917 // TODO: Should we allow mismatched types but matching sizes in merges to 1918 // avoid the ptrtoint? 1919 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1920 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1921 1922 MI.eraseFromParent(); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeFrint( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 Register Src = MI.getOperand(1).getReg(); 1930 LLT Ty = MRI.getType(Src); 1931 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1932 1933 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1934 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1935 1936 auto C1 = B.buildFConstant(Ty, C1Val); 1937 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1938 1939 // TODO: Should this propagate fast-math-flags? 1940 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1941 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1942 1943 auto C2 = B.buildFConstant(Ty, C2Val); 1944 auto Fabs = B.buildFAbs(Ty, Src); 1945 1946 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1947 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1948 MI.eraseFromParent(); 1949 return true; 1950 } 1951 1952 bool AMDGPULegalizerInfo::legalizeFceil( 1953 MachineInstr &MI, MachineRegisterInfo &MRI, 1954 MachineIRBuilder &B) const { 1955 1956 const LLT S1 = LLT::scalar(1); 1957 const LLT S64 = LLT::scalar(64); 1958 1959 Register Src = MI.getOperand(1).getReg(); 1960 assert(MRI.getType(Src) == S64); 1961 1962 // result = trunc(src) 1963 // if (src > 0.0 && src != result) 1964 // result += 1.0 1965 1966 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1967 1968 const auto Zero = B.buildFConstant(S64, 0.0); 1969 const auto One = B.buildFConstant(S64, 1.0); 1970 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1971 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1972 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1973 auto Add = B.buildSelect(S64, And, One, Zero); 1974 1975 // TODO: Should this propagate fast-math-flags? 
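  // Worked example of the expansion above (values illustrative):
  //   src =  2.3: trunc =  2.0, src > 0 && src != trunc -> Add = 1.0 ->  3.0
  //   src = -2.3: trunc = -2.0, src is not > 0          -> Add = 0.0 -> -2.0
  // The fadd below applies that correction to the truncated value.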
1976 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1977 return true; 1978 } 1979 1980 bool AMDGPULegalizerInfo::legalizeFrem( 1981 MachineInstr &MI, MachineRegisterInfo &MRI, 1982 MachineIRBuilder &B) const { 1983 Register DstReg = MI.getOperand(0).getReg(); 1984 Register Src0Reg = MI.getOperand(1).getReg(); 1985 Register Src1Reg = MI.getOperand(2).getReg(); 1986 auto Flags = MI.getFlags(); 1987 LLT Ty = MRI.getType(DstReg); 1988 1989 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1990 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1991 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1992 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1993 MI.eraseFromParent(); 1994 return true; 1995 } 1996 1997 static MachineInstrBuilder extractF64Exponent(Register Hi, 1998 MachineIRBuilder &B) { 1999 const unsigned FractBits = 52; 2000 const unsigned ExpBits = 11; 2001 LLT S32 = LLT::scalar(32); 2002 2003 auto Const0 = B.buildConstant(S32, FractBits - 32); 2004 auto Const1 = B.buildConstant(S32, ExpBits); 2005 2006 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2007 .addUse(Hi) 2008 .addUse(Const0.getReg(0)) 2009 .addUse(Const1.getReg(0)); 2010 2011 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2012 } 2013 2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2015 MachineInstr &MI, MachineRegisterInfo &MRI, 2016 MachineIRBuilder &B) const { 2017 const LLT S1 = LLT::scalar(1); 2018 const LLT S32 = LLT::scalar(32); 2019 const LLT S64 = LLT::scalar(64); 2020 2021 Register Src = MI.getOperand(1).getReg(); 2022 assert(MRI.getType(Src) == S64); 2023 2024 // TODO: Should this use extract since the low half is unused? 2025 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2026 Register Hi = Unmerge.getReg(1); 2027 2028 // Extract the upper half, since this is where we will find the sign and 2029 // exponent. 2030 auto Exp = extractF64Exponent(Hi, B); 2031 2032 const unsigned FractBits = 52; 2033 2034 // Extract the sign bit. 2035 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2036 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2037 2038 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2039 2040 const auto Zero32 = B.buildConstant(S32, 0); 2041 2042 // Extend back to 64-bits. 2043 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2044 2045 auto Shr = B.buildAShr(S64, FractMask, Exp); 2046 auto Not = B.buildNot(S64, Shr); 2047 auto Tmp0 = B.buildAnd(S64, Src, Not); 2048 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2049 2050 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2051 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2052 2053 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2054 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2055 MI.eraseFromParent(); 2056 return true; 2057 } 2058 2059 bool AMDGPULegalizerInfo::legalizeITOFP( 2060 MachineInstr &MI, MachineRegisterInfo &MRI, 2061 MachineIRBuilder &B, bool Signed) const { 2062 2063 Register Dst = MI.getOperand(0).getReg(); 2064 Register Src = MI.getOperand(1).getReg(); 2065 2066 const LLT S64 = LLT::scalar(64); 2067 const LLT S32 = LLT::scalar(32); 2068 2069 assert(MRI.getType(Src) == S64); 2070 2071 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2072 auto ThirtyTwo = B.buildConstant(S32, 32); 2073 2074 if (MRI.getType(Dst) == S64) { 2075 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2076 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2077 2078 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2080 .addUse(CvtHi.getReg(0)) 2081 .addUse(ThirtyTwo.getReg(0)); 2082 2083 // TODO: Should this propagate fast-math-flags? 2084 B.buildFAdd(Dst, LdExp, CvtLo); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 assert(MRI.getType(Dst) == S32); 2090 2091 auto One = B.buildConstant(S32, 1); 2092 2093 MachineInstrBuilder ShAmt; 2094 if (Signed) { 2095 auto ThirtyOne = B.buildConstant(S32, 31); 2096 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2097 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2098 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2099 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2100 /*HasSideEffects=*/false) 2101 .addUse(Unmerge.getReg(1)); 2102 auto LS2 = B.buildSub(S32, LS, One); 2103 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2104 } else 2105 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2106 auto Norm = B.buildShl(S64, Src, ShAmt); 2107 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2108 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2109 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2110 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2111 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2112 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2113 /*HasSideEffects=*/false) 2114 .addUse(FVal.getReg(0)) 2115 .addUse(Scale.getReg(0)); 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 // TODO: Copied from DAG implementation. Verify logic and document how this 2121 // actually works. 2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2123 MachineRegisterInfo &MRI, 2124 MachineIRBuilder &B, 2125 bool Signed) const { 2126 2127 Register Dst = MI.getOperand(0).getReg(); 2128 Register Src = MI.getOperand(1).getReg(); 2129 2130 const LLT S64 = LLT::scalar(64); 2131 const LLT S32 = LLT::scalar(32); 2132 2133 const LLT SrcLT = MRI.getType(Src); 2134 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2135 2136 unsigned Flags = MI.getFlags(); 2137 2138 // The basic idea of converting a floating point number into a pair of 32-bit 2139 // integers is illustrated as follows: 2140 // 2141 // tf := trunc(val); 2142 // hif := floor(tf * 2^-32); 2143 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2144 // hi := fptoi(hif); 2145 // lo := fptoi(lof); 2146 // 2147 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2148 MachineInstrBuilder Sign; 2149 if (Signed && SrcLT == S32) { 2150 // However, a 32-bit floating point number has only 23 bits mantissa and 2151 // it's not enough to hold all the significant bits of `lof` if val is 2152 // negative. To avoid the loss of precision, We need to take the absolute 2153 // value after truncating and flip the result back based on the original 2154 // signedness. 
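    // Illustrative walk-through for val = -5.0f: the truncated value is made
    // positive (5.0), which splits into {lo = 5, hi = 0}; Sign below becomes
    // all ones, and the final xor/sub computes
    //   ({lo, hi} ^ Sign) - Sign == ~5 + 1 == -5
    // recovering the correctly signed 64-bit result.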
2155 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2156 Trunc = B.buildFAbs(S32, Trunc, Flags); 2157 } 2158 MachineInstrBuilder K0, K1; 2159 if (SrcLT == S64) { 2160 K0 = B.buildFConstant(S64, 2161 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2162 K1 = B.buildFConstant(S64, 2163 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2164 } else { 2165 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2166 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2167 } 2168 2169 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2170 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2171 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2172 2173 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2174 : B.buildFPTOUI(S32, FloorMul); 2175 auto Lo = B.buildFPTOUI(S32, Fma); 2176 2177 if (Signed && SrcLT == S32) { 2178 // Flip the result based on the signedness, which is either all 0s or 1s. 2179 Sign = B.buildMerge(S64, {Sign, Sign}); 2180 // r := xor({lo, hi}, sign) - sign; 2181 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2182 } else 2183 B.buildMerge(Dst, {Lo, Hi}); 2184 MI.eraseFromParent(); 2185 2186 return true; 2187 } 2188 2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2190 MachineInstr &MI) const { 2191 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2192 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2193 2194 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2195 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2196 2197 // With ieee_mode disabled, the instructions have the correct behavior 2198 // already for G_FMINNUM/G_FMAXNUM 2199 if (!MFI->getMode().IEEE) 2200 return !IsIEEEOp; 2201 2202 if (IsIEEEOp) 2203 return true; 2204 2205 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2206 } 2207 2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2209 MachineInstr &MI, MachineRegisterInfo &MRI, 2210 MachineIRBuilder &B) const { 2211 // TODO: Should move some of this into LegalizerHelper. 2212 2213 // TODO: Promote dynamic indexing of s16 to s32 2214 2215 // FIXME: Artifact combiner probably should have replaced the truncated 2216 // constant before this, so we shouldn't need 2217 // getConstantVRegValWithLookThrough. 2218 Optional<ValueAndVReg> MaybeIdxVal = 2219 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2220 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2221 return true; 2222 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2223 2224 Register Dst = MI.getOperand(0).getReg(); 2225 Register Vec = MI.getOperand(1).getReg(); 2226 2227 LLT VecTy = MRI.getType(Vec); 2228 LLT EltTy = VecTy.getElementType(); 2229 assert(EltTy == MRI.getType(Dst)); 2230 2231 if (IdxVal < VecTy.getNumElements()) 2232 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2233 else 2234 B.buildUndef(Dst); 2235 2236 MI.eraseFromParent(); 2237 return true; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2241 MachineInstr &MI, MachineRegisterInfo &MRI, 2242 MachineIRBuilder &B) const { 2243 // TODO: Should move some of this into LegalizerHelper. 2244 2245 // TODO: Promote dynamic indexing of s16 to s32 2246 2247 // FIXME: Artifact combiner probably should have replaced the truncated 2248 // constant before this, so we shouldn't need 2249 // getConstantVRegValWithLookThrough. 
2250 Optional<ValueAndVReg> MaybeIdxVal = 2251 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2252 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2253 return true; 2254 2255 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2256 Register Dst = MI.getOperand(0).getReg(); 2257 Register Vec = MI.getOperand(1).getReg(); 2258 Register Ins = MI.getOperand(2).getReg(); 2259 2260 LLT VecTy = MRI.getType(Vec); 2261 LLT EltTy = VecTy.getElementType(); 2262 assert(EltTy == MRI.getType(Ins)); 2263 2264 if (IdxVal < VecTy.getNumElements()) 2265 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2266 else 2267 B.buildUndef(Dst); 2268 2269 MI.eraseFromParent(); 2270 return true; 2271 } 2272 2273 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2274 MachineInstr &MI, MachineRegisterInfo &MRI, 2275 MachineIRBuilder &B) const { 2276 const LLT V2S16 = LLT::fixed_vector(2, 16); 2277 2278 Register Dst = MI.getOperand(0).getReg(); 2279 Register Src0 = MI.getOperand(1).getReg(); 2280 LLT DstTy = MRI.getType(Dst); 2281 LLT SrcTy = MRI.getType(Src0); 2282 2283 if (SrcTy == V2S16 && DstTy == V2S16 && 2284 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2285 return true; 2286 2287 MachineIRBuilder HelperBuilder(MI); 2288 GISelObserverWrapper DummyObserver; 2289 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2290 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeSinCos( 2294 MachineInstr &MI, MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 2297 Register DstReg = MI.getOperand(0).getReg(); 2298 Register SrcReg = MI.getOperand(1).getReg(); 2299 LLT Ty = MRI.getType(DstReg); 2300 unsigned Flags = MI.getFlags(); 2301 2302 Register TrigVal; 2303 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2304 if (ST.hasTrigReducedRange()) { 2305 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2306 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2307 .addUse(MulVal.getReg(0)) 2308 .setMIFlags(Flags).getReg(0); 2309 } else 2310 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2311 2312 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2313 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2314 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2315 .addUse(TrigVal) 2316 .setMIFlags(Flags); 2317 MI.eraseFromParent(); 2318 return true; 2319 } 2320 2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2322 MachineIRBuilder &B, 2323 const GlobalValue *GV, 2324 int64_t Offset, 2325 unsigned GAFlags) const { 2326 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2327 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2328 // to the following code sequence: 2329 // 2330 // For constant address space: 2331 // s_getpc_b64 s[0:1] 2332 // s_add_u32 s0, s0, $symbol 2333 // s_addc_u32 s1, s1, 0 2334 // 2335 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2336 // a fixup or relocation is emitted to replace $symbol with a literal 2337 // constant, which is a pc-relative offset from the encoding of the $symbol 2338 // operand to the global variable. 
2339 // 2340 // For global address space: 2341 // s_getpc_b64 s[0:1] 2342 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2343 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2344 // 2345 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2346 // fixups or relocations are emitted to replace $symbol@*@lo and 2347 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2348 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2349 // operand to the global variable. 2350 // 2351 // What we want here is an offset from the value returned by s_getpc 2352 // (which is the address of the s_add_u32 instruction) to the global 2353 // variable, but since the encoding of $symbol starts 4 bytes after the start 2354 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2355 // small. This requires us to add 4 to the global variable offset in order to 2356 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2357 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2358 // instruction. 2359 2360 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2361 2362 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2363 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2364 2365 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2366 .addDef(PCReg); 2367 2368 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2369 if (GAFlags == SIInstrInfo::MO_NONE) 2370 MIB.addImm(0); 2371 else 2372 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2373 2374 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2375 2376 if (PtrTy.getSizeInBits() == 32) 2377 B.buildExtract(DstReg, PCReg, 0); 2378 return true; 2379 } 2380 2381 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2382 MachineInstr &MI, MachineRegisterInfo &MRI, 2383 MachineIRBuilder &B) const { 2384 Register DstReg = MI.getOperand(0).getReg(); 2385 LLT Ty = MRI.getType(DstReg); 2386 unsigned AS = Ty.getAddressSpace(); 2387 2388 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2389 MachineFunction &MF = B.getMF(); 2390 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2391 2392 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2393 if (!MFI->isModuleEntryFunction() && 2394 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2395 const Function &Fn = MF.getFunction(); 2396 DiagnosticInfoUnsupported BadLDSDecl( 2397 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2398 DS_Warning); 2399 Fn.getContext().diagnose(BadLDSDecl); 2400 2401 // We currently don't have a way to correctly allocate LDS objects that 2402 // aren't directly associated with a kernel. We do force inlining of 2403 // functions that use local objects. However, if these dead functions are 2404 // not eliminated, we don't want a compile time error. Just emit a warning 2405 // and a trap, since there should be no callable path here. 2406 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2407 B.buildUndef(DstReg); 2408 MI.eraseFromParent(); 2409 return true; 2410 } 2411 2412 // TODO: We could emit code to handle the initialization somewhere. 
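    // For example (IR illustrative only), a statically sized LDS variable
    //   @lds.arr = internal addrspace(3) global [64 x i32] undef, align 4
    // is handled below by either keeping the G_GLOBAL_VALUE with an absolute
    // 32-bit relocation, or folding it to the constant group-segment offset
    // returned by allocateLDSGlobal().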
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. It will be
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2481 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2482 B.buildExtract(DstReg, Load, 0); 2483 } else 2484 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2485 2486 MI.eraseFromParent(); 2487 return true; 2488 } 2489 2490 static LLT widenToNextPowerOf2(LLT Ty) { 2491 if (Ty.isVector()) 2492 return Ty.changeElementCount( 2493 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2494 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2495 } 2496 2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2498 MachineInstr &MI) const { 2499 MachineIRBuilder &B = Helper.MIRBuilder; 2500 MachineRegisterInfo &MRI = *B.getMRI(); 2501 GISelChangeObserver &Observer = Helper.Observer; 2502 2503 Register PtrReg = MI.getOperand(1).getReg(); 2504 LLT PtrTy = MRI.getType(PtrReg); 2505 unsigned AddrSpace = PtrTy.getAddressSpace(); 2506 2507 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2508 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2509 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2510 Observer.changingInstr(MI); 2511 MI.getOperand(1).setReg(Cast.getReg(0)); 2512 Observer.changedInstr(MI); 2513 return true; 2514 } 2515 2516 if (MI.getOpcode() != AMDGPU::G_LOAD) 2517 return false; 2518 2519 Register ValReg = MI.getOperand(0).getReg(); 2520 LLT ValTy = MRI.getType(ValReg); 2521 2522 MachineMemOperand *MMO = *MI.memoperands_begin(); 2523 const unsigned ValSize = ValTy.getSizeInBits(); 2524 const LLT MemTy = MMO->getMemoryType(); 2525 const Align MemAlign = MMO->getAlign(); 2526 const unsigned MemSize = MemTy.getSizeInBits(); 2527 const unsigned AlignInBits = 8 * MemAlign.value(); 2528 2529 // Widen non-power-of-2 loads to the alignment if needed 2530 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2531 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2532 2533 // This was already the correct extending load result type, so just adjust 2534 // the memory type. 2535 if (WideMemSize == ValSize) { 2536 MachineFunction &MF = B.getMF(); 2537 2538 MachineMemOperand *WideMMO = 2539 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2540 Observer.changingInstr(MI); 2541 MI.setMemRefs(MF, {WideMMO}); 2542 Observer.changedInstr(MI); 2543 return true; 2544 } 2545 2546 // Don't bother handling edge case that should probably never be produced. 2547 if (ValSize > WideMemSize) 2548 return false; 2549 2550 LLT WideTy = widenToNextPowerOf2(ValTy); 2551 2552 Register WideLoad; 2553 if (!WideTy.isVector()) { 2554 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2555 B.buildTrunc(ValReg, WideLoad).getReg(0); 2556 } else { 2557 // Extract the subvector. 2558 2559 if (isRegisterType(ValTy)) { 2560 // If this a case where G_EXTRACT is legal, use it. 2561 // (e.g. <3 x s32> -> <4 x s32>) 2562 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2563 B.buildExtract(ValReg, WideLoad, 0); 2564 } else { 2565 // For cases where the widened type isn't a nice register value, unmerge 2566 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2567 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2568 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2569 B.setInsertPt(B.getMBB(), MI.getIterator()); 2570 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2571 } 2572 } 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 return false; 2579 } 2580 2581 bool AMDGPULegalizerInfo::legalizeFMad( 2582 MachineInstr &MI, MachineRegisterInfo &MRI, 2583 MachineIRBuilder &B) const { 2584 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2585 assert(Ty.isScalar()); 2586 2587 MachineFunction &MF = B.getMF(); 2588 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2589 2590 // TODO: Always legal with future ftz flag. 2591 // FIXME: Do we need just output? 2592 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2593 return true; 2594 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2595 return true; 2596 2597 MachineIRBuilder HelperBuilder(MI); 2598 GISelObserverWrapper DummyObserver; 2599 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2600 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2601 } 2602 2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2604 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2605 Register DstReg = MI.getOperand(0).getReg(); 2606 Register PtrReg = MI.getOperand(1).getReg(); 2607 Register CmpVal = MI.getOperand(2).getReg(); 2608 Register NewVal = MI.getOperand(3).getReg(); 2609 2610 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2611 "this should not have been custom lowered"); 2612 2613 LLT ValTy = MRI.getType(CmpVal); 2614 LLT VecTy = LLT::fixed_vector(2, ValTy); 2615 2616 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2617 2618 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2619 .addDef(DstReg) 2620 .addUse(PtrReg) 2621 .addUse(PackedVal) 2622 .setMemRefs(MI.memoperands()); 2623 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 bool AMDGPULegalizerInfo::legalizeFlog( 2629 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2630 Register Dst = MI.getOperand(0).getReg(); 2631 Register Src = MI.getOperand(1).getReg(); 2632 LLT Ty = B.getMRI()->getType(Dst); 2633 unsigned Flags = MI.getFlags(); 2634 2635 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2636 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2637 2638 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2639 MI.eraseFromParent(); 2640 return true; 2641 } 2642 2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2644 MachineIRBuilder &B) const { 2645 Register Dst = MI.getOperand(0).getReg(); 2646 Register Src = MI.getOperand(1).getReg(); 2647 unsigned Flags = MI.getFlags(); 2648 LLT Ty = B.getMRI()->getType(Dst); 2649 2650 auto K = B.buildFConstant(Ty, numbers::log2e); 2651 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2652 B.buildFExp2(Dst, Mul, Flags); 2653 MI.eraseFromParent(); 2654 return true; 2655 } 2656 2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2658 MachineIRBuilder &B) const { 2659 Register Dst = MI.getOperand(0).getReg(); 2660 Register Src0 = MI.getOperand(1).getReg(); 2661 Register Src1 = MI.getOperand(2).getReg(); 2662 unsigned Flags = MI.getFlags(); 2663 LLT Ty = B.getMRI()->getType(Dst); 2664 const LLT S16 = LLT::scalar(16); 2665 const LLT S32 = LLT::scalar(32); 2666 2667 if (Ty == S32) { 2668 auto Log = B.buildFLog2(S32, Src0, Flags); 2669 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2670 .addUse(Log.getReg(0)) 2671 .addUse(Src1) 2672 .setMIFlags(Flags); 2673 B.buildFExp2(Dst, Mul, Flags); 2674 } else if (Ty == S16) { 2675 // There's no f16 fmul_legacy, so we need to convert for it. 2676 auto Log = B.buildFLog2(S16, Src0, Flags); 2677 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2678 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2679 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2680 .addUse(Ext0.getReg(0)) 2681 .addUse(Ext1.getReg(0)) 2682 .setMIFlags(Flags); 2683 2684 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2685 } else 2686 return false; 2687 2688 MI.eraseFromParent(); 2689 return true; 2690 } 2691 2692 // Find a source register, ignoring any possible source modifiers. 2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2694 Register ModSrc = OrigSrc; 2695 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2696 ModSrc = SrcFNeg->getOperand(1).getReg(); 2697 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2698 ModSrc = SrcFAbs->getOperand(1).getReg(); 2699 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2700 ModSrc = SrcFAbs->getOperand(1).getReg(); 2701 return ModSrc; 2702 } 2703 2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2705 MachineRegisterInfo &MRI, 2706 MachineIRBuilder &B) const { 2707 2708 const LLT S1 = LLT::scalar(1); 2709 const LLT S64 = LLT::scalar(64); 2710 Register Dst = MI.getOperand(0).getReg(); 2711 Register OrigSrc = MI.getOperand(1).getReg(); 2712 unsigned Flags = MI.getFlags(); 2713 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2714 "this should not have been custom lowered"); 2715 2716 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2717 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2718 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2719 // V_FRACT bug is: 2720 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2721 // 2722 // Convert floor(x) to (x - fract(x)) 2723 2724 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2725 .addUse(OrigSrc) 2726 .setMIFlags(Flags); 2727 2728 // Give source modifier matching some assistance before obscuring a foldable 2729 // pattern. 2730 2731 // TODO: We can avoid the neg on the fract? The input sign to fract 2732 // shouldn't matter? 2733 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2734 2735 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2736 2737 Register Min = MRI.createGenericVirtualRegister(S64); 2738 2739 // We don't need to concern ourselves with the snan handling difference, so 2740 // use the one which will directly select. 
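  // Worked example of the overall expansion (values illustrative):
  //   x = -0.5: fract(x) = 0.5, min(0.5, 1.0 - ulp) = 0.5, floor = x - 0.5 = -1.0
  //   x =  3.0: fract(x) = 0.0,                            floor = 3.0 - 0.0 = 3.0
  // The clamp against 0x3fefffffffffffff (the largest double below 1.0) keeps
  // the fract result strictly below 1.0, which is the workaround for the
  // V_FRACT bug described above.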
2741 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2742 if (MFI->getMode().IEEE) 2743 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2744 else 2745 B.buildFMinNum(Min, Fract, Const, Flags); 2746 2747 Register CorrectedFract = Min; 2748 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2749 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2750 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2751 } 2752 2753 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2754 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2755 2756 MI.eraseFromParent(); 2757 return true; 2758 } 2759 2760 // Turn an illegal packed v2s16 build vector into bit operations. 2761 // TODO: This should probably be a bitcast action in LegalizerHelper. 2762 bool AMDGPULegalizerInfo::legalizeBuildVector( 2763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2764 Register Dst = MI.getOperand(0).getReg(); 2765 const LLT S32 = LLT::scalar(32); 2766 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2767 2768 Register Src0 = MI.getOperand(1).getReg(); 2769 Register Src1 = MI.getOperand(2).getReg(); 2770 assert(MRI.getType(Src0) == LLT::scalar(16)); 2771 2772 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2773 B.buildBitcast(Dst, Merge); 2774 2775 MI.eraseFromParent(); 2776 return true; 2777 } 2778 2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2781 // case with a single min instruction instead of a compare+select. 2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2783 MachineRegisterInfo &MRI, 2784 MachineIRBuilder &B) const { 2785 Register Dst = MI.getOperand(0).getReg(); 2786 Register Src = MI.getOperand(1).getReg(); 2787 LLT DstTy = MRI.getType(Dst); 2788 LLT SrcTy = MRI.getType(Src); 2789 2790 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2791 ? AMDGPU::G_AMDGPU_FFBH_U32 2792 : AMDGPU::G_AMDGPU_FFBL_B32; 2793 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2794 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2795 2796 MI.eraseFromParent(); 2797 return true; 2798 } 2799 2800 // Check that this is a G_XOR x, -1 2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2802 if (MI.getOpcode() != TargetOpcode::G_XOR) 2803 return false; 2804 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2805 return ConstVal && *ConstVal == -1; 2806 } 2807 2808 // Return the use branch instruction, otherwise null if the usage is invalid. 2809 static MachineInstr * 2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2811 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2812 Register CondDef = MI.getOperand(0).getReg(); 2813 if (!MRI.hasOneNonDBGUse(CondDef)) 2814 return nullptr; 2815 2816 MachineBasicBlock *Parent = MI.getParent(); 2817 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2818 2819 if (isNot(MRI, *UseMI)) { 2820 Register NegatedCond = UseMI->getOperand(0).getReg(); 2821 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2822 return nullptr; 2823 2824 // We're deleting the def of this value, so we need to remove it. 
2825 UseMI->eraseFromParent(); 2826 2827 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2828 Negated = true; 2829 } 2830 2831 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2832 return nullptr; 2833 2834 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2835 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2836 if (Next == Parent->end()) { 2837 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2838 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2839 return nullptr; 2840 UncondBrTarget = &*NextMBB; 2841 } else { 2842 if (Next->getOpcode() != AMDGPU::G_BR) 2843 return nullptr; 2844 Br = &*Next; 2845 UncondBrTarget = Br->getOperand(0).getMBB(); 2846 } 2847 2848 return UseMI; 2849 } 2850 2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2852 const ArgDescriptor *Arg, 2853 const TargetRegisterClass *ArgRC, 2854 LLT ArgTy) const { 2855 MCRegister SrcReg = Arg->getRegister(); 2856 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2857 assert(DstReg.isVirtual() && "Virtual register expected"); 2858 2859 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2860 ArgTy); 2861 if (Arg->isMasked()) { 2862 // TODO: Should we try to emit this once in the entry block? 2863 const LLT S32 = LLT::scalar(32); 2864 const unsigned Mask = Arg->getMask(); 2865 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2866 2867 Register AndMaskSrc = LiveIn; 2868 2869 if (Shift != 0) { 2870 auto ShiftAmt = B.buildConstant(S32, Shift); 2871 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2872 } 2873 2874 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2875 } else { 2876 B.buildCopy(DstReg, LiveIn); 2877 } 2878 2879 return true; 2880 } 2881 2882 bool AMDGPULegalizerInfo::loadInputValue( 2883 Register DstReg, MachineIRBuilder &B, 2884 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2885 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2886 const ArgDescriptor *Arg; 2887 const TargetRegisterClass *ArgRC; 2888 LLT ArgTy; 2889 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2890 2891 if (!Arg) { 2892 assert(ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2893 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2894 // case the pointer argument may be missing and we use null. 
2895 B.buildConstant(DstReg, 0); 2896 return true; 2897 } 2898 2899 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2900 return false; // TODO: Handle these 2901 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2902 } 2903 2904 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2905 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2906 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2907 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2908 return false; 2909 2910 MI.eraseFromParent(); 2911 return true; 2912 } 2913 2914 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2915 MachineRegisterInfo &MRI, 2916 MachineIRBuilder &B) const { 2917 Register Dst = MI.getOperand(0).getReg(); 2918 LLT DstTy = MRI.getType(Dst); 2919 LLT S16 = LLT::scalar(16); 2920 LLT S32 = LLT::scalar(32); 2921 LLT S64 = LLT::scalar(64); 2922 2923 if (DstTy == S16) 2924 return legalizeFDIV16(MI, MRI, B); 2925 if (DstTy == S32) 2926 return legalizeFDIV32(MI, MRI, B); 2927 if (DstTy == S64) 2928 return legalizeFDIV64(MI, MRI, B); 2929 2930 return false; 2931 } 2932 2933 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2934 Register DstDivReg, 2935 Register DstRemReg, 2936 Register X, 2937 Register Y) const { 2938 const LLT S1 = LLT::scalar(1); 2939 const LLT S32 = LLT::scalar(32); 2940 2941 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2942 // algorithm used here. 2943 2944 // Initial estimate of inv(y). 2945 auto FloatY = B.buildUITOFP(S32, Y); 2946 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2947 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2948 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2949 auto Z = B.buildFPTOUI(S32, ScaledY); 2950 2951 // One round of UNR. 2952 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2953 auto NegYZ = B.buildMul(S32, NegY, Z); 2954 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2955 2956 // Quotient/remainder estimate. 2957 auto Q = B.buildUMulH(S32, X, Z); 2958 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2959 2960 // First quotient/remainder refinement. 2961 auto One = B.buildConstant(S32, 1); 2962 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2963 if (DstDivReg) 2964 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2965 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2966 2967 // Second quotient/remainder refinement. 
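  // A scalar model of this 32-bit expansion (illustrative pseudocode; the
  // estimate after one Newton-Raphson step is close enough that exactly two
  // conditional corrections suffice, which is why the code below does two):
  //
  //   z  = (uint32_t)(rcp((float)y) * bitcast<float>(0x4f7ffffe)); // ~2^32/y
  //   z += umulh(z, (0u - y) * z);   // one Newton-Raphson refinement of z
  //   q  = umulh(x, z);              // quotient estimate
  //   r  = x - q * y;                // remainder estimate
  //   if (r >= y) { ++q; r -= y; }   // first refinement (above)
  //   if (r >= y) { ++q; r -= y; }   // second refinement (below)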
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo =
UnmergeNumer.getReg(0); 3063 Register NumerHi = UnmergeNumer.getReg(1); 3064 3065 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3066 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3067 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3068 Register Mul3_Lo = UnmergeMul3.getReg(0); 3069 Register Mul3_Hi = UnmergeMul3.getReg(1); 3070 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3071 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3072 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3073 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3074 3075 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3076 Register DenomLo = UnmergeDenom.getReg(0); 3077 Register DenomHi = UnmergeDenom.getReg(1); 3078 3079 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3080 auto C1 = B.buildSExt(S32, CmpHi); 3081 3082 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3083 auto C2 = B.buildSExt(S32, CmpLo); 3084 3085 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3086 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3087 3088 // TODO: Here and below portions of the code can be enclosed into if/endif. 3089 // Currently control flow is unconditional and we have 4 selects after 3090 // potential endif to substitute PHIs. 3091 3092 // if C3 != 0 ... 3093 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3094 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3095 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3096 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3097 3098 auto One64 = B.buildConstant(S64, 1); 3099 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3100 3101 auto C4 = 3102 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3103 auto C5 = 3104 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3105 auto C6 = B.buildSelect( 3106 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3107 3108 // if (C6 != 0) 3109 auto Add4 = B.buildAdd(S64, Add3, One64); 3110 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3111 3112 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3113 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3114 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3115 3116 // endif C6 3117 // endif C3 3118 3119 if (DstDivReg) { 3120 auto Sel1 = B.buildSelect( 3121 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3122 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3123 Sel1, MulHi3); 3124 } 3125 3126 if (DstRemReg) { 3127 auto Sel2 = B.buildSelect( 3128 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3129 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3130 Sel2, Sub1); 3131 } 3132 } 3133 3134 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3135 MachineRegisterInfo &MRI, 3136 MachineIRBuilder &B) const { 3137 Register DstDivReg, DstRemReg; 3138 switch (MI.getOpcode()) { 3139 default: 3140 llvm_unreachable("Unexpected opcode!"); 3141 case AMDGPU::G_UDIV: { 3142 DstDivReg = MI.getOperand(0).getReg(); 3143 break; 3144 } 3145 case AMDGPU::G_UREM: { 3146 DstRemReg = MI.getOperand(0).getReg(); 3147 break; 3148 } 3149 case AMDGPU::G_UDIVREM: { 3150 DstDivReg = MI.getOperand(0).getReg(); 3151 DstRemReg = MI.getOperand(1).getReg(); 3152 break; 3153 } 3154 } 3155 3156 const LLT S64 = LLT::scalar(64); 3157 const LLT S32 = LLT::scalar(32); 3158 const unsigned 
FirstSrcOpIdx = MI.getNumExplicitDefs(); 3159 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3160 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3161 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3162 3163 if (Ty == S32) 3164 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3165 else if (Ty == S64) 3166 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3167 else 3168 return false; 3169 3170 MI.eraseFromParent(); 3171 return true; 3172 } 3173 3174 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3175 MachineRegisterInfo &MRI, 3176 MachineIRBuilder &B) const { 3177 const LLT S64 = LLT::scalar(64); 3178 const LLT S32 = LLT::scalar(32); 3179 3180 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3181 if (Ty != S32 && Ty != S64) 3182 return false; 3183 3184 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3185 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3186 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3187 3188 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3189 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3190 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3191 3192 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3193 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3194 3195 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3196 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3197 3198 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3199 switch (MI.getOpcode()) { 3200 default: 3201 llvm_unreachable("Unexpected opcode!"); 3202 case AMDGPU::G_SDIV: { 3203 DstDivReg = MI.getOperand(0).getReg(); 3204 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3205 break; 3206 } 3207 case AMDGPU::G_SREM: { 3208 DstRemReg = MI.getOperand(0).getReg(); 3209 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3210 break; 3211 } 3212 case AMDGPU::G_SDIVREM: { 3213 DstDivReg = MI.getOperand(0).getReg(); 3214 DstRemReg = MI.getOperand(1).getReg(); 3215 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3216 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3217 break; 3218 } 3219 } 3220 3221 if (Ty == S32) 3222 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3223 else 3224 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3225 3226 if (DstDivReg) { 3227 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3228 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3229 B.buildSub(DstDivReg, SignXor, Sign); 3230 } 3231 3232 if (DstRemReg) { 3233 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3234 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3235 B.buildSub(DstRemReg, SignXor, Sign); 3236 } 3237 3238 MI.eraseFromParent(); 3239 return true; 3240 } 3241 3242 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3243 MachineRegisterInfo &MRI, 3244 MachineIRBuilder &B) const { 3245 Register Res = MI.getOperand(0).getReg(); 3246 Register LHS = MI.getOperand(1).getReg(); 3247 Register RHS = MI.getOperand(2).getReg(); 3248 uint16_t Flags = MI.getFlags(); 3249 LLT ResTy = MRI.getType(Res); 3250 3251 const MachineFunction &MF = B.getMF(); 3252 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3253 MI.getFlag(MachineInstr::FmAfn); 3254 3255 if (!AllowInaccurateRcp) 3256 return false; 3257 3258 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3259 // 1 / x -> RCP(x) 3260 if (CLHS->isExactlyValue(1.0)) { 3261 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3262 .addUse(RHS) 3263 .setMIFlags(Flags); 3264 3265 
MI.eraseFromParent(); 3266 return true; 3267 } 3268 3269 // -1 / x -> RCP( FNEG(x) ) 3270 if (CLHS->isExactlyValue(-1.0)) { 3271 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3272 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3273 .addUse(FNeg.getReg(0)) 3274 .setMIFlags(Flags); 3275 3276 MI.eraseFromParent(); 3277 return true; 3278 } 3279 } 3280 3281 // x / y -> x * (1.0 / y) 3282 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3283 .addUse(RHS) 3284 .setMIFlags(Flags); 3285 B.buildFMul(Res, LHS, RCP, Flags); 3286 3287 MI.eraseFromParent(); 3288 return true; 3289 } 3290 3291 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3292 MachineRegisterInfo &MRI, 3293 MachineIRBuilder &B) const { 3294 Register Res = MI.getOperand(0).getReg(); 3295 Register X = MI.getOperand(1).getReg(); 3296 Register Y = MI.getOperand(2).getReg(); 3297 uint16_t Flags = MI.getFlags(); 3298 LLT ResTy = MRI.getType(Res); 3299 3300 const MachineFunction &MF = B.getMF(); 3301 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3302 MI.getFlag(MachineInstr::FmAfn); 3303 3304 if (!AllowInaccurateRcp) 3305 return false; 3306 3307 auto NegY = B.buildFNeg(ResTy, Y); 3308 auto One = B.buildFConstant(ResTy, 1.0); 3309 3310 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3311 .addUse(Y) 3312 .setMIFlags(Flags); 3313 3314 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3315 R = B.buildFMA(ResTy, Tmp0, R, R); 3316 3317 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3318 R = B.buildFMA(ResTy, Tmp1, R, R); 3319 3320 auto Ret = B.buildFMul(ResTy, X, R); 3321 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3322 3323 B.buildFMA(Res, Tmp2, R, Ret); 3324 MI.eraseFromParent(); 3325 return true; 3326 } 3327 3328 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3329 MachineRegisterInfo &MRI, 3330 MachineIRBuilder &B) const { 3331 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3332 return true; 3333 3334 Register Res = MI.getOperand(0).getReg(); 3335 Register LHS = MI.getOperand(1).getReg(); 3336 Register RHS = MI.getOperand(2).getReg(); 3337 3338 uint16_t Flags = MI.getFlags(); 3339 3340 LLT S16 = LLT::scalar(16); 3341 LLT S32 = LLT::scalar(32); 3342 3343 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3344 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3345 3346 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3347 .addUse(RHSExt.getReg(0)) 3348 .setMIFlags(Flags); 3349 3350 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3351 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3352 3353 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3354 .addUse(RDst.getReg(0)) 3355 .addUse(RHS) 3356 .addUse(LHS) 3357 .setMIFlags(Flags); 3358 3359 MI.eraseFromParent(); 3360 return true; 3361 } 3362 3363 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3364 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3365 static void toggleSPDenormMode(bool Enable, 3366 MachineIRBuilder &B, 3367 const GCNSubtarget &ST, 3368 AMDGPU::SIModeRegisterDefaults Mode) { 3369 // Set SP denorm mode to this value. 3370 unsigned SPDenormMode = 3371 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3372 3373 if (ST.hasDenormModeInst()) { 3374 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
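    // Layout of the immediate built below (following the shift used in the
    // code; illustrative):
    //   bits [1:0] = FP32 denorm mode, the field being toggled here
    //   bits [3:2] = FP64/FP16 denorm mode, preserved at its current value
    // so the value written by S_DENORM_MODE is
    //   (DPDenormModeDefault << 2) | SPDenormMode.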
3375 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3376 3377 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3378 B.buildInstr(AMDGPU::S_DENORM_MODE) 3379 .addImm(NewDenormModeValue); 3380 3381 } else { 3382 // Select FP32 bit field in mode register. 3383 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3384 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3385 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3386 3387 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3388 .addImm(SPDenormMode) 3389 .addImm(SPDenormModeBitField); 3390 } 3391 } 3392 3393 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3394 MachineRegisterInfo &MRI, 3395 MachineIRBuilder &B) const { 3396 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3397 return true; 3398 3399 Register Res = MI.getOperand(0).getReg(); 3400 Register LHS = MI.getOperand(1).getReg(); 3401 Register RHS = MI.getOperand(2).getReg(); 3402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3403 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3404 3405 uint16_t Flags = MI.getFlags(); 3406 3407 LLT S32 = LLT::scalar(32); 3408 LLT S1 = LLT::scalar(1); 3409 3410 auto One = B.buildFConstant(S32, 1.0f); 3411 3412 auto DenominatorScaled = 3413 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3414 .addUse(LHS) 3415 .addUse(RHS) 3416 .addImm(0) 3417 .setMIFlags(Flags); 3418 auto NumeratorScaled = 3419 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3420 .addUse(LHS) 3421 .addUse(RHS) 3422 .addImm(1) 3423 .setMIFlags(Flags); 3424 3425 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3426 .addUse(DenominatorScaled.getReg(0)) 3427 .setMIFlags(Flags); 3428 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3429 3430 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3431 // aren't modeled as reading it. 
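  // Roughly, the expansion below is: scale the operands with div_scale,
  // approximate 1/denominator with rcp, refine that estimate with one
  // Newton-Raphson step built from two FMAs (Fma0/Fma1), form and refine the
  // quotient (Mul, Fma2..Fma4), then let div_fmas and div_fixup apply the
  // final correction and handle the special cases. FP32 denormals are
  // temporarily enabled around the FMA chain (when not already on) so the
  // intermediate values are not flushed.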
3432 if (!Mode.allFP32Denormals()) 3433 toggleSPDenormMode(true, B, ST, Mode); 3434 3435 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3436 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3437 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3438 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3439 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3440 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3441 3442 if (!Mode.allFP32Denormals()) 3443 toggleSPDenormMode(false, B, ST, Mode); 3444 3445 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3446 .addUse(Fma4.getReg(0)) 3447 .addUse(Fma1.getReg(0)) 3448 .addUse(Fma3.getReg(0)) 3449 .addUse(NumeratorScaled.getReg(1)) 3450 .setMIFlags(Flags); 3451 3452 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3453 .addUse(Fmas.getReg(0)) 3454 .addUse(RHS) 3455 .addUse(LHS) 3456 .setMIFlags(Flags); 3457 3458 MI.eraseFromParent(); 3459 return true; 3460 } 3461 3462 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3463 MachineRegisterInfo &MRI, 3464 MachineIRBuilder &B) const { 3465 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3466 return true; 3467 3468 Register Res = MI.getOperand(0).getReg(); 3469 Register LHS = MI.getOperand(1).getReg(); 3470 Register RHS = MI.getOperand(2).getReg(); 3471 3472 uint16_t Flags = MI.getFlags(); 3473 3474 LLT S64 = LLT::scalar(64); 3475 LLT S1 = LLT::scalar(1); 3476 3477 auto One = B.buildFConstant(S64, 1.0); 3478 3479 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3480 .addUse(LHS) 3481 .addUse(RHS) 3482 .addImm(0) 3483 .setMIFlags(Flags); 3484 3485 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3486 3487 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3488 .addUse(DivScale0.getReg(0)) 3489 .setMIFlags(Flags); 3490 3491 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3492 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3493 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3494 3495 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3496 .addUse(LHS) 3497 .addUse(RHS) 3498 .addImm(1) 3499 .setMIFlags(Flags); 3500 3501 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3502 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3503 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3504 3505 Register Scale; 3506 if (!ST.hasUsableDivScaleConditionOutput()) { 3507 // Workaround a hardware bug on SI where the condition output from div_scale 3508 // is not usable. 
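    // Reconstruct the condition bit manually: compare the high halves of the
    // original operands with the high halves of the div_scale results to work
    // out which operand was scaled, then xor the two compares to recover the
    // flag that div_scale would normally have produced.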
3509 3510 LLT S32 = LLT::scalar(32); 3511 3512 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3513 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3514 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3515 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3516 3517 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3518 Scale1Unmerge.getReg(1)); 3519 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3520 Scale0Unmerge.getReg(1)); 3521 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3522 } else { 3523 Scale = DivScale1.getReg(1); 3524 } 3525 3526 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3527 .addUse(Fma4.getReg(0)) 3528 .addUse(Fma3.getReg(0)) 3529 .addUse(Mul.getReg(0)) 3530 .addUse(Scale) 3531 .setMIFlags(Flags); 3532 3533 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3534 .addUse(Fmas.getReg(0)) 3535 .addUse(RHS) 3536 .addUse(LHS) 3537 .setMIFlags(Flags); 3538 3539 MI.eraseFromParent(); 3540 return true; 3541 } 3542 3543 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3544 MachineRegisterInfo &MRI, 3545 MachineIRBuilder &B) const { 3546 Register Res = MI.getOperand(0).getReg(); 3547 Register LHS = MI.getOperand(2).getReg(); 3548 Register RHS = MI.getOperand(3).getReg(); 3549 uint16_t Flags = MI.getFlags(); 3550 3551 LLT S32 = LLT::scalar(32); 3552 LLT S1 = LLT::scalar(1); 3553 3554 auto Abs = B.buildFAbs(S32, RHS, Flags); 3555 const APFloat C0Val(1.0f); 3556 3557 auto C0 = B.buildConstant(S32, 0x6f800000); 3558 auto C1 = B.buildConstant(S32, 0x2f800000); 3559 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3560 3561 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3562 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3563 3564 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3565 3566 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3567 .addUse(Mul0.getReg(0)) 3568 .setMIFlags(Flags); 3569 3570 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3571 3572 B.buildFMul(Res, Sel, Mul1, Flags); 3573 3574 MI.eraseFromParent(); 3575 return true; 3576 } 3577 3578 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3579 // FIXME: Why do we handle this one but not other removed instructions? 3580 // 3581 // Reciprocal square root. The clamp prevents infinite results, clamping 3582 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3583 // +-max_float. 3584 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3585 MachineRegisterInfo &MRI, 3586 MachineIRBuilder &B) const { 3587 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3588 return true; 3589 3590 Register Dst = MI.getOperand(0).getReg(); 3591 Register Src = MI.getOperand(2).getReg(); 3592 auto Flags = MI.getFlags(); 3593 3594 LLT Ty = MRI.getType(Dst); 3595 3596 const fltSemantics *FltSemantics; 3597 if (Ty == LLT::scalar(32)) 3598 FltSemantics = &APFloat::IEEEsingle(); 3599 else if (Ty == LLT::scalar(64)) 3600 FltSemantics = &APFloat::IEEEdouble(); 3601 else 3602 return false; 3603 3604 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3605 .addUse(Src) 3606 .setMIFlags(Flags); 3607 3608 // We don't need to concern ourselves with the snan handling difference, since 3609 // the rsq quieted (or not) so use the one which will directly select. 
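  // Clamp the result to +/-max_float: apply fminnum with +max_float followed
  // by fmaxnum with -max_float, using the IEEE variants of min/max when the
  // function runs in IEEE mode so the selected instructions match that mode.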
3610 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3611 const bool UseIEEE = MFI->getMode().IEEE; 3612 3613 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3614 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3615 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3616 3617 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3618 3619 if (UseIEEE) 3620 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3621 else 3622 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3623 MI.eraseFromParent(); 3624 return true; 3625 } 3626 3627 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3628 switch (IID) { 3629 case Intrinsic::amdgcn_ds_fadd: 3630 return AMDGPU::G_ATOMICRMW_FADD; 3631 case Intrinsic::amdgcn_ds_fmin: 3632 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3633 case Intrinsic::amdgcn_ds_fmax: 3634 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3635 default: 3636 llvm_unreachable("not a DS FP intrinsic"); 3637 } 3638 } 3639 3640 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3641 MachineInstr &MI, 3642 Intrinsic::ID IID) const { 3643 GISelChangeObserver &Observer = Helper.Observer; 3644 Observer.changingInstr(MI); 3645 3646 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3647 3648 // The remaining operands were used to set fields in the MemOperand on 3649 // construction. 3650 for (int I = 6; I > 3; --I) 3651 MI.RemoveOperand(I); 3652 3653 MI.RemoveOperand(1); // Remove the intrinsic ID. 3654 Observer.changedInstr(MI); 3655 return true; 3656 } 3657 3658 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3659 MachineRegisterInfo &MRI, 3660 MachineIRBuilder &B) const { 3661 uint64_t Offset = 3662 ST.getTargetLowering()->getImplicitParameterOffset( 3663 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3664 LLT DstTy = MRI.getType(DstReg); 3665 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3666 3667 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3668 if (!loadInputValue(KernargPtrReg, B, 3669 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3670 return false; 3671 3672 // FIXME: This should be nuw 3673 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3674 return true; 3675 } 3676 3677 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3678 MachineRegisterInfo &MRI, 3679 MachineIRBuilder &B) const { 3680 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3681 if (!MFI->isEntryFunction()) { 3682 return legalizePreloadedArgIntrin(MI, MRI, B, 3683 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3684 } 3685 3686 Register DstReg = MI.getOperand(0).getReg(); 3687 if (!getImplicitArgPtr(DstReg, MRI, B)) 3688 return false; 3689 3690 MI.eraseFromParent(); 3691 return true; 3692 } 3693 3694 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3695 MachineRegisterInfo &MRI, 3696 MachineIRBuilder &B, 3697 unsigned AddrSpace) const { 3698 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3699 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3700 Register Hi32 = Unmerge.getReg(1); 3701 3702 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3703 MI.eraseFromParent(); 3704 return true; 3705 } 3706 3707 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3708 // offset (the offset that is included in bounds checking and swizzling, to be 3709 // split between the instruction's voffset and immoffset fields) 
and soffset 3710 // (the offset that is excluded from bounds checking and swizzling, to go in 3711 // the instruction's soffset field). This function takes the first kind of 3712 // offset and figures out how to split it between voffset and immoffset. 3713 std::pair<Register, unsigned> 3714 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3715 Register OrigOffset) const { 3716 const unsigned MaxImm = 4095; 3717 Register BaseReg; 3718 unsigned ImmOffset; 3719 const LLT S32 = LLT::scalar(32); 3720 MachineRegisterInfo &MRI = *B.getMRI(); 3721 3722 std::tie(BaseReg, ImmOffset) = 3723 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3724 3725 // If BaseReg is a pointer, convert it to int. 3726 if (MRI.getType(BaseReg).isPointer()) 3727 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3728 3729 // If the immediate value is too big for the immoffset field, keep only the 3730 // low 12 bits of it in the immoffset field and move the remainder (a 3731 // multiple of 4096) into the value that is copied/added for the voffset 3732 // field, so it stands more chance of being CSEd with the copy/add for 3733 // another similar load/store. However, do not do that rounding down if the 3734 // multiple of 4096 is negative, as it appears to be illegal to have a negative offset 3735 // in the vgpr, even if adding the immediate offset makes it positive. 3736 unsigned Overflow = ImmOffset & ~MaxImm; 3737 ImmOffset -= Overflow; 3738 if ((int32_t)Overflow < 0) { 3739 Overflow += ImmOffset; 3740 ImmOffset = 0; 3741 } 3742 3743 if (Overflow != 0) { 3744 if (!BaseReg) { 3745 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3746 } else { 3747 auto OverflowVal = B.buildConstant(S32, Overflow); 3748 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3749 } 3750 } 3751 3752 if (!BaseReg) 3753 BaseReg = B.buildConstant(S32, 0).getReg(0); 3754 3755 return std::make_pair(BaseReg, ImmOffset); 3756 } 3757 3758 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3759 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3760 Register VOffset, Register SOffset, 3761 unsigned ImmOffset, Register VIndex, 3762 MachineRegisterInfo &MRI) const { 3763 Optional<ValueAndVReg> MaybeVOffsetVal = 3764 getConstantVRegValWithLookThrough(VOffset, MRI); 3765 Optional<ValueAndVReg> MaybeSOffsetVal = 3766 getConstantVRegValWithLookThrough(SOffset, MRI); 3767 Optional<ValueAndVReg> MaybeVIndexVal = 3768 getConstantVRegValWithLookThrough(VIndex, MRI); 3769 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3770 // update the MMO with that offset. The stride is unknown so we can only do 3771 // this if VIndex is constant 0. 3772 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3773 MaybeVIndexVal->Value == 0) { 3774 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3775 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3776 MMO->setOffset(TotalOffset); 3777 } else { 3778 // We don't have a constant combined offset to use in the MMO. Give up. 3779 MMO->setValue((Value *)nullptr); 3780 } 3781 } 3782 3783 /// Handle register layout difference for f16 images for some subtargets.
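///
/// Roughly: on subtargets with unpacked D16 memory instructions each s16
/// element is anyext'ed into its own s32 register; on subtargets with the
/// image-store D16 bug the packed data is padded out with undef to a full
/// dword-aligned vector; otherwise the register is returned unchanged.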
3784 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3785 MachineRegisterInfo &MRI, 3786 Register Reg, 3787 bool ImageStore) const { 3788 const LLT S16 = LLT::scalar(16); 3789 const LLT S32 = LLT::scalar(32); 3790 LLT StoreVT = MRI.getType(Reg); 3791 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3792 3793 if (ST.hasUnpackedD16VMem()) { 3794 auto Unmerge = B.buildUnmerge(S16, Reg); 3795 3796 SmallVector<Register, 4> WideRegs; 3797 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3798 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3799 3800 int NumElts = StoreVT.getNumElements(); 3801 3802 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3803 .getReg(0); 3804 } 3805 3806 if (ImageStore && ST.hasImageStoreD16Bug()) { 3807 if (StoreVT.getNumElements() == 2) { 3808 SmallVector<Register, 4> PackedRegs; 3809 Reg = B.buildBitcast(S32, Reg).getReg(0); 3810 PackedRegs.push_back(Reg); 3811 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3812 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3813 .getReg(0); 3814 } 3815 3816 if (StoreVT.getNumElements() == 3) { 3817 SmallVector<Register, 4> PackedRegs; 3818 auto Unmerge = B.buildUnmerge(S16, Reg); 3819 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3820 PackedRegs.push_back(Unmerge.getReg(I)); 3821 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3822 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3823 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3824 } 3825 3826 if (StoreVT.getNumElements() == 4) { 3827 SmallVector<Register, 4> PackedRegs; 3828 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3829 auto Unmerge = B.buildUnmerge(S32, Reg); 3830 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3831 PackedRegs.push_back(Unmerge.getReg(I)); 3832 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3833 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3834 .getReg(0); 3835 } 3836 3837 llvm_unreachable("invalid data type"); 3838 } 3839 3840 return Reg; 3841 } 3842 3843 Register AMDGPULegalizerInfo::fixStoreSourceType( 3844 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3845 MachineRegisterInfo *MRI = B.getMRI(); 3846 LLT Ty = MRI->getType(VData); 3847 3848 const LLT S16 = LLT::scalar(16); 3849 3850 // Fixup illegal register types for i8 stores. 3851 if (Ty == LLT::scalar(8) || Ty == S16) { 3852 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3853 return AnyExt; 3854 } 3855 3856 if (Ty.isVector()) { 3857 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3858 if (IsFormat) 3859 return handleD16VData(B, *MRI, VData); 3860 } 3861 } 3862 3863 return VData; 3864 } 3865 3866 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3867 MachineRegisterInfo &MRI, 3868 MachineIRBuilder &B, 3869 bool IsTyped, 3870 bool IsFormat) const { 3871 Register VData = MI.getOperand(1).getReg(); 3872 LLT Ty = MRI.getType(VData); 3873 LLT EltTy = Ty.getScalarType(); 3874 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3875 const LLT S32 = LLT::scalar(32); 3876 3877 VData = fixStoreSourceType(B, VData, IsFormat); 3878 Register RSrc = MI.getOperand(2).getReg(); 3879 3880 MachineMemOperand *MMO = *MI.memoperands_begin(); 3881 const int MemSize = MMO->getSize(); 3882 3883 unsigned ImmOffset; 3884 3885 // The typed intrinsics add an immediate after the registers. 
3886 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3887 3888 // The struct intrinsic variants add one additional operand over raw. 3889 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3890 Register VIndex; 3891 int OpOffset = 0; 3892 if (HasVIndex) { 3893 VIndex = MI.getOperand(3).getReg(); 3894 OpOffset = 1; 3895 } else { 3896 VIndex = B.buildConstant(S32, 0).getReg(0); 3897 } 3898 3899 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3900 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3901 3902 unsigned Format = 0; 3903 if (IsTyped) { 3904 Format = MI.getOperand(5 + OpOffset).getImm(); 3905 ++OpOffset; 3906 } 3907 3908 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3909 3910 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3911 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3912 3913 unsigned Opc; 3914 if (IsTyped) { 3915 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3916 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3917 } else if (IsFormat) { 3918 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3919 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3920 } else { 3921 switch (MemSize) { 3922 case 1: 3923 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3924 break; 3925 case 2: 3926 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3927 break; 3928 default: 3929 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3930 break; 3931 } 3932 } 3933 3934 auto MIB = B.buildInstr(Opc) 3935 .addUse(VData) // vdata 3936 .addUse(RSrc) // rsrc 3937 .addUse(VIndex) // vindex 3938 .addUse(VOffset) // voffset 3939 .addUse(SOffset) // soffset 3940 .addImm(ImmOffset); // offset(imm) 3941 3942 if (IsTyped) 3943 MIB.addImm(Format); 3944 3945 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3946 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3947 .addMemOperand(MMO); 3948 3949 MI.eraseFromParent(); 3950 return true; 3951 } 3952 3953 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3954 MachineRegisterInfo &MRI, 3955 MachineIRBuilder &B, 3956 bool IsFormat, 3957 bool IsTyped) const { 3958 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3959 MachineMemOperand *MMO = *MI.memoperands_begin(); 3960 const LLT MemTy = MMO->getMemoryType(); 3961 const LLT S32 = LLT::scalar(32); 3962 3963 Register Dst = MI.getOperand(0).getReg(); 3964 Register RSrc = MI.getOperand(2).getReg(); 3965 3966 // The typed intrinsics add an immediate after the registers. 3967 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3968 3969 // The struct intrinsic variants add one additional operand over raw. 
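  // The operand layout here is roughly: dst, intrinsic id, rsrc, [vindex],
  // voffset, soffset, [format], aux, so the vindex-carrying struct forms have
  // 7 operands, or 8 for the typed (tbuffer) forms, which is what the
  // NumVIndexOps comparison below keys off.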
3970 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3971 Register VIndex; 3972 int OpOffset = 0; 3973 if (HasVIndex) { 3974 VIndex = MI.getOperand(3).getReg(); 3975 OpOffset = 1; 3976 } else { 3977 VIndex = B.buildConstant(S32, 0).getReg(0); 3978 } 3979 3980 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3981 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3982 3983 unsigned Format = 0; 3984 if (IsTyped) { 3985 Format = MI.getOperand(5 + OpOffset).getImm(); 3986 ++OpOffset; 3987 } 3988 3989 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3990 unsigned ImmOffset; 3991 3992 LLT Ty = MRI.getType(Dst); 3993 LLT EltTy = Ty.getScalarType(); 3994 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3995 const bool Unpacked = ST.hasUnpackedD16VMem(); 3996 3997 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3998 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3999 4000 unsigned Opc; 4001 4002 if (IsTyped) { 4003 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4004 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4005 } else if (IsFormat) { 4006 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4007 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4008 } else { 4009 switch (MemTy.getSizeInBits()) { 4010 case 8: 4011 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4012 break; 4013 case 16: 4014 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4015 break; 4016 default: 4017 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4018 break; 4019 } 4020 } 4021 4022 Register LoadDstReg; 4023 4024 bool IsExtLoad = 4025 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4026 LLT UnpackedTy = Ty.changeElementSize(32); 4027 4028 if (IsExtLoad) 4029 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4030 else if (Unpacked && IsD16 && Ty.isVector()) 4031 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4032 else 4033 LoadDstReg = Dst; 4034 4035 auto MIB = B.buildInstr(Opc) 4036 .addDef(LoadDstReg) // vdata 4037 .addUse(RSrc) // rsrc 4038 .addUse(VIndex) // vindex 4039 .addUse(VOffset) // voffset 4040 .addUse(SOffset) // soffset 4041 .addImm(ImmOffset); // offset(imm) 4042 4043 if (IsTyped) 4044 MIB.addImm(Format); 4045 4046 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4047 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4048 .addMemOperand(MMO); 4049 4050 if (LoadDstReg != Dst) { 4051 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4052 4053 // Widen result for extending loads was widened. 4054 if (IsExtLoad) 4055 B.buildTrunc(Dst, LoadDstReg); 4056 else { 4057 // Repack to original 16-bit vector result 4058 // FIXME: G_TRUNC should work, but legalization currently fails 4059 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4060 SmallVector<Register, 4> Repack; 4061 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4062 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4063 B.buildMerge(Dst, Repack); 4064 } 4065 } 4066 4067 MI.eraseFromParent(); 4068 return true; 4069 } 4070 4071 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4072 MachineIRBuilder &B, 4073 bool IsInc) const { 4074 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4075 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4076 B.buildInstr(Opc) 4077 .addDef(MI.getOperand(0).getReg()) 4078 .addUse(MI.getOperand(2).getReg()) 4079 .addUse(MI.getOperand(3).getReg()) 4080 .cloneMemRefs(MI); 4081 MI.eraseFromParent(); 4082 return true; 4083 } 4084 4085 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4086 switch (IntrID) { 4087 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4088 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4090 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4091 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4092 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4093 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4094 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4095 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4096 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4097 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4098 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4099 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4100 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4101 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4102 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4103 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4105 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4106 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4107 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4108 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4109 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4110 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4111 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4112 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4113 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4114 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4115 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4116 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4117 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4118 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4120 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4121 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4122 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4123 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4124 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4125 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4126 case Intrinsic::amdgcn_buffer_atomic_fadd: 4127 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4128 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4131 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4133 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4134 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4136 default: 4137 llvm_unreachable("unhandled atomic opcode"); 4138 } 4139 } 4140 4141 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4142 MachineIRBuilder &B, 4143 Intrinsic::ID IID) const { 4144 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4145 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4146 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4147 4148 Register Dst; 4149 4150 int OpOffset = 0; 4151 if (HasReturn) { 4152 // A few FP atomics do not support return values. 
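    // Operands are laid out as: [dst], intrinsic id, vdata, [cmp], rsrc,
    // [vindex], voffset, soffset, aux. OpOffset tracks which of the optional
    // operands are present so the getOperand indices below stay in sync.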
4153 Dst = MI.getOperand(0).getReg(); 4154 } else { 4155 OpOffset = -1; 4156 } 4157 4158 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4159 Register CmpVal; 4160 4161 if (IsCmpSwap) { 4162 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4163 ++OpOffset; 4164 } 4165 4166 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4167 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4168 4169 // The struct intrinsic variants add one additional operand over raw. 4170 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4171 Register VIndex; 4172 if (HasVIndex) { 4173 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4174 ++OpOffset; 4175 } else { 4176 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4177 } 4178 4179 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4180 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4181 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4182 4183 MachineMemOperand *MMO = *MI.memoperands_begin(); 4184 4185 unsigned ImmOffset; 4186 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4187 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4188 4189 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4190 4191 if (HasReturn) 4192 MIB.addDef(Dst); 4193 4194 MIB.addUse(VData); // vdata 4195 4196 if (IsCmpSwap) 4197 MIB.addReg(CmpVal); 4198 4199 MIB.addUse(RSrc) // rsrc 4200 .addUse(VIndex) // vindex 4201 .addUse(VOffset) // voffset 4202 .addUse(SOffset) // soffset 4203 .addImm(ImmOffset) // offset(imm) 4204 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4205 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4206 .addMemOperand(MMO); 4207 4208 MI.eraseFromParent(); 4209 return true; 4210 } 4211 4212 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4213 /// vector with s16 typed elements. 4214 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4215 SmallVectorImpl<Register> &PackedAddrs, 4216 unsigned ArgOffset, 4217 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4218 bool IsA16, bool IsG16) { 4219 const LLT S16 = LLT::scalar(16); 4220 const LLT V2S16 = LLT::fixed_vector(2, 16); 4221 auto EndIdx = Intr->VAddrEnd; 4222 4223 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4224 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4225 if (!SrcOp.isReg()) 4226 continue; // _L to _LZ may have eliminated this. 4227 4228 Register AddrReg = SrcOp.getReg(); 4229 4230 if ((I < Intr->GradientStart) || 4231 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4232 (I >= Intr->CoordStart && !IsA16)) { 4233 // Handle any gradient or coordinate operands that should not be packed 4234 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4235 PackedAddrs.push_back(AddrReg); 4236 } else { 4237 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4238 // derivatives dx/dh and dx/dv are packed with undef. 
4239 if (((I + 1) >= EndIdx) || 4240 ((Intr->NumGradients / 2) % 2 == 1 && 4241 (I == static_cast<unsigned>(Intr->GradientStart + 4242 (Intr->NumGradients / 2) - 1) || 4243 I == static_cast<unsigned>(Intr->GradientStart + 4244 Intr->NumGradients - 1))) || 4245 // Check for _L to _LZ optimization 4246 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4247 PackedAddrs.push_back( 4248 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4249 .getReg(0)); 4250 } else { 4251 PackedAddrs.push_back( 4252 B.buildBuildVector( 4253 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4254 .getReg(0)); 4255 ++I; 4256 } 4257 } 4258 } 4259 } 4260 4261 /// Convert from separate vaddr components to a single vector address register, 4262 /// and replace the remaining operands with $noreg. 4263 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4264 int DimIdx, int NumVAddrs) { 4265 const LLT S32 = LLT::scalar(32); 4266 4267 SmallVector<Register, 8> AddrRegs; 4268 for (int I = 0; I != NumVAddrs; ++I) { 4269 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4270 if (SrcOp.isReg()) { 4271 AddrRegs.push_back(SrcOp.getReg()); 4272 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4273 } 4274 } 4275 4276 int NumAddrRegs = AddrRegs.size(); 4277 if (NumAddrRegs != 1) { 4278 // Above 8 elements round up to next power of 2 (i.e. 16). 4279 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4280 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4281 auto Undef = B.buildUndef(S32); 4282 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4283 NumAddrRegs = RoundedNumRegs; 4284 } 4285 4286 auto VAddr = 4287 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4288 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4289 } 4290 4291 for (int I = 1; I != NumVAddrs; ++I) { 4292 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4293 if (SrcOp.isReg()) 4294 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4295 } 4296 } 4297 4298 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4299 /// 4300 /// Depending on the subtarget, load/store with 16-bit element data need to be 4301 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4302 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4303 /// registers. 4304 /// 4305 /// We don't want to directly select image instructions just yet, but also want 4306 /// to expose all register repacking to the legalizer/combiners. We also don't 4307 /// want a selected instruction entering RegBankSelect. In order to avoid 4308 /// defining a multitude of intermediate image instructions, directly hack on 4309 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4310 /// now unnecessary arguments with $noreg. 4311 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4312 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4313 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4314 4315 const unsigned NumDefs = MI.getNumExplicitDefs(); 4316 const unsigned ArgOffset = NumDefs + 1; 4317 bool IsTFE = NumDefs == 2; 4318 // We are only processing the operands of d16 image operations on subtargets 4319 // that use the unpacked register layout, or need to repack the TFE result. 4320 4321 // TODO: Do we need to guard against already legalized intrinsics?
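  // The overall flow below: look up the base opcode info, derive the dmask
  // and the A16/G16 address modes, switch to the G_AMDGPU_INTRIN_IMAGE_*
  // pseudo, pack atomic data and 16-bit addresses as required, and finally
  // rewrite a D16 or TFE result into the register layout the rest of the
  // pipeline expects.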
4322 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4323 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4324 4325 MachineRegisterInfo *MRI = B.getMRI(); 4326 const LLT S32 = LLT::scalar(32); 4327 const LLT S16 = LLT::scalar(16); 4328 const LLT V2S16 = LLT::fixed_vector(2, 16); 4329 4330 unsigned DMask = 0; 4331 4332 // Check for 16 bit addresses and pack if true. 4333 LLT GradTy = 4334 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4335 LLT AddrTy = 4336 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4337 const bool IsG16 = GradTy == S16; 4338 const bool IsA16 = AddrTy == S16; 4339 4340 int DMaskLanes = 0; 4341 if (!BaseOpcode->Atomic) { 4342 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4343 if (BaseOpcode->Gather4) { 4344 DMaskLanes = 4; 4345 } else if (DMask != 0) { 4346 DMaskLanes = countPopulation(DMask); 4347 } else if (!IsTFE && !BaseOpcode->Store) { 4348 // If dmask is 0, this is a no-op load. This can be eliminated. 4349 B.buildUndef(MI.getOperand(0)); 4350 MI.eraseFromParent(); 4351 return true; 4352 } 4353 } 4354 4355 Observer.changingInstr(MI); 4356 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4357 4358 unsigned NewOpcode = NumDefs == 0 ? 4359 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4360 4361 // Track that we legalized this 4362 MI.setDesc(B.getTII().get(NewOpcode)); 4363 4364 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4365 // dmask to be at least 1 otherwise the instruction will fail 4366 if (IsTFE && DMask == 0) { 4367 DMask = 0x1; 4368 DMaskLanes = 1; 4369 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4370 } 4371 4372 if (BaseOpcode->Atomic) { 4373 Register VData0 = MI.getOperand(2).getReg(); 4374 LLT Ty = MRI->getType(VData0); 4375 4376 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4377 if (Ty.isVector()) 4378 return false; 4379 4380 if (BaseOpcode->AtomicX2) { 4381 Register VData1 = MI.getOperand(3).getReg(); 4382 // The two values are packed in one register. 4383 LLT PackedTy = LLT::fixed_vector(2, Ty); 4384 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4385 MI.getOperand(2).setReg(Concat.getReg(0)); 4386 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4387 } 4388 } 4389 4390 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4391 4392 // Optimize _L to _LZ when _L is zero 4393 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4394 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4395 const ConstantFP *ConstantLod; 4396 4397 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4398 m_GFCst(ConstantLod))) { 4399 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4400 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4401 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4402 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4403 Intr->Dim); 4404 4405 // The starting indexes should remain in the same place. 
4406 --CorrectedNumVAddrs; 4407 4408 MI.getOperand(MI.getNumExplicitDefs()) 4409 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4410 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4411 Intr = NewImageDimIntr; 4412 } 4413 } 4414 } 4415 4416 // Optimize _mip away, when 'lod' is zero 4417 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4418 int64_t ConstantLod; 4419 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4420 m_ICst(ConstantLod))) { 4421 if (ConstantLod == 0) { 4422 // TODO: Change intrinsic opcode and remove operand instead or replacing 4423 // it with 0, as the _L to _LZ handling is done above. 4424 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4425 --CorrectedNumVAddrs; 4426 } 4427 } 4428 } 4429 4430 // Rewrite the addressing register layout before doing anything else. 4431 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4432 // 16 bit gradients are supported, but are tied to the A16 control 4433 // so both gradients and addresses must be 16 bit 4434 return false; 4435 } 4436 4437 if (IsA16 && !ST.hasA16()) { 4438 // A16 not supported 4439 return false; 4440 } 4441 4442 if (IsA16 || IsG16) { 4443 if (Intr->NumVAddrs > 1) { 4444 SmallVector<Register, 4> PackedRegs; 4445 4446 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4447 IsG16); 4448 4449 // See also below in the non-a16 branch 4450 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4451 PackedRegs.size() <= ST.getNSAMaxSize(); 4452 4453 if (!UseNSA && PackedRegs.size() > 1) { 4454 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4455 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4456 PackedRegs[0] = Concat.getReg(0); 4457 PackedRegs.resize(1); 4458 } 4459 4460 const unsigned NumPacked = PackedRegs.size(); 4461 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4462 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4463 if (!SrcOp.isReg()) { 4464 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4465 continue; 4466 } 4467 4468 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4469 4470 if (I - Intr->VAddrStart < NumPacked) 4471 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4472 else 4473 SrcOp.setReg(AMDGPU::NoRegister); 4474 } 4475 } 4476 } else { 4477 // If the register allocator cannot place the address registers contiguously 4478 // without introducing moves, then using the non-sequential address encoding 4479 // is always preferable, since it saves VALU instructions and is usually a 4480 // wash in terms of code size or even better. 4481 // 4482 // However, we currently have no way of hinting to the register allocator 4483 // that MIMG addresses should be placed contiguously when it is possible to 4484 // do so, so force non-NSA for the common 2-address case as a heuristic. 4485 // 4486 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4487 // allocation when possible. 4488 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4489 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4490 4491 if (!UseNSA && Intr->NumVAddrs > 1) 4492 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4493 Intr->NumVAddrs); 4494 } 4495 4496 int Flags = 0; 4497 if (IsA16) 4498 Flags |= 1; 4499 if (IsG16) 4500 Flags |= 2; 4501 MI.addOperand(MachineOperand::CreateImm(Flags)); 4502 4503 if (BaseOpcode->Store) { // No TFE for stores? 
4504 // TODO: Handle dmask trim 4505 Register VData = MI.getOperand(1).getReg(); 4506 LLT Ty = MRI->getType(VData); 4507 if (!Ty.isVector() || Ty.getElementType() != S16) 4508 return true; 4509 4510 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4511 if (RepackedReg != VData) { 4512 MI.getOperand(1).setReg(RepackedReg); 4513 } 4514 4515 return true; 4516 } 4517 4518 Register DstReg = MI.getOperand(0).getReg(); 4519 LLT Ty = MRI->getType(DstReg); 4520 const LLT EltTy = Ty.getScalarType(); 4521 const bool IsD16 = Ty.getScalarType() == S16; 4522 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4523 4524 // Confirm that the return type is large enough for the dmask specified 4525 if (NumElts < DMaskLanes) 4526 return false; 4527 4528 if (NumElts > 4 || DMaskLanes > 4) 4529 return false; 4530 4531 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4532 const LLT AdjustedTy = 4533 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4534 4535 // The raw dword aligned data component of the load. The only legal cases 4536 // where this matters should be when using the packed D16 format, for 4537 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4538 LLT RoundedTy; 4539 4540 // S32 vector to cover all data, plus TFE result element. 4541 LLT TFETy; 4542 4543 // Register type to use for each loaded component. Will be S32 or V2S16. 4544 LLT RegTy; 4545 4546 if (IsD16 && ST.hasUnpackedD16VMem()) { 4547 RoundedTy = 4548 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4549 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4550 RegTy = S32; 4551 } else { 4552 unsigned EltSize = EltTy.getSizeInBits(); 4553 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4554 unsigned RoundedSize = 32 * RoundedElts; 4555 RoundedTy = LLT::scalarOrVector( 4556 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4557 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4558 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4559 } 4560 4561 // The return type does not need adjustment. 4562 // TODO: Should we change s16 case to s32 or <2 x s16>? 4563 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4564 return true; 4565 4566 Register Dst1Reg; 4567 4568 // Insert after the instruction. 4569 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4570 4571 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4572 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4573 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4574 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4575 4576 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4577 4578 MI.getOperand(0).setReg(NewResultReg); 4579 4580 // In the IR, TFE is supposed to be used with a 2 element struct return 4581 // type. The instruction really returns these two values in one contiguous 4582 // register, with one additional dword beyond the loaded data. Rewrite the 4583 // return type to use a single register result. 4584 4585 if (IsTFE) { 4586 Dst1Reg = MI.getOperand(1).getReg(); 4587 if (MRI->getType(Dst1Reg) != S32) 4588 return false; 4589 4590 // TODO: Make sure the TFE operand bit is set. 4591 MI.RemoveOperand(1); 4592 4593 // Handle the easy case that requires no repack instructions. 4594 if (Ty == S32) { 4595 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4596 return true; 4597 } 4598 } 4599 4600 // Now figure out how to copy the new result register back into the old 4601 // result.
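  // This is done by unmerging NewResultReg into per-dword (or v2s16) pieces,
  // dropping the trailing TFE dword if present, then truncating/bitcasting
  // the pieces and concatenating them back into the original result type,
  // padding with undef for any elements the dmask eliminated.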
4602 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4603 4604 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4605 4606 if (ResultNumRegs == 1) { 4607 assert(!IsTFE); 4608 ResultRegs[0] = NewResultReg; 4609 } else { 4610 // We have to repack into a new vector of some kind. 4611 for (int I = 0; I != NumDataRegs; ++I) 4612 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4613 B.buildUnmerge(ResultRegs, NewResultReg); 4614 4615 // Drop the final TFE element to get the data part. The TFE result is 4616 // directly written to the right place already. 4617 if (IsTFE) 4618 ResultRegs.resize(NumDataRegs); 4619 } 4620 4621 // For an s16 scalar result, we form an s32 result with a truncate regardless 4622 // of packed vs. unpacked. 4623 if (IsD16 && !Ty.isVector()) { 4624 B.buildTrunc(DstReg, ResultRegs[0]); 4625 return true; 4626 } 4627 4628 // Avoid a build/concat_vector of 1 entry. 4629 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4630 B.buildBitcast(DstReg, ResultRegs[0]); 4631 return true; 4632 } 4633 4634 assert(Ty.isVector()); 4635 4636 if (IsD16) { 4637 // For packed D16 results with TFE enabled, all the data components are 4638 // S32. Cast back to the expected type. 4639 // 4640 // TODO: We don't really need to use load s32 elements. We would only need one 4641 // cast for the TFE result if a multiple of v2s16 was used. 4642 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4643 for (Register &Reg : ResultRegs) 4644 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4645 } else if (ST.hasUnpackedD16VMem()) { 4646 for (Register &Reg : ResultRegs) 4647 Reg = B.buildTrunc(S16, Reg).getReg(0); 4648 } 4649 } 4650 4651 auto padWithUndef = [&](LLT Ty, int NumElts) { 4652 if (NumElts == 0) 4653 return; 4654 Register Undef = B.buildUndef(Ty).getReg(0); 4655 for (int I = 0; I != NumElts; ++I) 4656 ResultRegs.push_back(Undef); 4657 }; 4658 4659 // Pad out any elements eliminated due to the dmask. 4660 LLT ResTy = MRI->getType(ResultRegs[0]); 4661 if (!ResTy.isVector()) { 4662 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4663 B.buildBuildVector(DstReg, ResultRegs); 4664 return true; 4665 } 4666 4667 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4668 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4669 4670 // Deal with the one annoying legal case. 4671 const LLT V3S16 = LLT::fixed_vector(3, 16); 4672 if (Ty == V3S16) { 4673 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4674 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4675 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4676 return true; 4677 } 4678 4679 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4680 B.buildConcatVectors(DstReg, ResultRegs); 4681 return true; 4682 } 4683 4684 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4685 LegalizerHelper &Helper, MachineInstr &MI) const { 4686 MachineIRBuilder &B = Helper.MIRBuilder; 4687 GISelChangeObserver &Observer = Helper.Observer; 4688 4689 Register Dst = MI.getOperand(0).getReg(); 4690 LLT Ty = B.getMRI()->getType(Dst); 4691 unsigned Size = Ty.getSizeInBits(); 4692 MachineFunction &MF = B.getMF(); 4693 4694 Observer.changingInstr(MI); 4695 4696 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4697 Ty = getBitcastRegisterType(Ty); 4698 Helper.bitcastDst(MI, Ty, 0); 4699 Dst = MI.getOperand(0).getReg(); 4700 B.setInsertPt(B.getMBB(), MI); 4701 } 4702 4703 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4704 // should be fixed to have a memory operand. Since it's readnone, we're not 4705 // allowed to add one. 4706 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4707 MI.RemoveOperand(1); // Remove intrinsic ID 4708 4709 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4710 // TODO: Should this use datalayout alignment? 4711 const unsigned MemSize = (Size + 7) / 8; 4712 const Align MemAlign(4); 4713 MachineMemOperand *MMO = MF.getMachineMemOperand( 4714 MachinePointerInfo(), 4715 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4716 MachineMemOperand::MOInvariant, 4717 MemSize, MemAlign); 4718 MI.addMemOperand(MF, MMO); 4719 4720 // There are no 96-bit result scalar loads, but widening to 128-bit should 4721 // always be legal. We may need to restore this to a 96-bit result if it turns 4722 // out this needs to be converted to a vector load during RegBankSelect. 4723 if (!isPowerOf2_32(Size)) { 4724 if (Ty.isVector()) 4725 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4726 else 4727 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4728 } 4729 4730 Observer.changedInstr(MI); 4731 return true; 4732 } 4733 4734 // TODO: Move to selection 4735 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4736 MachineRegisterInfo &MRI, 4737 MachineIRBuilder &B) const { 4738 if (!ST.isTrapHandlerEnabled() || 4739 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4740 return legalizeTrapEndpgm(MI, MRI, B); 4741 4742 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4743 switch (*HsaAbiVer) { 4744 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4745 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4746 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4747 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4748 return ST.supportsGetDoorbellID() ? 4749 legalizeTrapHsa(MI, MRI, B) : 4750 legalizeTrapHsaQueuePtr(MI, MRI, B); 4751 } 4752 } 4753 4754 llvm_unreachable("Unknown trap handler"); 4755 } 4756 4757 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4758 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4759 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4760 MI.eraseFromParent(); 4761 return true; 4762 } 4763 4764 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4765 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4766 // Pass queue pointer to trap handler as input, and insert trap instruction 4767 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4768 Register LiveIn = 4769 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4770 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4771 return false; 4772 4773 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4774 B.buildCopy(SGPR01, LiveIn); 4775 B.buildInstr(AMDGPU::S_TRAP) 4776 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4777 .addReg(SGPR01, RegState::Implicit); 4778 4779 MI.eraseFromParent(); 4780 return true; 4781 } 4782 4783 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4784 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4785 B.buildInstr(AMDGPU::S_TRAP) 4786 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4787 MI.eraseFromParent(); 4788 return true; 4789 } 4790 4791 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4792 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4793 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4794 // accordingly 4795 if (!ST.isTrapHandlerEnabled() || 4796 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4797 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4798 "debugtrap handler not supported", 4799 MI.getDebugLoc(), DS_Warning); 4800 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4801 Ctx.diagnose(NoTrap); 4802 } else { 4803 // Insert debug-trap instruction 4804 B.buildInstr(AMDGPU::S_TRAP) 4805 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4806 } 4807 4808 MI.eraseFromParent(); 4809 return true; 4810 } 4811 4812 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4813 MachineIRBuilder &B) const { 4814 MachineRegisterInfo &MRI = *B.getMRI(); 4815 const LLT S16 = LLT::scalar(16); 4816 const LLT S32 = LLT::scalar(32); 4817 4818 Register DstReg = MI.getOperand(0).getReg(); 4819 Register NodePtr = MI.getOperand(2).getReg(); 4820 Register RayExtent = MI.getOperand(3).getReg(); 4821 Register RayOrigin = MI.getOperand(4).getReg(); 4822 Register RayDir = MI.getOperand(5).getReg(); 4823 Register RayInvDir = MI.getOperand(6).getReg(); 4824 Register TDescr = MI.getOperand(7).getReg(); 4825 4826 if (!ST.hasGFX10_AEncoding()) { 4827 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4828 "intrinsic not supported on subtarget", 4829 MI.getDebugLoc()); 4830 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4831 return false; 4832 } 4833 4834 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4835 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4836 const unsigned NumVDataDwords = 4; 4837 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4838 const bool UseNSA = 4839 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4840 const unsigned BaseOpcodes[2][2] = { 4841 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4842 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4843 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4844 int Opcode; 4845 if (UseNSA) { 4846 Opcode = 4847 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4848 NumVDataDwords, NumVAddrDwords); 4849 } else { 4850 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4851 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4852 PowerOf2Ceil(NumVAddrDwords)); 4853 } 4854 assert(Opcode != -1); 4855 4856 SmallVector<Register, 12> Ops; 4857 if (Is64) { 4858 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4859 Ops.push_back(Unmerge.getReg(0)); 4860 Ops.push_back(Unmerge.getReg(1)); 4861 } else { 4862 Ops.push_back(NodePtr); 4863 } 4864 Ops.push_back(RayExtent); 4865 4866 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4867 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4868 Ops.push_back(Unmerge.getReg(0)); 4869 Ops.push_back(Unmerge.getReg(1)); 4870 Ops.push_back(Unmerge.getReg(2)); 4871 }; 4872 4873 packLanes(RayOrigin); 4874 if (IsA16) { 4875 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4876 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4877 Register R1 = MRI.createGenericVirtualRegister(S32); 4878 Register R2 = MRI.createGenericVirtualRegister(S32); 4879 Register R3 = MRI.createGenericVirtualRegister(S32); 4880 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4881 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4882 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4883 
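    // For A16 rays the six f16 direction and inverse-direction components are
    // repacked into three dwords: (dir.x, dir.y), (dir.z, invdir.x) and
    // (invdir.y, invdir.z), matching the merges built above.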
Ops.push_back(R1); 4884 Ops.push_back(R2); 4885 Ops.push_back(R3); 4886 } else { 4887 packLanes(RayDir); 4888 packLanes(RayInvDir); 4889 } 4890 4891 if (!UseNSA) { 4892 // Build a single vector containing all the operands so far prepared. 4893 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4894 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4895 Ops.clear(); 4896 Ops.push_back(MergedOps); 4897 } 4898 4899 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4900 .addDef(DstReg) 4901 .addImm(Opcode); 4902 4903 for (Register R : Ops) { 4904 MIB.addUse(R); 4905 } 4906 4907 MIB.addUse(TDescr) 4908 .addImm(IsA16 ? 1 : 0) 4909 .cloneMemRefs(MI); 4910 4911 MI.eraseFromParent(); 4912 return true; 4913 } 4914 4915 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4916 MachineInstr &MI) const { 4917 MachineIRBuilder &B = Helper.MIRBuilder; 4918 MachineRegisterInfo &MRI = *B.getMRI(); 4919 4920 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4921 auto IntrID = MI.getIntrinsicID(); 4922 switch (IntrID) { 4923 case Intrinsic::amdgcn_if: 4924 case Intrinsic::amdgcn_else: { 4925 MachineInstr *Br = nullptr; 4926 MachineBasicBlock *UncondBrTarget = nullptr; 4927 bool Negated = false; 4928 if (MachineInstr *BrCond = 4929 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4930 const SIRegisterInfo *TRI 4931 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4932 4933 Register Def = MI.getOperand(1).getReg(); 4934 Register Use = MI.getOperand(3).getReg(); 4935 4936 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4937 4938 if (Negated) 4939 std::swap(CondBrTarget, UncondBrTarget); 4940 4941 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4942 if (IntrID == Intrinsic::amdgcn_if) { 4943 B.buildInstr(AMDGPU::SI_IF) 4944 .addDef(Def) 4945 .addUse(Use) 4946 .addMBB(UncondBrTarget); 4947 } else { 4948 B.buildInstr(AMDGPU::SI_ELSE) 4949 .addDef(Def) 4950 .addUse(Use) 4951 .addMBB(UncondBrTarget); 4952 } 4953 4954 if (Br) { 4955 Br->getOperand(0).setMBB(CondBrTarget); 4956 } else { 4957 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4958 // since we're swapping branch targets it needs to be reinserted. 
4959 // FIXME: IRTranslator should probably not do this 4960 B.buildBr(*CondBrTarget); 4961 } 4962 4963 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4964 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4965 MI.eraseFromParent(); 4966 BrCond->eraseFromParent(); 4967 return true; 4968 } 4969 4970 return false; 4971 } 4972 case Intrinsic::amdgcn_loop: { 4973 MachineInstr *Br = nullptr; 4974 MachineBasicBlock *UncondBrTarget = nullptr; 4975 bool Negated = false; 4976 if (MachineInstr *BrCond = 4977 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4978 const SIRegisterInfo *TRI 4979 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4980 4981 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4982 Register Reg = MI.getOperand(2).getReg(); 4983 4984 if (Negated) 4985 std::swap(CondBrTarget, UncondBrTarget); 4986 4987 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4988 B.buildInstr(AMDGPU::SI_LOOP) 4989 .addUse(Reg) 4990 .addMBB(UncondBrTarget); 4991 4992 if (Br) 4993 Br->getOperand(0).setMBB(CondBrTarget); 4994 else 4995 B.buildBr(*CondBrTarget); 4996 4997 MI.eraseFromParent(); 4998 BrCond->eraseFromParent(); 4999 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5000 return true; 5001 } 5002 5003 return false; 5004 } 5005 case Intrinsic::amdgcn_kernarg_segment_ptr: 5006 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5007 // This only makes sense to call in a kernel, so just lower to null. 5008 B.buildConstant(MI.getOperand(0).getReg(), 0); 5009 MI.eraseFromParent(); 5010 return true; 5011 } 5012 5013 return legalizePreloadedArgIntrin( 5014 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 5015 case Intrinsic::amdgcn_implicitarg_ptr: 5016 return legalizeImplicitArgPtr(MI, MRI, B); 5017 case Intrinsic::amdgcn_workitem_id_x: 5018 return legalizePreloadedArgIntrin(MI, MRI, B, 5019 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 5020 case Intrinsic::amdgcn_workitem_id_y: 5021 return legalizePreloadedArgIntrin(MI, MRI, B, 5022 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 5023 case Intrinsic::amdgcn_workitem_id_z: 5024 return legalizePreloadedArgIntrin(MI, MRI, B, 5025 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 5026 case Intrinsic::amdgcn_workgroup_id_x: 5027 return legalizePreloadedArgIntrin(MI, MRI, B, 5028 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 5029 case Intrinsic::amdgcn_workgroup_id_y: 5030 return legalizePreloadedArgIntrin(MI, MRI, B, 5031 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 5032 case Intrinsic::amdgcn_workgroup_id_z: 5033 return legalizePreloadedArgIntrin(MI, MRI, B, 5034 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5035 case Intrinsic::amdgcn_dispatch_ptr: 5036 return legalizePreloadedArgIntrin(MI, MRI, B, 5037 AMDGPUFunctionArgInfo::DISPATCH_PTR); 5038 case Intrinsic::amdgcn_queue_ptr: 5039 return legalizePreloadedArgIntrin(MI, MRI, B, 5040 AMDGPUFunctionArgInfo::QUEUE_PTR); 5041 case Intrinsic::amdgcn_implicit_buffer_ptr: 5042 return legalizePreloadedArgIntrin( 5043 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 5044 case Intrinsic::amdgcn_dispatch_id: 5045 return legalizePreloadedArgIntrin(MI, MRI, B, 5046 AMDGPUFunctionArgInfo::DISPATCH_ID); 5047 case Intrinsic::amdgcn_fdiv_fast: 5048 return legalizeFDIVFastIntrin(MI, MRI, B); 5049 case Intrinsic::amdgcn_is_shared: 5050 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 5051 case Intrinsic::amdgcn_is_private: 5052 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 5053 case Intrinsic::amdgcn_wavefrontsize: { 5054 
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
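  // Ray-tracing BVH intersection: the ray operands are repacked (and merged
  // into a single wide register when the NSA encoding isn't used) in
  // legalizeBVHIntrinsic above.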
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}