//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
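/// For example, <3 x s16> occupies 48 bits and is widened to <4 x s16>, which
/// fills exactly two 32-bit registers.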
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
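    // 128 bits (a dwordx4 access) for the remaining address spaces.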
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
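  // Otherwise, bitcast non-extending vector accesses whose element type cannot
  // live directly in registers (e.g. <4 x s8>), as long as the total size is
  // still a register size.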
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElements(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElements(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
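    // (Both arrived with VI, so has16BitInsts() is a sufficient guard here.)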
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalarOrElt(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 // TODO: .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 
659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElements(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElements(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
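  // The custom action for this is legalizeFMad, which consults the function's
  // denormal mode before deciding whether to keep or expand the G_FMAD.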
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 // Lower roundeven into G_FRINT 843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 844 .scalarize(0) 845 .lower(); 846 847 if (ST.has16BitInsts()) { 848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 849 .legalFor({S16, S32, S64}) 850 .clampScalar(0, S16, S64) 851 .scalarize(0); 852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S32, S64}) 855 .clampScalar(0, S32, S64) 856 .scalarize(0); 857 } else { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32}) 860 .customFor({S64}) 861 .clampScalar(0, S32, S64) 862 .scalarize(0); 863 } 864 865 getActionDefinitionsBuilder(G_PTR_ADD) 866 .legalIf(all(isPointer(0), sameSize(0, 1))) 867 .scalarize(0) 868 .scalarSameSizeAs(1, 0); 869 870 getActionDefinitionsBuilder(G_PTRMASK) 871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 872 .scalarSameSizeAs(1, 0) 873 .scalarize(0); 874 875 auto &CmpBuilder = 876 getActionDefinitionsBuilder(G_ICMP) 877 // The compare output type differs based on the register bank of the output, 878 // so make both s1 and s32 legal. 879 // 880 // Scalar compares producing output in scc will be promoted to s32, as that 881 // is the allocatable register type that will be needed for the copy from 882 // scc. 
This will be promoted during RegBankSelect, and we assume something 883 // before that won't try to use s32 result types. 884 // 885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 886 // bank. 887 .legalForCartesianProduct( 888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 889 .legalForCartesianProduct( 890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 891 if (ST.has16BitInsts()) { 892 CmpBuilder.legalFor({{S1, S16}}); 893 } 894 895 CmpBuilder 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0) 899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 900 901 getActionDefinitionsBuilder(G_FCMP) 902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 903 .widenScalarToNextPow2(1) 904 .clampScalar(1, S32, S64) 905 .scalarize(0); 906 907 // FIXME: fpow has a selection pattern that should move to custom lowering. 908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 909 if (ST.has16BitInsts()) 910 Exp2Ops.legalFor({S32, S16}); 911 else 912 Exp2Ops.legalFor({S32}); 913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 914 Exp2Ops.scalarize(0); 915 916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 917 if (ST.has16BitInsts()) 918 ExpOps.customFor({{S32}, {S16}}); 919 else 920 ExpOps.customFor({S32}); 921 ExpOps.clampScalar(0, MinScalarFPTy, S32) 922 .scalarize(0); 923 924 getActionDefinitionsBuilder(G_FPOWI) 925 .clampScalar(0, MinScalarFPTy, S32) 926 .lower(); 927 928 // The 64-bit versions produce 32-bit results, but only on the SALU. 929 getActionDefinitionsBuilder(G_CTPOP) 930 .legalFor({{S32, S32}, {S32, S64}}) 931 .clampScalar(0, S32, S32) 932 .clampScalar(1, S32, S64) 933 .scalarize(0) 934 .widenScalarToNextPow2(0, 32) 935 .widenScalarToNextPow2(1, 32); 936 937 // The hardware instructions return a different result on 0 than the generic 938 // instructions expect. The hardware produces -1, but these produce the 939 // bitwidth. 940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 941 .scalarize(0) 942 .clampScalar(0, S32, S32) 943 .clampScalar(1, S32, S64) 944 .widenScalarToNextPow2(0, 32) 945 .widenScalarToNextPow2(1, 32) 946 .custom(); 947 948 // The 64-bit versions produce 32-bit results, but only on the SALU. 949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 950 .legalFor({{S32, S32}, {S32, S64}}) 951 .clampScalar(0, S32, S32) 952 .clampScalar(1, S32, S64) 953 .scalarize(0) 954 .widenScalarToNextPow2(0, 32) 955 .widenScalarToNextPow2(1, 32); 956 957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 958 // RegBankSelect. 959 getActionDefinitionsBuilder(G_BITREVERSE) 960 .legalFor({S32, S64}) 961 .clampScalar(0, S32, S64) 962 .scalarize(0) 963 .widenScalarToNextPow2(0); 964 965 if (ST.has16BitInsts()) { 966 getActionDefinitionsBuilder(G_BSWAP) 967 .legalFor({S16, S32, V2S16}) 968 .clampMaxNumElements(0, S16, 2) 969 // FIXME: Fixing non-power-of-2 before clamp is workaround for 970 // narrowScalar limitation. 
971 .widenScalarToNextPow2(0) 972 .clampScalar(0, S16, S32) 973 .scalarize(0); 974 975 if (ST.hasVOP3PInsts()) { 976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 977 .legalFor({S32, S16, V2S16}) 978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 979 .clampMaxNumElements(0, S16, 2) 980 .minScalar(0, S16) 981 .widenScalarToNextPow2(0) 982 .scalarize(0) 983 .lower(); 984 } else { 985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 986 .legalFor({S32, S16}) 987 .widenScalarToNextPow2(0) 988 .minScalar(0, S16) 989 .scalarize(0) 990 .lower(); 991 } 992 } else { 993 // TODO: Should have same legality without v_perm_b32 994 getActionDefinitionsBuilder(G_BSWAP) 995 .legalFor({S32}) 996 .lowerIf(scalarNarrowerThan(0, 32)) 997 // FIXME: Fixing non-power-of-2 before clamp is workaround for 998 // narrowScalar limitation. 999 .widenScalarToNextPow2(0) 1000 .maxScalar(0, S32) 1001 .scalarize(0) 1002 .lower(); 1003 1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1005 .legalFor({S32}) 1006 .minScalar(0, S32) 1007 .widenScalarToNextPow2(0) 1008 .scalarize(0) 1009 .lower(); 1010 } 1011 1012 getActionDefinitionsBuilder(G_INTTOPTR) 1013 // List the common cases 1014 .legalForCartesianProduct(AddrSpaces64, {S64}) 1015 .legalForCartesianProduct(AddrSpaces32, {S32}) 1016 .scalarize(0) 1017 // Accept any address space as long as the size matches 1018 .legalIf(sameSize(0, 1)) 1019 .widenScalarIf(smallerThan(1, 0), 1020 [](const LegalityQuery &Query) { 1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1022 }) 1023 .narrowScalarIf(largerThan(1, 0), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_PTRTOINT) 1029 // List the common cases 1030 .legalForCartesianProduct(AddrSpaces64, {S64}) 1031 .legalForCartesianProduct(AddrSpaces32, {S32}) 1032 .scalarize(0) 1033 // Accept any address space as long as the size matches 1034 .legalIf(sameSize(0, 1)) 1035 .widenScalarIf(smallerThan(0, 1), 1036 [](const LegalityQuery &Query) { 1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1038 }) 1039 .narrowScalarIf( 1040 largerThan(0, 1), 1041 [](const LegalityQuery &Query) { 1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1043 }); 1044 1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1046 .scalarize(0) 1047 .custom(); 1048 1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1050 bool IsLoad) -> bool { 1051 const LLT DstTy = Query.Types[0]; 1052 1053 // Split vector extloads. 1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1055 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1056 1057 if (MemSize < DstTy.getSizeInBits()) 1058 MemSize = std::max(MemSize, AlignBits); 1059 1060 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1061 return true; 1062 1063 const LLT PtrTy = Query.Types[1]; 1064 unsigned AS = PtrTy.getAddressSpace(); 1065 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1066 return true; 1067 1068 // Catch weird sized loads that don't evenly divide into the access sizes 1069 // TODO: May be able to widen depending on alignment etc. 1070 unsigned NumRegs = (MemSize + 31) / 32; 1071 if (NumRegs == 3) { 1072 if (!ST.hasDwordx3LoadStores()) 1073 return true; 1074 } else { 1075 // If the alignment allows, these should have been widened. 
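      // (See shouldWidenLoad above.)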
1076 if (!isPowerOf2_32(NumRegs)) 1077 return true; 1078 } 1079 1080 if (AlignBits < MemSize) { 1081 const SITargetLowering *TLI = ST.getTargetLowering(); 1082 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1083 Align(AlignBits / 8)); 1084 } 1085 1086 return false; 1087 }; 1088 1089 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1090 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1091 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1092 1093 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1094 // LDS 1095 // TODO: Unsupported flat for SI. 1096 1097 for (unsigned Op : {G_LOAD, G_STORE}) { 1098 const bool IsStore = Op == G_STORE; 1099 1100 auto &Actions = getActionDefinitionsBuilder(Op); 1101 // Explicitly list some common cases. 1102 // TODO: Does this help compile time at all? 1103 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1104 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1105 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1106 {S64, GlobalPtr, S64, GlobalAlign32}, 1107 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1108 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1109 {S32, GlobalPtr, S8, GlobalAlign8}, 1110 {S32, GlobalPtr, S16, GlobalAlign16}, 1111 1112 {S32, LocalPtr, S32, 32}, 1113 {S64, LocalPtr, S64, 32}, 1114 {V2S32, LocalPtr, V2S32, 32}, 1115 {S32, LocalPtr, S8, 8}, 1116 {S32, LocalPtr, S16, 16}, 1117 {V2S16, LocalPtr, S32, 32}, 1118 1119 {S32, PrivatePtr, S32, 32}, 1120 {S32, PrivatePtr, S8, 8}, 1121 {S32, PrivatePtr, S16, 16}, 1122 {V2S16, PrivatePtr, S32, 32}, 1123 1124 {S32, ConstantPtr, S32, GlobalAlign32}, 1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1126 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1127 {S64, ConstantPtr, S64, GlobalAlign32}, 1128 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1129 Actions.legalIf( 1130 [=](const LegalityQuery &Query) -> bool { 1131 return isLoadStoreLegal(ST, Query); 1132 }); 1133 1134 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1135 // 64-bits. 1136 // 1137 // TODO: Should generalize bitcast action into coerce, which will also cover 1138 // inserting addrspacecasts. 1139 Actions.customIf(typeIs(1, Constant32Ptr)); 1140 1141 // Turn any illegal element vectors into something easier to deal 1142 // with. These will ultimately produce 32-bit scalar shifts to extract the 1143 // parts anyway. 1144 // 1145 // For odd 16-bit element vectors, prefer to split those into pieces with 1146 // 16-bit vector parts. 1147 Actions.bitcastIf( 1148 [=](const LegalityQuery &Query) -> bool { 1149 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1150 Query.MMODescrs[0].MemoryTy); 1151 }, bitcastToRegisterType(0)); 1152 1153 if (!IsStore) { 1154 // Widen suitably aligned loads by loading extra bytes. The standard 1155 // legalization actions can't properly express widening memory operands. 
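      // The widening itself happens in the custom lowering (see legalizeLoad).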
1156 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1157 return shouldWidenLoad(ST, Query, G_LOAD); 1158 }); 1159 } 1160 1161 // FIXME: load/store narrowing should be moved to lower action 1162 Actions 1163 .narrowScalarIf( 1164 [=](const LegalityQuery &Query) -> bool { 1165 return !Query.Types[0].isVector() && 1166 needToSplitMemOp(Query, Op == G_LOAD); 1167 }, 1168 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1169 const LLT DstTy = Query.Types[0]; 1170 const LLT PtrTy = Query.Types[1]; 1171 1172 const unsigned DstSize = DstTy.getSizeInBits(); 1173 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1174 1175 // Split extloads. 1176 if (DstSize > MemSize) 1177 return std::make_pair(0, LLT::scalar(MemSize)); 1178 1179 if (!isPowerOf2_32(DstSize)) { 1180 // We're probably decomposing an odd sized store. Try to split 1181 // to the widest type. TODO: Account for alignment. As-is it 1182 // should be OK, since the new parts will be further legalized. 1183 unsigned FloorSize = PowerOf2Floor(DstSize); 1184 return std::make_pair(0, LLT::scalar(FloorSize)); 1185 } 1186 1187 if (DstSize > 32 && (DstSize % 32 != 0)) { 1188 // FIXME: Need a way to specify non-extload of larger size if 1189 // suitably aligned. 1190 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1191 } 1192 1193 unsigned MaxSize = maxSizeForAddrSpace(ST, 1194 PtrTy.getAddressSpace(), 1195 Op == G_LOAD); 1196 if (MemSize > MaxSize) 1197 return std::make_pair(0, LLT::scalar(MaxSize)); 1198 1199 unsigned Align = Query.MMODescrs[0].AlignInBits; 1200 return std::make_pair(0, LLT::scalar(Align)); 1201 }) 1202 .fewerElementsIf( 1203 [=](const LegalityQuery &Query) -> bool { 1204 return Query.Types[0].isVector() && 1205 needToSplitMemOp(Query, Op == G_LOAD); 1206 }, 1207 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1208 const LLT DstTy = Query.Types[0]; 1209 const LLT PtrTy = Query.Types[1]; 1210 1211 LLT EltTy = DstTy.getElementType(); 1212 unsigned MaxSize = maxSizeForAddrSpace(ST, 1213 PtrTy.getAddressSpace(), 1214 Op == G_LOAD); 1215 1216 // FIXME: Handle widened to power of 2 results better. This ends 1217 // up scalarizing. 1218 // FIXME: 3 element stores scalarized on SI 1219 1220 // Split if it's too large for the address space. 1221 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1222 if (MemSize > MaxSize) { 1223 unsigned NumElts = DstTy.getNumElements(); 1224 unsigned EltSize = EltTy.getSizeInBits(); 1225 1226 if (MaxSize % EltSize == 0) { 1227 return std::make_pair( 1228 0, LLT::scalarOrVector( 1229 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1230 } 1231 1232 unsigned NumPieces = MemSize / MaxSize; 1233 1234 // FIXME: Refine when odd breakdowns handled 1235 // The scalars will need to be re-legalized. 1236 if (NumPieces == 1 || NumPieces >= NumElts || 1237 NumElts % NumPieces != 0) 1238 return std::make_pair(0, EltTy); 1239 1240 return std::make_pair( 1241 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1242 } 1243 1244 // FIXME: We could probably handle weird extending loads better. 1245 if (DstTy.getSizeInBits() > MemSize) 1246 return std::make_pair(0, EltTy); 1247 1248 unsigned EltSize = EltTy.getSizeInBits(); 1249 unsigned DstSize = DstTy.getSizeInBits(); 1250 if (!isPowerOf2_32(DstSize)) { 1251 // We're probably decomposing an odd sized store. Try to split 1252 // to the widest type. TODO: Account for alignment. As-is it 1253 // should be OK, since the new parts will be further legalized. 
1254 unsigned FloorSize = PowerOf2Floor(DstSize); 1255 return std::make_pair( 1256 0, LLT::scalarOrVector( 1257 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1258 } 1259 1260 // Need to split because of alignment. 1261 unsigned Align = Query.MMODescrs[0].AlignInBits; 1262 if (EltSize > Align && 1263 (EltSize / Align < DstTy.getNumElements())) { 1264 return std::make_pair( 1265 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1266 } 1267 1268 // May need relegalization for the scalars. 1269 return std::make_pair(0, EltTy); 1270 }) 1271 .minScalar(0, S32) 1272 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1273 .widenScalarToNextPow2(0) 1274 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1275 .lower(); 1276 } 1277 1278 // FIXME: Unaligned accesses not lowered. 1279 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1280 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1281 {S32, GlobalPtr, S16, 2 * 8}, 1282 {S32, LocalPtr, S8, 8}, 1283 {S32, LocalPtr, S16, 16}, 1284 {S32, PrivatePtr, S8, 8}, 1285 {S32, PrivatePtr, S16, 16}, 1286 {S32, ConstantPtr, S8, 8}, 1287 {S32, ConstantPtr, S16, 2 * 8}}) 1288 .legalIf( 1289 [=](const LegalityQuery &Query) -> bool { 1290 return isLoadStoreLegal(ST, Query); 1291 }); 1292 1293 if (ST.hasFlatAddressSpace()) { 1294 ExtLoads.legalForTypesWithMemDesc( 1295 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1296 } 1297 1298 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1299 // 64-bits. 1300 // 1301 // TODO: Should generalize bitcast action into coerce, which will also cover 1302 // inserting addrspacecasts. 1303 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1304 1305 ExtLoads.clampScalar(0, S32, S32) 1306 .widenScalarToNextPow2(0) 1307 .lower(); 1308 1309 auto &Atomics = getActionDefinitionsBuilder( 1310 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1311 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1312 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1313 G_ATOMICRMW_UMIN}) 1314 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1315 {S64, GlobalPtr}, {S64, LocalPtr}, 1316 {S32, RegionPtr}, {S64, RegionPtr}}); 1317 if (ST.hasFlatAddressSpace()) { 1318 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1319 } 1320 1321 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1322 if (ST.hasLDSFPAtomicAdd()) { 1323 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1324 if (ST.hasGFX90AInsts()) 1325 Atomic.legalFor({{S64, LocalPtr}}); 1326 } 1327 if (ST.hasAtomicFaddInsts()) 1328 Atomic.legalFor({{S32, GlobalPtr}}); 1329 1330 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1331 // demarshalling 1332 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1333 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1334 {S32, FlatPtr}, {S64, FlatPtr}}) 1335 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1336 {S32, RegionPtr}, {S64, RegionPtr}}); 1337 // TODO: Pointer types, any 32-bit or 64-bit vector 1338 1339 // Condition should be s32 for scalar, s1 for vector. 
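  // This mirrors the G_ICMP handling above: SCC-based (scalar) conditions are
  // promoted to s32, while VCC (vector) conditions use s1.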
1340 getActionDefinitionsBuilder(G_SELECT) 1341 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1342 LocalPtr, FlatPtr, PrivatePtr, 1343 LLT::fixed_vector(2, LocalPtr), 1344 LLT::fixed_vector(2, PrivatePtr)}, 1345 {S1, S32}) 1346 .clampScalar(0, S16, S64) 1347 .scalarize(1) 1348 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1349 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1350 .clampMaxNumElements(0, S32, 2) 1351 .clampMaxNumElements(0, LocalPtr, 2) 1352 .clampMaxNumElements(0, PrivatePtr, 2) 1353 .scalarize(0) 1354 .widenScalarToNextPow2(0) 1355 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1356 1357 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1358 // be more flexible with the shift amount type. 1359 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1360 .legalFor({{S32, S32}, {S64, S32}}); 1361 if (ST.has16BitInsts()) { 1362 if (ST.hasVOP3PInsts()) { 1363 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1364 .clampMaxNumElements(0, S16, 2); 1365 } else 1366 Shifts.legalFor({{S16, S16}}); 1367 1368 // TODO: Support 16-bit shift amounts for all types 1369 Shifts.widenScalarIf( 1370 [=](const LegalityQuery &Query) { 1371 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1372 // 32-bit amount. 1373 const LLT ValTy = Query.Types[0]; 1374 const LLT AmountTy = Query.Types[1]; 1375 return ValTy.getSizeInBits() <= 16 && 1376 AmountTy.getSizeInBits() < 16; 1377 }, changeTo(1, S16)); 1378 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1379 Shifts.clampScalar(1, S32, S32); 1380 Shifts.clampScalar(0, S16, S64); 1381 Shifts.widenScalarToNextPow2(0, 16); 1382 1383 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1384 .minScalar(0, S16) 1385 .scalarize(0) 1386 .lower(); 1387 } else { 1388 // Make sure we legalize the shift amount type first, as the general 1389 // expansion for the shifted type will produce much worse code if it hasn't 1390 // been truncated already. 1391 Shifts.clampScalar(1, S32, S32); 1392 Shifts.clampScalar(0, S32, S64); 1393 Shifts.widenScalarToNextPow2(0, 32); 1394 1395 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1396 .minScalar(0, S32) 1397 .scalarize(0) 1398 .lower(); 1399 } 1400 Shifts.scalarize(0); 1401 1402 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1403 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1404 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1405 unsigned IdxTypeIdx = 2; 1406 1407 getActionDefinitionsBuilder(Op) 1408 .customIf([=](const LegalityQuery &Query) { 1409 const LLT EltTy = Query.Types[EltTypeIdx]; 1410 const LLT VecTy = Query.Types[VecTypeIdx]; 1411 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1412 const unsigned EltSize = EltTy.getSizeInBits(); 1413 return (EltSize == 32 || EltSize == 64) && 1414 VecTy.getSizeInBits() % 32 == 0 && 1415 VecTy.getSizeInBits() <= MaxRegisterSize && 1416 IdxTy.getSizeInBits() == 32; 1417 }) 1418 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1419 bitcastToVectorElement32(VecTypeIdx)) 1420 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1421 .bitcastIf( 1422 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1423 [=](const LegalityQuery &Query) { 1424 // For > 64-bit element types, try to turn this into a 64-bit 1425 // element vector since we may be able to do better indexing 1426 // if this is scalar. If not, fall back to 32. 
                   const LLT EltTy = Query.Types[EltTypeIdx];
                   const LLT VecTy = Query.Types[VecTypeIdx];
                   const unsigned DstEltSize = EltTy.getSizeInBits();
                   const unsigned VecSize = VecTy.getSizeInBits();

                   const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
                   return std::make_pair(
                     VecTypeIdx,
                     LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
                 })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1522 if (ST.hasVOP3PInsts()) { 1523 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1524 .customFor({V2S16, V2S16}) 1525 .lower(); 1526 } else 1527 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1528 1529 // Merge/Unmerge 1530 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1531 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1532 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1533 1534 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1535 const LLT Ty = Query.Types[TypeIdx]; 1536 if (Ty.isVector()) { 1537 const LLT &EltTy = Ty.getElementType(); 1538 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1539 return true; 1540 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1541 return true; 1542 } 1543 return false; 1544 }; 1545 1546 auto &Builder = getActionDefinitionsBuilder(Op) 1547 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1548 .lowerFor({{S16, V2S16}}) 1549 .lowerIf([=](const LegalityQuery &Query) { 1550 const LLT BigTy = Query.Types[BigTyIdx]; 1551 return BigTy.getSizeInBits() == 32; 1552 }) 1553 // Try to widen to s16 first for small types. 1554 // TODO: Only do this on targets with legal s16 shifts 1555 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1556 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1557 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1558 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1559 elementTypeIs(1, S16)), 1560 changeTo(1, V2S16)) 1561 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1562 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1563 // valid. 1564 .clampScalar(LitTyIdx, S32, S512) 1565 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1566 // Break up vectors with weird elements into scalars 1567 .fewerElementsIf( 1568 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1569 scalarize(0)) 1570 .fewerElementsIf( 1571 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1572 scalarize(1)) 1573 .clampScalar(BigTyIdx, S32, MaxScalar); 1574 1575 if (Op == G_MERGE_VALUES) { 1576 Builder.widenScalarIf( 1577 // TODO: Use 16-bit shifts if legal for 8-bit values? 1578 [=](const LegalityQuery &Query) { 1579 const LLT Ty = Query.Types[LitTyIdx]; 1580 return Ty.getSizeInBits() < 32; 1581 }, 1582 changeTo(LitTyIdx, S32)); 1583 } 1584 1585 Builder.widenScalarIf( 1586 [=](const LegalityQuery &Query) { 1587 const LLT Ty = Query.Types[BigTyIdx]; 1588 return !isPowerOf2_32(Ty.getSizeInBits()) && 1589 Ty.getSizeInBits() % 16 != 0; 1590 }, 1591 [=](const LegalityQuery &Query) { 1592 // Pick the next power of 2, or a multiple of 64 over 128. 1593 // Whichever is smaller. 1594 const LLT &Ty = Query.Types[BigTyIdx]; 1595 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1596 if (NewSizeInBits >= 256) { 1597 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1598 if (RoundedTo < NewSizeInBits) 1599 NewSizeInBits = RoundedTo; 1600 } 1601 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1602 }) 1603 // Any vectors left are the wrong size. Scalarize them. 1604 .scalarize(0) 1605 .scalarize(1); 1606 } 1607 1608 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1609 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case
TargetOpcode::G_FPTOUI: 1724 return legalizeFPTOI(MI, MRI, B, false); 1725 case TargetOpcode::G_FMINNUM: 1726 case TargetOpcode::G_FMAXNUM: 1727 case TargetOpcode::G_FMINNUM_IEEE: 1728 case TargetOpcode::G_FMAXNUM_IEEE: 1729 return legalizeMinNumMaxNum(Helper, MI); 1730 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1731 return legalizeExtractVectorElt(MI, MRI, B); 1732 case TargetOpcode::G_INSERT_VECTOR_ELT: 1733 return legalizeInsertVectorElt(MI, MRI, B); 1734 case TargetOpcode::G_SHUFFLE_VECTOR: 1735 return legalizeShuffleVector(MI, MRI, B); 1736 case TargetOpcode::G_FSIN: 1737 case TargetOpcode::G_FCOS: 1738 return legalizeSinCos(MI, MRI, B); 1739 case TargetOpcode::G_GLOBAL_VALUE: 1740 return legalizeGlobalValue(MI, MRI, B); 1741 case TargetOpcode::G_LOAD: 1742 case TargetOpcode::G_SEXTLOAD: 1743 case TargetOpcode::G_ZEXTLOAD: 1744 return legalizeLoad(Helper, MI); 1745 case TargetOpcode::G_FMAD: 1746 return legalizeFMad(MI, MRI, B); 1747 case TargetOpcode::G_FDIV: 1748 return legalizeFDIV(MI, MRI, B); 1749 case TargetOpcode::G_UDIV: 1750 case TargetOpcode::G_UREM: 1751 case TargetOpcode::G_UDIVREM: 1752 return legalizeUnsignedDIV_REM(MI, MRI, B); 1753 case TargetOpcode::G_SDIV: 1754 case TargetOpcode::G_SREM: 1755 case TargetOpcode::G_SDIVREM: 1756 return legalizeSignedDIV_REM(MI, MRI, B); 1757 case TargetOpcode::G_ATOMIC_CMPXCHG: 1758 return legalizeAtomicCmpXChg(MI, MRI, B); 1759 case TargetOpcode::G_FLOG: 1760 return legalizeFlog(MI, B, numbers::ln2f); 1761 case TargetOpcode::G_FLOG10: 1762 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1763 case TargetOpcode::G_FEXP: 1764 return legalizeFExp(MI, B); 1765 case TargetOpcode::G_FPOW: 1766 return legalizeFPow(MI, B); 1767 case TargetOpcode::G_FFLOOR: 1768 return legalizeFFloor(MI, MRI, B); 1769 case TargetOpcode::G_BUILD_VECTOR: 1770 return legalizeBuildVector(MI, MRI, B); 1771 case TargetOpcode::G_CTLZ: 1772 case TargetOpcode::G_CTTZ: 1773 return legalizeCTLZ_CTTZ(MI, MRI, B); 1774 default: 1775 return false; 1776 } 1777 1778 llvm_unreachable("expected switch to return"); 1779 } 1780 1781 Register AMDGPULegalizerInfo::getSegmentAperture( 1782 unsigned AS, 1783 MachineRegisterInfo &MRI, 1784 MachineIRBuilder &B) const { 1785 MachineFunction &MF = B.getMF(); 1786 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1787 const LLT S32 = LLT::scalar(32); 1788 1789 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1790 1791 if (ST.hasApertureRegs()) { 1792 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1793 // getreg. 1794 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1795 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1796 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1797 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
1798 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1799 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1800 unsigned Encoding = 1801 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1802 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1803 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1804 1805 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1806 1807 B.buildInstr(AMDGPU::S_GETREG_B32) 1808 .addDef(GetReg) 1809 .addImm(Encoding); 1810 MRI.setType(GetReg, S32); 1811 1812 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1813 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1814 } 1815 1816 Register QueuePtr = MRI.createGenericVirtualRegister( 1817 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1818 1819 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1820 return Register(); 1821 1822 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1823 // private_segment_aperture_base_hi. 1824 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1825 1826 // TODO: can we be smarter about machine pointer info? 1827 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1828 MachineMemOperand *MMO = MF.getMachineMemOperand( 1829 PtrInfo, 1830 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1831 MachineMemOperand::MOInvariant, 1832 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1833 1834 Register LoadAddr; 1835 1836 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1837 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1838 } 1839 1840 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1841 MachineInstr &MI, MachineRegisterInfo &MRI, 1842 MachineIRBuilder &B) const { 1843 MachineFunction &MF = B.getMF(); 1844 1845 const LLT S32 = LLT::scalar(32); 1846 Register Dst = MI.getOperand(0).getReg(); 1847 Register Src = MI.getOperand(1).getReg(); 1848 1849 LLT DstTy = MRI.getType(Dst); 1850 LLT SrcTy = MRI.getType(Src); 1851 unsigned DestAS = DstTy.getAddressSpace(); 1852 unsigned SrcAS = SrcTy.getAddressSpace(); 1853 1854 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1855 // vector element. 1856 assert(!DstTy.isVector()); 1857 1858 const AMDGPUTargetMachine &TM 1859 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1860 1861 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1862 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1863 return true; 1864 } 1865 1866 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1867 // Truncate. 1868 B.buildExtract(Dst, Src, 0); 1869 MI.eraseFromParent(); 1870 return true; 1871 } 1872 1873 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1874 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1875 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1876 1877 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1878 // another. Merge operands are required to be the same type, but creating an 1879 // extra ptrtoint would be kind of pointless. 
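    // The resulting 64-bit pointer is simply {Src, AddrHiVal}: the low half is
    // the original 32-bit constant pointer and the high half is the fixed
    // aperture value recorded for this function.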
1880 auto HighAddr = B.buildConstant( 1881 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1882 B.buildMerge(Dst, {Src, HighAddr}); 1883 MI.eraseFromParent(); 1884 return true; 1885 } 1886 1887 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1888 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1889 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1890 unsigned NullVal = TM.getNullPointerValue(DestAS); 1891 1892 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1893 auto FlatNull = B.buildConstant(SrcTy, 0); 1894 1895 // Extract low 32-bits of the pointer. 1896 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1897 1898 auto CmpRes = 1899 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1900 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1901 1902 MI.eraseFromParent(); 1903 return true; 1904 } 1905 1906 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1907 return false; 1908 1909 if (!ST.hasFlatAddressSpace()) 1910 return false; 1911 1912 auto SegmentNull = 1913 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1914 auto FlatNull = 1915 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1916 1917 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1918 if (!ApertureReg.isValid()) 1919 return false; 1920 1921 auto CmpRes = 1922 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1923 1924 // Coerce the type of the low half of the result so we can use merge_values. 1925 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1926 1927 // TODO: Should we allow mismatched types but matching sizes in merges to 1928 // avoid the ptrtoint? 1929 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1930 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1931 1932 MI.eraseFromParent(); 1933 return true; 1934 } 1935 1936 bool AMDGPULegalizerInfo::legalizeFrint( 1937 MachineInstr &MI, MachineRegisterInfo &MRI, 1938 MachineIRBuilder &B) const { 1939 Register Src = MI.getOperand(1).getReg(); 1940 LLT Ty = MRI.getType(Src); 1941 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1942 1943 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1944 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1945 1946 auto C1 = B.buildFConstant(Ty, C1Val); 1947 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1948 1949 // TODO: Should this propagate fast-math-flags? 
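  // Adding and then subtracting copysign(2^52, Src) rounds away the
  // fractional bits, since f64 values with magnitude >= 2^52 have none, so
  // Tmp2 is Src rounded to the nearest integer. The compare against
  // 0x1.fffffffffffffp+51 below keeps Src itself once its magnitude is
  // already too large to have a fractional part.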
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
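  // SignBit64 is {0, SignBit}: just the sign in the high word. It is used as
  // the result when the exponent is negative, since |Src| < 1 truncates to
  // +/-0. Otherwise ~(FractMask >> Exp) keeps the sign, the exponent and the
  // top Exp mantissa bits, clearing the fractional bits of Src in place.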
2053 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2054 2055 auto Shr = B.buildAShr(S64, FractMask, Exp); 2056 auto Not = B.buildNot(S64, Shr); 2057 auto Tmp0 = B.buildAnd(S64, Src, Not); 2058 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2059 2060 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2061 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2062 2063 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2064 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2065 MI.eraseFromParent(); 2066 return true; 2067 } 2068 2069 bool AMDGPULegalizerInfo::legalizeITOFP( 2070 MachineInstr &MI, MachineRegisterInfo &MRI, 2071 MachineIRBuilder &B, bool Signed) const { 2072 2073 Register Dst = MI.getOperand(0).getReg(); 2074 Register Src = MI.getOperand(1).getReg(); 2075 2076 const LLT S64 = LLT::scalar(64); 2077 const LLT S32 = LLT::scalar(32); 2078 2079 assert(MRI.getType(Src) == S64); 2080 2081 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2082 auto ThirtyTwo = B.buildConstant(S32, 32); 2083 2084 if (MRI.getType(Dst) == S64) { 2085 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2086 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2087 2088 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2089 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2090 .addUse(CvtHi.getReg(0)) 2091 .addUse(ThirtyTwo.getReg(0)); 2092 2093 // TODO: Should this propagate fast-math-flags? 2094 B.buildFAdd(Dst, LdExp, CvtLo); 2095 MI.eraseFromParent(); 2096 return true; 2097 } 2098 2099 assert(MRI.getType(Dst) == S32); 2100 2101 auto One = B.buildConstant(S32, 1); 2102 2103 MachineInstrBuilder ShAmt; 2104 if (Signed) { 2105 auto ThirtyOne = B.buildConstant(S32, 31); 2106 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2107 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2108 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2109 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2110 /*HasSideEffects=*/false) 2111 .addUse(Unmerge.getReg(1)); 2112 auto LS2 = B.buildSub(S32, LS, One); 2113 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2114 } else 2115 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2116 auto Norm = B.buildShl(S64, Src, ShAmt); 2117 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2118 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2119 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2120 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2121 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2122 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2123 /*HasSideEffects=*/false) 2124 .addUse(FVal.getReg(0)) 2125 .addUse(Scale.getReg(0)); 2126 MI.eraseFromParent(); 2127 return true; 2128 } 2129 2130 // TODO: Copied from DAG implementation. Verify logic and document how this 2131 // actually works. 
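// Worked example: for val = 2^40 + 5 as f64, tf = 2^40 + 5,
// hif = floor(tf * 2^-32) = 256 and lof = tf - 256 * 2^32 = 5, so the pieces
// convert to hi = 256 and lo = 5, i.e. the 64-bit integer 2^40 + 5.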
2132 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2133 MachineRegisterInfo &MRI, 2134 MachineIRBuilder &B, 2135 bool Signed) const { 2136 2137 Register Dst = MI.getOperand(0).getReg(); 2138 Register Src = MI.getOperand(1).getReg(); 2139 2140 const LLT S64 = LLT::scalar(64); 2141 const LLT S32 = LLT::scalar(32); 2142 2143 const LLT SrcLT = MRI.getType(Src); 2144 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2145 2146 unsigned Flags = MI.getFlags(); 2147 2148 // The basic idea of converting a floating point number into a pair of 32-bit 2149 // integers is illustrated as follows: 2150 // 2151 // tf := trunc(val); 2152 // hif := floor(tf * 2^-32); 2153 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2154 // hi := fptoi(hif); 2155 // lo := fptoi(lof); 2156 // 2157 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2158 MachineInstrBuilder Sign; 2159 if (Signed && SrcLT == S32) { 2160 // However, a 32-bit floating point number has only 23 bits mantissa and 2161 // it's not enough to hold all the significant bits of `lof` if val is 2162 // negative. To avoid the loss of precision, We need to take the absolute 2163 // value after truncating and flip the result back based on the original 2164 // signedness. 2165 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2166 Trunc = B.buildFAbs(S32, Trunc, Flags); 2167 } 2168 MachineInstrBuilder K0, K1; 2169 if (SrcLT == S64) { 2170 K0 = B.buildFConstant(S64, 2171 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2172 K1 = B.buildFConstant(S64, 2173 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2174 } else { 2175 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2176 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2177 } 2178 2179 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2180 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2181 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2182 2183 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2184 : B.buildFPTOUI(S32, FloorMul); 2185 auto Lo = B.buildFPTOUI(S32, Fma); 2186 2187 if (Signed && SrcLT == S32) { 2188 // Flip the result based on the signedness, which is either all 0s or 1s. 2189 Sign = B.buildMerge(S64, {Sign, Sign}); 2190 // r := xor({lo, hi}, sign) - sign; 2191 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2192 } else 2193 B.buildMerge(Dst, {Lo, Hi}); 2194 MI.eraseFromParent(); 2195 2196 return true; 2197 } 2198 2199 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2200 MachineInstr &MI) const { 2201 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2202 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2203 2204 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2205 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2206 2207 // With ieee_mode disabled, the instructions have the correct behavior 2208 // already for G_FMINNUM/G_FMAXNUM 2209 if (!MFI->getMode().IEEE) 2210 return !IsIEEEOp; 2211 2212 if (IsIEEEOp) 2213 return true; 2214 2215 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2216 } 2217 2218 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2219 MachineInstr &MI, MachineRegisterInfo &MRI, 2220 MachineIRBuilder &B) const { 2221 // TODO: Should move some of this into LegalizerHelper. 
2222 2223 // TODO: Promote dynamic indexing of s16 to s32 2224 2225 // FIXME: Artifact combiner probably should have replaced the truncated 2226 // constant before this, so we shouldn't need 2227 // getIConstantVRegValWithLookThrough. 2228 Optional<ValueAndVReg> MaybeIdxVal = 2229 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2230 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2231 return true; 2232 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2233 2234 Register Dst = MI.getOperand(0).getReg(); 2235 Register Vec = MI.getOperand(1).getReg(); 2236 2237 LLT VecTy = MRI.getType(Vec); 2238 LLT EltTy = VecTy.getElementType(); 2239 assert(EltTy == MRI.getType(Dst)); 2240 2241 if (IdxVal < VecTy.getNumElements()) 2242 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2243 else 2244 B.buildUndef(Dst); 2245 2246 MI.eraseFromParent(); 2247 return true; 2248 } 2249 2250 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2251 MachineInstr &MI, MachineRegisterInfo &MRI, 2252 MachineIRBuilder &B) const { 2253 // TODO: Should move some of this into LegalizerHelper. 2254 2255 // TODO: Promote dynamic indexing of s16 to s32 2256 2257 // FIXME: Artifact combiner probably should have replaced the truncated 2258 // constant before this, so we shouldn't need 2259 // getIConstantVRegValWithLookThrough. 2260 Optional<ValueAndVReg> MaybeIdxVal = 2261 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2262 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2263 return true; 2264 2265 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2266 Register Dst = MI.getOperand(0).getReg(); 2267 Register Vec = MI.getOperand(1).getReg(); 2268 Register Ins = MI.getOperand(2).getReg(); 2269 2270 LLT VecTy = MRI.getType(Vec); 2271 LLT EltTy = VecTy.getElementType(); 2272 assert(EltTy == MRI.getType(Ins)); 2273 2274 if (IdxVal < VecTy.getNumElements()) 2275 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2276 else 2277 B.buildUndef(Dst); 2278 2279 MI.eraseFromParent(); 2280 return true; 2281 } 2282 2283 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2284 MachineInstr &MI, MachineRegisterInfo &MRI, 2285 MachineIRBuilder &B) const { 2286 const LLT V2S16 = LLT::fixed_vector(2, 16); 2287 2288 Register Dst = MI.getOperand(0).getReg(); 2289 Register Src0 = MI.getOperand(1).getReg(); 2290 LLT DstTy = MRI.getType(Dst); 2291 LLT SrcTy = MRI.getType(Src0); 2292 2293 if (SrcTy == V2S16 && DstTy == V2S16 && 2294 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2295 return true; 2296 2297 MachineIRBuilder HelperBuilder(MI); 2298 GISelObserverWrapper DummyObserver; 2299 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2300 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2301 } 2302 2303 bool AMDGPULegalizerInfo::legalizeSinCos( 2304 MachineInstr &MI, MachineRegisterInfo &MRI, 2305 MachineIRBuilder &B) const { 2306 2307 Register DstReg = MI.getOperand(0).getReg(); 2308 Register SrcReg = MI.getOperand(1).getReg(); 2309 LLT Ty = MRI.getType(DstReg); 2310 unsigned Flags = MI.getFlags(); 2311 2312 Register TrigVal; 2313 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2314 if (ST.hasTrigReducedRange()) { 2315 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2316 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2317 .addUse(MulVal.getReg(0)) 2318 .setMIFlags(Flags).getReg(0); 2319 } else 2320 TrigVal = B.buildFMul(Ty, SrcReg, 
OneOver2Pi, Flags).getReg(0); 2321 2322 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2323 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2324 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2325 .addUse(TrigVal) 2326 .setMIFlags(Flags); 2327 MI.eraseFromParent(); 2328 return true; 2329 } 2330 2331 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2332 MachineIRBuilder &B, 2333 const GlobalValue *GV, 2334 int64_t Offset, 2335 unsigned GAFlags) const { 2336 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2337 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2338 // to the following code sequence: 2339 // 2340 // For constant address space: 2341 // s_getpc_b64 s[0:1] 2342 // s_add_u32 s0, s0, $symbol 2343 // s_addc_u32 s1, s1, 0 2344 // 2345 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2346 // a fixup or relocation is emitted to replace $symbol with a literal 2347 // constant, which is a pc-relative offset from the encoding of the $symbol 2348 // operand to the global variable. 2349 // 2350 // For global address space: 2351 // s_getpc_b64 s[0:1] 2352 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2353 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2354 // 2355 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2356 // fixups or relocations are emitted to replace $symbol@*@lo and 2357 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2358 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2359 // operand to the global variable. 2360 // 2361 // What we want here is an offset from the value returned by s_getpc 2362 // (which is the address of the s_add_u32 instruction) to the global 2363 // variable, but since the encoding of $symbol starts 4 bytes after the start 2364 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2365 // small. This requires us to add 4 to the global variable offset in order to 2366 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2367 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2368 // instruction. 2369 2370 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2371 2372 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2373 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2374 2375 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2376 .addDef(PCReg); 2377 2378 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2379 if (GAFlags == SIInstrInfo::MO_NONE) 2380 MIB.addImm(0); 2381 else 2382 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2383 2384 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2385 2386 if (PtrTy.getSizeInBits() == 32) 2387 B.buildExtract(DstReg, PCReg, 0); 2388 return true; 2389 } 2390 2391 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2392 MachineInstr &MI, MachineRegisterInfo &MRI, 2393 MachineIRBuilder &B) const { 2394 Register DstReg = MI.getOperand(0).getReg(); 2395 LLT Ty = MRI.getType(DstReg); 2396 unsigned AS = Ty.getAddressSpace(); 2397 2398 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2399 MachineFunction &MF = B.getMF(); 2400 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2401 2402 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2403 if (!MFI->isModuleEntryFunction() && 2404 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2405 const Function &Fn = MF.getFunction(); 2406 DiagnosticInfoUnsupported BadLDSDecl( 2407 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2408 DS_Warning); 2409 Fn.getContext().diagnose(BadLDSDecl); 2410 2411 // We currently don't have a way to correctly allocate LDS objects that 2412 // aren't directly associated with a kernel. We do force inlining of 2413 // functions that use local objects. However, if these dead functions are 2414 // not eliminated, we don't want a compile time error. Just emit a warning 2415 // and a trap, since there should be no callable path here. 2416 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2417 B.buildUndef(DstReg); 2418 MI.eraseFromParent(); 2419 return true; 2420 } 2421 2422 // TODO: We could emit code to handle the initialization somewhere. 2423 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2424 const SITargetLowering *TLI = ST.getTargetLowering(); 2425 if (!TLI->shouldUseLDSConstAddress(GV)) { 2426 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2427 return true; // Leave in place; 2428 } 2429 2430 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2431 Type *Ty = GV->getValueType(); 2432 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2433 // zero-sized type in other languages to declare the dynamic shared 2434 // memory which size is not known at the compile time. They will be 2435 // allocated by the runtime and placed directly after the static 2436 // allocated ones. They all share the same offset. 2437 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2438 // Adjust alignment for that dynamic shared memory array. 
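          // Since the runtime places the dynamic array directly after the
          // statically allocated LDS, its address is just the total static
          // LDS size reported by the groupstaticsize intrinsic, cast to a
          // pointer.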
2439 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2440 LLT S32 = LLT::scalar(32); 2441 auto Sz = 2442 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2443 B.buildIntToPtr(DstReg, Sz); 2444 MI.eraseFromParent(); 2445 return true; 2446 } 2447 } 2448 2449 B.buildConstant( 2450 DstReg, 2451 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 const Function &Fn = MF.getFunction(); 2457 DiagnosticInfoUnsupported BadInit( 2458 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2459 Fn.getContext().diagnose(BadInit); 2460 return true; 2461 } 2462 2463 const SITargetLowering *TLI = ST.getTargetLowering(); 2464 2465 if (TLI->shouldEmitFixup(GV)) { 2466 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2467 MI.eraseFromParent(); 2468 return true; 2469 } 2470 2471 if (TLI->shouldEmitPCReloc(GV)) { 2472 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2473 MI.eraseFromParent(); 2474 return true; 2475 } 2476 2477 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2478 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2479 2480 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2481 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2482 MachinePointerInfo::getGOT(MF), 2483 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2484 MachineMemOperand::MOInvariant, 2485 LoadTy, Align(8)); 2486 2487 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2488 2489 if (Ty.getSizeInBits() == 32) { 2490 // Truncate if this is a 32-bit constant address. 2491 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2492 B.buildExtract(DstReg, Load, 0); 2493 } else 2494 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2495 2496 MI.eraseFromParent(); 2497 return true; 2498 } 2499 2500 static LLT widenToNextPowerOf2(LLT Ty) { 2501 if (Ty.isVector()) 2502 return Ty.changeElementCount( 2503 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2504 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2505 } 2506 2507 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2508 MachineInstr &MI) const { 2509 MachineIRBuilder &B = Helper.MIRBuilder; 2510 MachineRegisterInfo &MRI = *B.getMRI(); 2511 GISelChangeObserver &Observer = Helper.Observer; 2512 2513 Register PtrReg = MI.getOperand(1).getReg(); 2514 LLT PtrTy = MRI.getType(PtrReg); 2515 unsigned AddrSpace = PtrTy.getAddressSpace(); 2516 2517 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2518 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2519 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2520 Observer.changingInstr(MI); 2521 MI.getOperand(1).setReg(Cast.getReg(0)); 2522 Observer.changedInstr(MI); 2523 return true; 2524 } 2525 2526 if (MI.getOpcode() != AMDGPU::G_LOAD) 2527 return false; 2528 2529 Register ValReg = MI.getOperand(0).getReg(); 2530 LLT ValTy = MRI.getType(ValReg); 2531 2532 MachineMemOperand *MMO = *MI.memoperands_begin(); 2533 const unsigned ValSize = ValTy.getSizeInBits(); 2534 const LLT MemTy = MMO->getMemoryType(); 2535 const Align MemAlign = MMO->getAlign(); 2536 const unsigned MemSize = MemTy.getSizeInBits(); 2537 const unsigned AlignInBits = 8 * MemAlign.value(); 2538 2539 // Widen non-power-of-2 loads to the alignment if needed 2540 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2541 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2542 2543 // This was already the correct 
extending load result type, so just adjust 2544 // the memory type. 2545 if (WideMemSize == ValSize) { 2546 MachineFunction &MF = B.getMF(); 2547 2548 MachineMemOperand *WideMMO = 2549 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2550 Observer.changingInstr(MI); 2551 MI.setMemRefs(MF, {WideMMO}); 2552 Observer.changedInstr(MI); 2553 return true; 2554 } 2555 2556 // Don't bother handling edge case that should probably never be produced. 2557 if (ValSize > WideMemSize) 2558 return false; 2559 2560 LLT WideTy = widenToNextPowerOf2(ValTy); 2561 2562 Register WideLoad; 2563 if (!WideTy.isVector()) { 2564 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2565 B.buildTrunc(ValReg, WideLoad).getReg(0); 2566 } else { 2567 // Extract the subvector. 2568 2569 if (isRegisterType(ValTy)) { 2570 // If this a case where G_EXTRACT is legal, use it. 2571 // (e.g. <3 x s32> -> <4 x s32>) 2572 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2573 B.buildExtract(ValReg, WideLoad, 0); 2574 } else { 2575 // For cases where the widened type isn't a nice register value, unmerge 2576 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2577 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2578 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2579 B.setInsertPt(B.getMBB(), MI.getIterator()); 2580 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2581 } 2582 } 2583 2584 MI.eraseFromParent(); 2585 return true; 2586 } 2587 2588 return false; 2589 } 2590 2591 bool AMDGPULegalizerInfo::legalizeFMad( 2592 MachineInstr &MI, MachineRegisterInfo &MRI, 2593 MachineIRBuilder &B) const { 2594 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2595 assert(Ty.isScalar()); 2596 2597 MachineFunction &MF = B.getMF(); 2598 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2599 2600 // TODO: Always legal with future ftz flag. 2601 // FIXME: Do we need just output? 
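  // G_FMAD stays legal only when denormals are flushed for the type, since
  // the underlying mad/mac instructions flush denormal results; otherwise
  // fall back to LegalizerHelper's unfused mul + add lowering below.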
2602 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2603 return true; 2604 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2605 return true; 2606 2607 MachineIRBuilder HelperBuilder(MI); 2608 GISelObserverWrapper DummyObserver; 2609 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2610 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2611 } 2612 2613 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2614 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2615 Register DstReg = MI.getOperand(0).getReg(); 2616 Register PtrReg = MI.getOperand(1).getReg(); 2617 Register CmpVal = MI.getOperand(2).getReg(); 2618 Register NewVal = MI.getOperand(3).getReg(); 2619 2620 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2621 "this should not have been custom lowered"); 2622 2623 LLT ValTy = MRI.getType(CmpVal); 2624 LLT VecTy = LLT::fixed_vector(2, ValTy); 2625 2626 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2627 2628 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2629 .addDef(DstReg) 2630 .addUse(PtrReg) 2631 .addUse(PackedVal) 2632 .setMemRefs(MI.memoperands()); 2633 2634 MI.eraseFromParent(); 2635 return true; 2636 } 2637 2638 bool AMDGPULegalizerInfo::legalizeFlog( 2639 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2640 Register Dst = MI.getOperand(0).getReg(); 2641 Register Src = MI.getOperand(1).getReg(); 2642 LLT Ty = B.getMRI()->getType(Dst); 2643 unsigned Flags = MI.getFlags(); 2644 2645 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2646 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2647 2648 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2649 MI.eraseFromParent(); 2650 return true; 2651 } 2652 2653 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2654 MachineIRBuilder &B) const { 2655 Register Dst = MI.getOperand(0).getReg(); 2656 Register Src = MI.getOperand(1).getReg(); 2657 unsigned Flags = MI.getFlags(); 2658 LLT Ty = B.getMRI()->getType(Dst); 2659 2660 auto K = B.buildFConstant(Ty, numbers::log2e); 2661 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2662 B.buildFExp2(Dst, Mul, Flags); 2663 MI.eraseFromParent(); 2664 return true; 2665 } 2666 2667 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2668 MachineIRBuilder &B) const { 2669 Register Dst = MI.getOperand(0).getReg(); 2670 Register Src0 = MI.getOperand(1).getReg(); 2671 Register Src1 = MI.getOperand(2).getReg(); 2672 unsigned Flags = MI.getFlags(); 2673 LLT Ty = B.getMRI()->getType(Dst); 2674 const LLT S16 = LLT::scalar(16); 2675 const LLT S32 = LLT::scalar(32); 2676 2677 if (Ty == S32) { 2678 auto Log = B.buildFLog2(S32, Src0, Flags); 2679 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2680 .addUse(Log.getReg(0)) 2681 .addUse(Src1) 2682 .setMIFlags(Flags); 2683 B.buildFExp2(Dst, Mul, Flags); 2684 } else if (Ty == S16) { 2685 // There's no f16 fmul_legacy, so we need to convert for it. 
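    // Same expansion as the f32 case, pow(x, y) == exp2(log2(x) * y), with
    // the multiply done in f32 and the result truncated back to f16.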
2686 auto Log = B.buildFLog2(S16, Src0, Flags); 2687 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2688 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2689 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2690 .addUse(Ext0.getReg(0)) 2691 .addUse(Ext1.getReg(0)) 2692 .setMIFlags(Flags); 2693 2694 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2695 } else 2696 return false; 2697 2698 MI.eraseFromParent(); 2699 return true; 2700 } 2701 2702 // Find a source register, ignoring any possible source modifiers. 2703 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2704 Register ModSrc = OrigSrc; 2705 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2706 ModSrc = SrcFNeg->getOperand(1).getReg(); 2707 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2708 ModSrc = SrcFAbs->getOperand(1).getReg(); 2709 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2710 ModSrc = SrcFAbs->getOperand(1).getReg(); 2711 return ModSrc; 2712 } 2713 2714 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2715 MachineRegisterInfo &MRI, 2716 MachineIRBuilder &B) const { 2717 2718 const LLT S1 = LLT::scalar(1); 2719 const LLT S64 = LLT::scalar(64); 2720 Register Dst = MI.getOperand(0).getReg(); 2721 Register OrigSrc = MI.getOperand(1).getReg(); 2722 unsigned Flags = MI.getFlags(); 2723 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2724 "this should not have been custom lowered"); 2725 2726 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2727 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2728 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2729 // V_FRACT bug is: 2730 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2731 // 2732 // Convert floor(x) to (x - fract(x)) 2733 2734 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2735 .addUse(OrigSrc) 2736 .setMIFlags(Flags); 2737 2738 // Give source modifier matching some assistance before obscuring a foldable 2739 // pattern. 2740 2741 // TODO: We can avoid the neg on the fract? The input sign to fract 2742 // shouldn't matter? 2743 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2744 2745 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2746 2747 Register Min = MRI.createGenericVirtualRegister(S64); 2748 2749 // We don't need to concern ourselves with the snan handling difference, so 2750 // use the one which will directly select. 2751 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2752 if (MFI->getMode().IEEE) 2753 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2754 else 2755 B.buildFMinNum(Min, Fract, Const, Flags); 2756 2757 Register CorrectedFract = Min; 2758 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2759 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2760 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2761 } 2762 2763 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2764 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2765 2766 MI.eraseFromParent(); 2767 return true; 2768 } 2769 2770 // Turn an illegal packed v2s16 build vector into bit operations. 2771 // TODO: This should probably be a bitcast action in LegalizerHelper. 
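// The two s16 sources are packed into one s32 with a merge and the result is
// bitcast to <2 x s16>, so no 16-bit element build_vector has to be selected.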
2772 bool AMDGPULegalizerInfo::legalizeBuildVector( 2773 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2774 Register Dst = MI.getOperand(0).getReg(); 2775 const LLT S32 = LLT::scalar(32); 2776 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2777 2778 Register Src0 = MI.getOperand(1).getReg(); 2779 Register Src1 = MI.getOperand(2).getReg(); 2780 assert(MRI.getType(Src0) == LLT::scalar(16)); 2781 2782 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2783 B.buildBitcast(Dst, Merge); 2784 2785 MI.eraseFromParent(); 2786 return true; 2787 } 2788 2789 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2790 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2791 // case with a single min instruction instead of a compare+select. 2792 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2793 MachineRegisterInfo &MRI, 2794 MachineIRBuilder &B) const { 2795 Register Dst = MI.getOperand(0).getReg(); 2796 Register Src = MI.getOperand(1).getReg(); 2797 LLT DstTy = MRI.getType(Dst); 2798 LLT SrcTy = MRI.getType(Src); 2799 2800 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2801 ? AMDGPU::G_AMDGPU_FFBH_U32 2802 : AMDGPU::G_AMDGPU_FFBL_B32; 2803 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2804 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2805 2806 MI.eraseFromParent(); 2807 return true; 2808 } 2809 2810 // Check that this is a G_XOR x, -1 2811 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2812 if (MI.getOpcode() != TargetOpcode::G_XOR) 2813 return false; 2814 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2815 return ConstVal && *ConstVal == -1; 2816 } 2817 2818 // Return the use branch instruction, otherwise null if the usage is invalid. 2819 static MachineInstr * 2820 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2821 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2822 Register CondDef = MI.getOperand(0).getReg(); 2823 if (!MRI.hasOneNonDBGUse(CondDef)) 2824 return nullptr; 2825 2826 MachineBasicBlock *Parent = MI.getParent(); 2827 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2828 2829 if (isNot(MRI, *UseMI)) { 2830 Register NegatedCond = UseMI->getOperand(0).getReg(); 2831 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2832 return nullptr; 2833 2834 // We're deleting the def of this value, so we need to remove it. 2835 eraseInstr(*UseMI, MRI); 2836 2837 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2838 Negated = true; 2839 } 2840 2841 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2842 return nullptr; 2843 2844 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2845 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2846 if (Next == Parent->end()) { 2847 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2848 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2849 return nullptr; 2850 UncondBrTarget = &*NextMBB; 2851 } else { 2852 if (Next->getOpcode() != AMDGPU::G_BR) 2853 return nullptr; 2854 Br = &*Next; 2855 UncondBrTarget = Br->getOperand(0).getMBB(); 2856 } 2857 2858 return UseMI; 2859 } 2860 2861 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2862 const ArgDescriptor *Arg, 2863 const TargetRegisterClass *ArgRC, 2864 LLT ArgTy) const { 2865 MCRegister SrcReg = Arg->getRegister(); 2866 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2867 assert(DstReg.isVirtual() && "Virtual register expected"); 2868 2869 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2870 ArgTy); 2871 if (Arg->isMasked()) { 2872 // TODO: Should we try to emit this once in the entry block? 2873 const LLT S32 = LLT::scalar(32); 2874 const unsigned Mask = Arg->getMask(); 2875 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2876 2877 Register AndMaskSrc = LiveIn; 2878 2879 if (Shift != 0) { 2880 auto ShiftAmt = B.buildConstant(S32, Shift); 2881 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2882 } 2883 2884 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2885 } else { 2886 B.buildCopy(DstReg, LiveIn); 2887 } 2888 2889 return true; 2890 } 2891 2892 bool AMDGPULegalizerInfo::loadInputValue( 2893 Register DstReg, MachineIRBuilder &B, 2894 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2895 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2896 const ArgDescriptor *Arg; 2897 const TargetRegisterClass *ArgRC; 2898 LLT ArgTy; 2899 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2900 2901 if (!Arg) { 2902 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2903 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2904 // case the pointer argument may be missing and we use null. 2905 B.buildConstant(DstReg, 0); 2906 return true; 2907 } 2908 2909 // It's undefined behavior if a function marked with the amdgpu-no-* 2910 // attributes uses the corresponding intrinsic. 
2911 B.buildUndef(DstReg); 2912 return true; 2913 } 2914 2915 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2916 return false; // TODO: Handle these 2917 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2918 } 2919 2920 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2921 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2922 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2923 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2924 return false; 2925 2926 MI.eraseFromParent(); 2927 return true; 2928 } 2929 2930 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2931 MachineRegisterInfo &MRI, 2932 MachineIRBuilder &B) const { 2933 Register Dst = MI.getOperand(0).getReg(); 2934 LLT DstTy = MRI.getType(Dst); 2935 LLT S16 = LLT::scalar(16); 2936 LLT S32 = LLT::scalar(32); 2937 LLT S64 = LLT::scalar(64); 2938 2939 if (DstTy == S16) 2940 return legalizeFDIV16(MI, MRI, B); 2941 if (DstTy == S32) 2942 return legalizeFDIV32(MI, MRI, B); 2943 if (DstTy == S64) 2944 return legalizeFDIV64(MI, MRI, B); 2945 2946 return false; 2947 } 2948 2949 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2950 Register DstDivReg, 2951 Register DstRemReg, 2952 Register X, 2953 Register Y) const { 2954 const LLT S1 = LLT::scalar(1); 2955 const LLT S32 = LLT::scalar(32); 2956 2957 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2958 // algorithm used here. 2959 2960 // Initial estimate of inv(y). 2961 auto FloatY = B.buildUITOFP(S32, Y); 2962 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2963 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2964 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2965 auto Z = B.buildFPTOUI(S32, ScaledY); 2966 2967 // One round of UNR. 2968 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2969 auto NegYZ = B.buildMul(S32, NegY, Z); 2970 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2971 2972 // Quotient/remainder estimate. 2973 auto Q = B.buildUMulH(S32, X, Z); 2974 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2975 2976 // First quotient/remainder refinement. 2977 auto One = B.buildConstant(S32, 1); 2978 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2979 if (DstDivReg) 2980 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2981 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2982 2983 // Second quotient/remainder refinement. 
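  // The initial quotient estimate can undershoot by up to 2, which is why
  // exactly two conditional corrections are enough here.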
2984 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2985 if (DstDivReg) 2986 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2987 2988 if (DstRemReg) 2989 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2990 } 2991 2992 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2993 // 2994 // Return lo, hi of result 2995 // 2996 // %cvt.lo = G_UITOFP Val.lo 2997 // %cvt.hi = G_UITOFP Val.hi 2998 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2999 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3000 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3001 // %mul2 = G_FMUL %mul1, 2**(-32) 3002 // %trunc = G_INTRINSIC_TRUNC %mul2 3003 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3004 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3005 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3006 Register Val) { 3007 const LLT S32 = LLT::scalar(32); 3008 auto Unmerge = B.buildUnmerge(S32, Val); 3009 3010 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3011 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3012 3013 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3014 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3015 3016 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3017 auto Mul1 = 3018 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3019 3020 // 2**(-32) 3021 auto Mul2 = 3022 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3023 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3024 3025 // -(2**32) 3026 auto Mad2 = B.buildFMAD(S32, Trunc, 3027 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3028 3029 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3030 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3031 3032 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3033 } 3034 3035 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3036 Register DstDivReg, 3037 Register DstRemReg, 3038 Register Numer, 3039 Register Denom) const { 3040 const LLT S32 = LLT::scalar(32); 3041 const LLT S64 = LLT::scalar(64); 3042 const LLT S1 = LLT::scalar(1); 3043 Register RcpLo, RcpHi; 3044 3045 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3046 3047 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3048 3049 auto Zero64 = B.buildConstant(S64, 0); 3050 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3051 3052 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3053 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3054 3055 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3056 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3057 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3058 3059 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3060 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3061 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 3062 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3063 3064 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3065 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3066 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3067 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3068 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3069 3070 auto Zero32 = B.buildConstant(S32, 0); 3071 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3072 auto Add2_HiC = 3073 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 3074 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 3075 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3076 3077 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3078 Register NumerLo = UnmergeNumer.getReg(0); 
3079 Register NumerHi = UnmergeNumer.getReg(1); 3080 3081 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3082 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3083 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3084 Register Mul3_Lo = UnmergeMul3.getReg(0); 3085 Register Mul3_Hi = UnmergeMul3.getReg(1); 3086 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3087 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3088 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3089 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3090 3091 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3092 Register DenomLo = UnmergeDenom.getReg(0); 3093 Register DenomHi = UnmergeDenom.getReg(1); 3094 3095 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3096 auto C1 = B.buildSExt(S32, CmpHi); 3097 3098 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3099 auto C2 = B.buildSExt(S32, CmpLo); 3100 3101 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3102 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3103 3104 // TODO: Here and below portions of the code can be enclosed into if/endif. 3105 // Currently control flow is unconditional and we have 4 selects after 3106 // potential endif to substitute PHIs. 3107 3108 // if C3 != 0 ... 3109 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3110 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3111 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3112 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3113 3114 auto One64 = B.buildConstant(S64, 1); 3115 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3116 3117 auto C4 = 3118 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3119 auto C5 = 3120 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3121 auto C6 = B.buildSelect( 3122 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3123 3124 // if (C6 != 0) 3125 auto Add4 = B.buildAdd(S64, Add3, One64); 3126 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3127 3128 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3129 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3130 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3131 3132 // endif C6 3133 // endif C3 3134 3135 if (DstDivReg) { 3136 auto Sel1 = B.buildSelect( 3137 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3138 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3139 Sel1, MulHi3); 3140 } 3141 3142 if (DstRemReg) { 3143 auto Sel2 = B.buildSelect( 3144 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3145 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3146 Sel2, Sub1); 3147 } 3148 } 3149 3150 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3151 MachineRegisterInfo &MRI, 3152 MachineIRBuilder &B) const { 3153 Register DstDivReg, DstRemReg; 3154 switch (MI.getOpcode()) { 3155 default: 3156 llvm_unreachable("Unexpected opcode!"); 3157 case AMDGPU::G_UDIV: { 3158 DstDivReg = MI.getOperand(0).getReg(); 3159 break; 3160 } 3161 case AMDGPU::G_UREM: { 3162 DstRemReg = MI.getOperand(0).getReg(); 3163 break; 3164 } 3165 case AMDGPU::G_UDIVREM: { 3166 DstDivReg = MI.getOperand(0).getReg(); 3167 DstRemReg = MI.getOperand(1).getReg(); 3168 break; 3169 } 3170 } 3171 3172 const LLT S64 = LLT::scalar(64); 3173 const LLT S32 = LLT::scalar(32); 3174 const unsigned FirstSrcOpIdx = 
MI.getNumExplicitDefs(); 3175 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3176 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3177 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3178 3179 if (Ty == S32) 3180 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3181 else if (Ty == S64) 3182 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3183 else 3184 return false; 3185 3186 MI.eraseFromParent(); 3187 return true; 3188 } 3189 3190 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3191 MachineRegisterInfo &MRI, 3192 MachineIRBuilder &B) const { 3193 const LLT S64 = LLT::scalar(64); 3194 const LLT S32 = LLT::scalar(32); 3195 3196 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3197 if (Ty != S32 && Ty != S64) 3198 return false; 3199 3200 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3201 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3202 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3203 3204 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3205 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3206 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3207 3208 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3209 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3210 3211 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3212 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3213 3214 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3215 switch (MI.getOpcode()) { 3216 default: 3217 llvm_unreachable("Unexpected opcode!"); 3218 case AMDGPU::G_SDIV: { 3219 DstDivReg = MI.getOperand(0).getReg(); 3220 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3221 break; 3222 } 3223 case AMDGPU::G_SREM: { 3224 DstRemReg = MI.getOperand(0).getReg(); 3225 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3226 break; 3227 } 3228 case AMDGPU::G_SDIVREM: { 3229 DstDivReg = MI.getOperand(0).getReg(); 3230 DstRemReg = MI.getOperand(1).getReg(); 3231 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3232 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3233 break; 3234 } 3235 } 3236 3237 if (Ty == S32) 3238 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3239 else 3240 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3241 3242 if (DstDivReg) { 3243 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3244 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3245 B.buildSub(DstDivReg, SignXor, Sign); 3246 } 3247 3248 if (DstRemReg) { 3249 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3250 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3251 B.buildSub(DstRemReg, SignXor, Sign); 3252 } 3253 3254 MI.eraseFromParent(); 3255 return true; 3256 } 3257 3258 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3259 MachineRegisterInfo &MRI, 3260 MachineIRBuilder &B) const { 3261 Register Res = MI.getOperand(0).getReg(); 3262 Register LHS = MI.getOperand(1).getReg(); 3263 Register RHS = MI.getOperand(2).getReg(); 3264 uint16_t Flags = MI.getFlags(); 3265 LLT ResTy = MRI.getType(Res); 3266 3267 const MachineFunction &MF = B.getMF(); 3268 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3269 MI.getFlag(MachineInstr::FmAfn); 3270 3271 if (!AllowInaccurateRcp) 3272 return false; 3273 3274 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3275 // 1 / x -> RCP(x) 3276 if (CLHS->isExactlyValue(1.0)) { 3277 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3278 .addUse(RHS) 3279 .setMIFlags(Flags); 3280 3281 
MI.eraseFromParent(); 3282 return true; 3283 } 3284 3285 // -1 / x -> RCP( FNEG(x) ) 3286 if (CLHS->isExactlyValue(-1.0)) { 3287 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3288 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3289 .addUse(FNeg.getReg(0)) 3290 .setMIFlags(Flags); 3291 3292 MI.eraseFromParent(); 3293 return true; 3294 } 3295 } 3296 3297 // x / y -> x * (1.0 / y) 3298 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3299 .addUse(RHS) 3300 .setMIFlags(Flags); 3301 B.buildFMul(Res, LHS, RCP, Flags); 3302 3303 MI.eraseFromParent(); 3304 return true; 3305 } 3306 3307 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3308 MachineRegisterInfo &MRI, 3309 MachineIRBuilder &B) const { 3310 Register Res = MI.getOperand(0).getReg(); 3311 Register X = MI.getOperand(1).getReg(); 3312 Register Y = MI.getOperand(2).getReg(); 3313 uint16_t Flags = MI.getFlags(); 3314 LLT ResTy = MRI.getType(Res); 3315 3316 const MachineFunction &MF = B.getMF(); 3317 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3318 MI.getFlag(MachineInstr::FmAfn); 3319 3320 if (!AllowInaccurateRcp) 3321 return false; 3322 3323 auto NegY = B.buildFNeg(ResTy, Y); 3324 auto One = B.buildFConstant(ResTy, 1.0); 3325 3326 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3327 .addUse(Y) 3328 .setMIFlags(Flags); 3329 3330 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3331 R = B.buildFMA(ResTy, Tmp0, R, R); 3332 3333 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3334 R = B.buildFMA(ResTy, Tmp1, R, R); 3335 3336 auto Ret = B.buildFMul(ResTy, X, R); 3337 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3338 3339 B.buildFMA(Res, Tmp2, R, Ret); 3340 MI.eraseFromParent(); 3341 return true; 3342 } 3343 3344 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3345 MachineRegisterInfo &MRI, 3346 MachineIRBuilder &B) const { 3347 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3348 return true; 3349 3350 Register Res = MI.getOperand(0).getReg(); 3351 Register LHS = MI.getOperand(1).getReg(); 3352 Register RHS = MI.getOperand(2).getReg(); 3353 3354 uint16_t Flags = MI.getFlags(); 3355 3356 LLT S16 = LLT::scalar(16); 3357 LLT S32 = LLT::scalar(32); 3358 3359 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3360 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3361 3362 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3363 .addUse(RHSExt.getReg(0)) 3364 .setMIFlags(Flags); 3365 3366 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3367 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3368 3369 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3370 .addUse(RDst.getReg(0)) 3371 .addUse(RHS) 3372 .addUse(LHS) 3373 .setMIFlags(Flags); 3374 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3380 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3381 static void toggleSPDenormMode(bool Enable, 3382 MachineIRBuilder &B, 3383 const GCNSubtarget &ST, 3384 AMDGPU::SIModeRegisterDefaults Mode) { 3385 // Set SP denorm mode to this value. 3386 unsigned SPDenormMode = 3387 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3388 3389 if (ST.hasDenormModeInst()) { 3390 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
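// S_DENORM_MODE takes a 4-bit immediate where, as encoded below, bits [1:0]
// select the FP32 denorm mode and bits [3:2] carry the FP64/FP16 mode, so
// shifting the DP default into the upper two bits leaves it unchanged.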
3391 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3392 3393 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3394 B.buildInstr(AMDGPU::S_DENORM_MODE) 3395 .addImm(NewDenormModeValue); 3396 3397 } else { 3398 // Select FP32 bit field in mode register. 3399 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3400 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3401 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3402 3403 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3404 .addImm(SPDenormMode) 3405 .addImm(SPDenormModeBitField); 3406 } 3407 } 3408 3409 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3410 MachineRegisterInfo &MRI, 3411 MachineIRBuilder &B) const { 3412 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3413 return true; 3414 3415 Register Res = MI.getOperand(0).getReg(); 3416 Register LHS = MI.getOperand(1).getReg(); 3417 Register RHS = MI.getOperand(2).getReg(); 3418 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3419 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3420 3421 uint16_t Flags = MI.getFlags(); 3422 3423 LLT S32 = LLT::scalar(32); 3424 LLT S1 = LLT::scalar(1); 3425 3426 auto One = B.buildFConstant(S32, 1.0f); 3427 3428 auto DenominatorScaled = 3429 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3430 .addUse(LHS) 3431 .addUse(RHS) 3432 .addImm(0) 3433 .setMIFlags(Flags); 3434 auto NumeratorScaled = 3435 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3436 .addUse(LHS) 3437 .addUse(RHS) 3438 .addImm(1) 3439 .setMIFlags(Flags); 3440 3441 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3442 .addUse(DenominatorScaled.getReg(0)) 3443 .setMIFlags(Flags); 3444 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3445 3446 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3447 // aren't modeled as reading it. 
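// The FMA refinement sequence below is sensitive to FP32 denormal flushing
// (the scaled intermediates can be denormal), so if the function's default
// mode flushes single-precision denormals, temporarily enable them around
// the sequence and restore the default afterwards.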
3448 if (!Mode.allFP32Denormals()) 3449 toggleSPDenormMode(true, B, ST, Mode); 3450 3451 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3452 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3453 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3454 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3455 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3456 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3457 3458 if (!Mode.allFP32Denormals()) 3459 toggleSPDenormMode(false, B, ST, Mode); 3460 3461 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3462 .addUse(Fma4.getReg(0)) 3463 .addUse(Fma1.getReg(0)) 3464 .addUse(Fma3.getReg(0)) 3465 .addUse(NumeratorScaled.getReg(1)) 3466 .setMIFlags(Flags); 3467 3468 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3469 .addUse(Fmas.getReg(0)) 3470 .addUse(RHS) 3471 .addUse(LHS) 3472 .setMIFlags(Flags); 3473 3474 MI.eraseFromParent(); 3475 return true; 3476 } 3477 3478 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3479 MachineRegisterInfo &MRI, 3480 MachineIRBuilder &B) const { 3481 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3482 return true; 3483 3484 Register Res = MI.getOperand(0).getReg(); 3485 Register LHS = MI.getOperand(1).getReg(); 3486 Register RHS = MI.getOperand(2).getReg(); 3487 3488 uint16_t Flags = MI.getFlags(); 3489 3490 LLT S64 = LLT::scalar(64); 3491 LLT S1 = LLT::scalar(1); 3492 3493 auto One = B.buildFConstant(S64, 1.0); 3494 3495 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3496 .addUse(LHS) 3497 .addUse(RHS) 3498 .addImm(0) 3499 .setMIFlags(Flags); 3500 3501 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3502 3503 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3504 .addUse(DivScale0.getReg(0)) 3505 .setMIFlags(Flags); 3506 3507 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3508 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3509 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3510 3511 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3512 .addUse(LHS) 3513 .addUse(RHS) 3514 .addImm(1) 3515 .setMIFlags(Flags); 3516 3517 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3518 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3519 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3520 3521 Register Scale; 3522 if (!ST.hasUsableDivScaleConditionOutput()) { 3523 // Workaround a hardware bug on SI where the condition output from div_scale 3524 // is not usable. 
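// Reconstruct the flag manually instead: compare the high halves of the
// numerator and denominator against the high halves of the two div_scale
// results to determine which operand was scaled, and XOR the comparisons to
// form the condition div_fmas expects.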
3525 3526 LLT S32 = LLT::scalar(32); 3527 3528 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3529 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3530 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3531 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3532 3533 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3534 Scale1Unmerge.getReg(1)); 3535 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3536 Scale0Unmerge.getReg(1)); 3537 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3538 } else { 3539 Scale = DivScale1.getReg(1); 3540 } 3541 3542 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3543 .addUse(Fma4.getReg(0)) 3544 .addUse(Fma3.getReg(0)) 3545 .addUse(Mul.getReg(0)) 3546 .addUse(Scale) 3547 .setMIFlags(Flags); 3548 3549 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3550 .addUse(Fmas.getReg(0)) 3551 .addUse(RHS) 3552 .addUse(LHS) 3553 .setMIFlags(Flags); 3554 3555 MI.eraseFromParent(); 3556 return true; 3557 } 3558 3559 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3560 MachineRegisterInfo &MRI, 3561 MachineIRBuilder &B) const { 3562 Register Res = MI.getOperand(0).getReg(); 3563 Register LHS = MI.getOperand(2).getReg(); 3564 Register RHS = MI.getOperand(3).getReg(); 3565 uint16_t Flags = MI.getFlags(); 3566 3567 LLT S32 = LLT::scalar(32); 3568 LLT S1 = LLT::scalar(1); 3569 3570 auto Abs = B.buildFAbs(S32, RHS, Flags); 3571 const APFloat C0Val(1.0f); 3572 3573 auto C0 = B.buildConstant(S32, 0x6f800000); 3574 auto C1 = B.buildConstant(S32, 0x2f800000); 3575 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3576 3577 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3578 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3579 3580 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3581 3582 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3583 .addUse(Mul0.getReg(0)) 3584 .setMIFlags(Flags); 3585 3586 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3587 3588 B.buildFMul(Res, Sel, Mul1, Flags); 3589 3590 MI.eraseFromParent(); 3591 return true; 3592 } 3593 3594 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3595 // FIXME: Why do we handle this one but not other removed instructions? 3596 // 3597 // Reciprocal square root. The clamp prevents infinite results, clamping 3598 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3599 // +-max_float. 3600 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3601 MachineRegisterInfo &MRI, 3602 MachineIRBuilder &B) const { 3603 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3604 return true; 3605 3606 Register Dst = MI.getOperand(0).getReg(); 3607 Register Src = MI.getOperand(2).getReg(); 3608 auto Flags = MI.getFlags(); 3609 3610 LLT Ty = MRI.getType(Dst); 3611 3612 const fltSemantics *FltSemantics; 3613 if (Ty == LLT::scalar(32)) 3614 FltSemantics = &APFloat::IEEEsingle(); 3615 else if (Ty == LLT::scalar(64)) 3616 FltSemantics = &APFloat::IEEEdouble(); 3617 else 3618 return false; 3619 3620 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3621 .addUse(Src) 3622 .setMIFlags(Flags); 3623 3624 // We don't need to concern ourselves with the snan handling difference, since 3625 // the rsq quieted (or not) so use the one which will directly select. 
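// Clamp by taking min(rsq(x), +max_float) followed by max(..., -max_float),
// using the IEEE-aware min/max variants when the function runs in IEEE mode
// so the result selects directly to the matching instruction.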
3626 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3627 const bool UseIEEE = MFI->getMode().IEEE; 3628 3629 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3630 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3631 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3632 3633 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3634 3635 if (UseIEEE) 3636 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3637 else 3638 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3639 MI.eraseFromParent(); 3640 return true; 3641 } 3642 3643 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3644 switch (IID) { 3645 case Intrinsic::amdgcn_ds_fadd: 3646 return AMDGPU::G_ATOMICRMW_FADD; 3647 case Intrinsic::amdgcn_ds_fmin: 3648 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3649 case Intrinsic::amdgcn_ds_fmax: 3650 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3651 default: 3652 llvm_unreachable("not a DS FP intrinsic"); 3653 } 3654 } 3655 3656 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3657 MachineInstr &MI, 3658 Intrinsic::ID IID) const { 3659 GISelChangeObserver &Observer = Helper.Observer; 3660 Observer.changingInstr(MI); 3661 3662 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3663 3664 // The remaining operands were used to set fields in the MemOperand on 3665 // construction. 3666 for (int I = 6; I > 3; --I) 3667 MI.RemoveOperand(I); 3668 3669 MI.RemoveOperand(1); // Remove the intrinsic ID. 3670 Observer.changedInstr(MI); 3671 return true; 3672 } 3673 3674 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3675 MachineRegisterInfo &MRI, 3676 MachineIRBuilder &B) const { 3677 uint64_t Offset = 3678 ST.getTargetLowering()->getImplicitParameterOffset( 3679 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3680 LLT DstTy = MRI.getType(DstReg); 3681 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3682 3683 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3684 if (!loadInputValue(KernargPtrReg, B, 3685 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3686 return false; 3687 3688 // FIXME: This should be nuw 3689 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3690 return true; 3691 } 3692 3693 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3694 MachineRegisterInfo &MRI, 3695 MachineIRBuilder &B) const { 3696 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3697 if (!MFI->isEntryFunction()) { 3698 return legalizePreloadedArgIntrin(MI, MRI, B, 3699 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3700 } 3701 3702 Register DstReg = MI.getOperand(0).getReg(); 3703 if (!getImplicitArgPtr(DstReg, MRI, B)) 3704 return false; 3705 3706 MI.eraseFromParent(); 3707 return true; 3708 } 3709 3710 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3711 MachineRegisterInfo &MRI, 3712 MachineIRBuilder &B, 3713 unsigned AddrSpace) const { 3714 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3715 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3716 Register Hi32 = Unmerge.getReg(1); 3717 3718 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3719 MI.eraseFromParent(); 3720 return true; 3721 } 3722 3723 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3724 // offset (the offset that is included in bounds checking and swizzling, to be 3725 // split between the instruction's voffset and immoffset fields) 
and soffset 3726 // (the offset that is excluded from bounds checking and swizzling, to go in 3727 // the instruction's soffset field). This function takes the first kind of 3728 // offset and figures out how to split it between voffset and immoffset. 3729 std::pair<Register, unsigned> 3730 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3731 Register OrigOffset) const { 3732 const unsigned MaxImm = 4095; 3733 Register BaseReg; 3734 unsigned ImmOffset; 3735 const LLT S32 = LLT::scalar(32); 3736 MachineRegisterInfo &MRI = *B.getMRI(); 3737 3738 std::tie(BaseReg, ImmOffset) = 3739 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3740 3741 // If BaseReg is a pointer, convert it to int. 3742 if (MRI.getType(BaseReg).isPointer()) 3743 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3744 3745 // If the immediate value is too big for the immoffset field, put the value 3746 // and -4096 into the immoffset field so that the value that is copied/added 3747 // for the voffset field is a multiple of 4096, and it stands more chance 3748 // of being CSEd with the copy/add for another similar load/store. 3749 // However, do not do that rounding down to a multiple of 4096 if that is a 3750 // negative number, as it appears to be illegal to have a negative offset 3751 // in the vgpr, even if adding the immediate offset makes it positive. 3752 unsigned Overflow = ImmOffset & ~MaxImm; 3753 ImmOffset -= Overflow; 3754 if ((int32_t)Overflow < 0) { 3755 Overflow += ImmOffset; 3756 ImmOffset = 0; 3757 } 3758 3759 if (Overflow != 0) { 3760 if (!BaseReg) { 3761 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3762 } else { 3763 auto OverflowVal = B.buildConstant(S32, Overflow); 3764 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3765 } 3766 } 3767 3768 if (!BaseReg) 3769 BaseReg = B.buildConstant(S32, 0).getReg(0); 3770 3771 return std::make_pair(BaseReg, ImmOffset); 3772 } 3773 3774 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3775 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3776 Register VOffset, Register SOffset, 3777 unsigned ImmOffset, Register VIndex, 3778 MachineRegisterInfo &MRI) const { 3779 Optional<ValueAndVReg> MaybeVOffsetVal = 3780 getIConstantVRegValWithLookThrough(VOffset, MRI); 3781 Optional<ValueAndVReg> MaybeSOffsetVal = 3782 getIConstantVRegValWithLookThrough(SOffset, MRI); 3783 Optional<ValueAndVReg> MaybeVIndexVal = 3784 getIConstantVRegValWithLookThrough(VIndex, MRI); 3785 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3786 // update the MMO with that offset. The stride is unknown so we can only do 3787 // this if VIndex is constant 0. 3788 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3789 MaybeVIndexVal->Value == 0) { 3790 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3791 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3792 MMO->setOffset(TotalOffset); 3793 } else { 3794 // We don't have a constant combined offset to use in the MMO. Give up. 3795 MMO->setValue((Value *)nullptr); 3796 } 3797 } 3798 3799 /// Handle register layout difference for f16 images for some subtargets. 
3800 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3801 MachineRegisterInfo &MRI, 3802 Register Reg, 3803 bool ImageStore) const { 3804 const LLT S16 = LLT::scalar(16); 3805 const LLT S32 = LLT::scalar(32); 3806 LLT StoreVT = MRI.getType(Reg); 3807 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3808 3809 if (ST.hasUnpackedD16VMem()) { 3810 auto Unmerge = B.buildUnmerge(S16, Reg); 3811 3812 SmallVector<Register, 4> WideRegs; 3813 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3814 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3815 3816 int NumElts = StoreVT.getNumElements(); 3817 3818 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3819 .getReg(0); 3820 } 3821 3822 if (ImageStore && ST.hasImageStoreD16Bug()) { 3823 if (StoreVT.getNumElements() == 2) { 3824 SmallVector<Register, 4> PackedRegs; 3825 Reg = B.buildBitcast(S32, Reg).getReg(0); 3826 PackedRegs.push_back(Reg); 3827 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3828 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3829 .getReg(0); 3830 } 3831 3832 if (StoreVT.getNumElements() == 3) { 3833 SmallVector<Register, 4> PackedRegs; 3834 auto Unmerge = B.buildUnmerge(S16, Reg); 3835 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3836 PackedRegs.push_back(Unmerge.getReg(I)); 3837 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3838 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3839 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3840 } 3841 3842 if (StoreVT.getNumElements() == 4) { 3843 SmallVector<Register, 4> PackedRegs; 3844 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3845 auto Unmerge = B.buildUnmerge(S32, Reg); 3846 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3847 PackedRegs.push_back(Unmerge.getReg(I)); 3848 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3849 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3850 .getReg(0); 3851 } 3852 3853 llvm_unreachable("invalid data type"); 3854 } 3855 3856 return Reg; 3857 } 3858 3859 Register AMDGPULegalizerInfo::fixStoreSourceType( 3860 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3861 MachineRegisterInfo *MRI = B.getMRI(); 3862 LLT Ty = MRI->getType(VData); 3863 3864 const LLT S16 = LLT::scalar(16); 3865 3866 // Fixup illegal register types for i8 stores. 3867 if (Ty == LLT::scalar(8) || Ty == S16) { 3868 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3869 return AnyExt; 3870 } 3871 3872 if (Ty.isVector()) { 3873 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3874 if (IsFormat) 3875 return handleD16VData(B, *MRI, VData); 3876 } 3877 } 3878 3879 return VData; 3880 } 3881 3882 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3883 MachineRegisterInfo &MRI, 3884 MachineIRBuilder &B, 3885 bool IsTyped, 3886 bool IsFormat) const { 3887 Register VData = MI.getOperand(1).getReg(); 3888 LLT Ty = MRI.getType(VData); 3889 LLT EltTy = Ty.getScalarType(); 3890 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3891 const LLT S32 = LLT::scalar(32); 3892 3893 VData = fixStoreSourceType(B, VData, IsFormat); 3894 Register RSrc = MI.getOperand(2).getReg(); 3895 3896 MachineMemOperand *MMO = *MI.memoperands_begin(); 3897 const int MemSize = MMO->getSize(); 3898 3899 unsigned ImmOffset; 3900 3901 // The typed intrinsics add an immediate after the registers. 
3902 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3903 3904 // The struct intrinsic variants add one additional operand over raw. 3905 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3906 Register VIndex; 3907 int OpOffset = 0; 3908 if (HasVIndex) { 3909 VIndex = MI.getOperand(3).getReg(); 3910 OpOffset = 1; 3911 } else { 3912 VIndex = B.buildConstant(S32, 0).getReg(0); 3913 } 3914 3915 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3916 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3917 3918 unsigned Format = 0; 3919 if (IsTyped) { 3920 Format = MI.getOperand(5 + OpOffset).getImm(); 3921 ++OpOffset; 3922 } 3923 3924 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3925 3926 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3927 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3928 3929 unsigned Opc; 3930 if (IsTyped) { 3931 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3932 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3933 } else if (IsFormat) { 3934 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3935 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3936 } else { 3937 switch (MemSize) { 3938 case 1: 3939 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3940 break; 3941 case 2: 3942 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3943 break; 3944 default: 3945 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3946 break; 3947 } 3948 } 3949 3950 auto MIB = B.buildInstr(Opc) 3951 .addUse(VData) // vdata 3952 .addUse(RSrc) // rsrc 3953 .addUse(VIndex) // vindex 3954 .addUse(VOffset) // voffset 3955 .addUse(SOffset) // soffset 3956 .addImm(ImmOffset); // offset(imm) 3957 3958 if (IsTyped) 3959 MIB.addImm(Format); 3960 3961 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3962 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3963 .addMemOperand(MMO); 3964 3965 MI.eraseFromParent(); 3966 return true; 3967 } 3968 3969 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3970 MachineRegisterInfo &MRI, 3971 MachineIRBuilder &B, 3972 bool IsFormat, 3973 bool IsTyped) const { 3974 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3975 MachineMemOperand *MMO = *MI.memoperands_begin(); 3976 const LLT MemTy = MMO->getMemoryType(); 3977 const LLT S32 = LLT::scalar(32); 3978 3979 Register Dst = MI.getOperand(0).getReg(); 3980 Register RSrc = MI.getOperand(2).getReg(); 3981 3982 // The typed intrinsics add an immediate after the registers. 3983 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3984 3985 // The struct intrinsic variants add one additional operand over raw. 
3986 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3987 Register VIndex; 3988 int OpOffset = 0; 3989 if (HasVIndex) { 3990 VIndex = MI.getOperand(3).getReg(); 3991 OpOffset = 1; 3992 } else { 3993 VIndex = B.buildConstant(S32, 0).getReg(0); 3994 } 3995 3996 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3997 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3998 3999 unsigned Format = 0; 4000 if (IsTyped) { 4001 Format = MI.getOperand(5 + OpOffset).getImm(); 4002 ++OpOffset; 4003 } 4004 4005 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4006 unsigned ImmOffset; 4007 4008 LLT Ty = MRI.getType(Dst); 4009 LLT EltTy = Ty.getScalarType(); 4010 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4011 const bool Unpacked = ST.hasUnpackedD16VMem(); 4012 4013 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4014 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4015 4016 unsigned Opc; 4017 4018 if (IsTyped) { 4019 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4020 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4021 } else if (IsFormat) { 4022 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4023 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4024 } else { 4025 switch (MemTy.getSizeInBits()) { 4026 case 8: 4027 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4028 break; 4029 case 16: 4030 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4031 break; 4032 default: 4033 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4034 break; 4035 } 4036 } 4037 4038 Register LoadDstReg; 4039 4040 bool IsExtLoad = 4041 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4042 LLT UnpackedTy = Ty.changeElementSize(32); 4043 4044 if (IsExtLoad) 4045 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4046 else if (Unpacked && IsD16 && Ty.isVector()) 4047 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4048 else 4049 LoadDstReg = Dst; 4050 4051 auto MIB = B.buildInstr(Opc) 4052 .addDef(LoadDstReg) // vdata 4053 .addUse(RSrc) // rsrc 4054 .addUse(VIndex) // vindex 4055 .addUse(VOffset) // voffset 4056 .addUse(SOffset) // soffset 4057 .addImm(ImmOffset); // offset(imm) 4058 4059 if (IsTyped) 4060 MIB.addImm(Format); 4061 4062 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4063 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4064 .addMemOperand(MMO); 4065 4066 if (LoadDstReg != Dst) { 4067 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4068 4069 // Widen result for extending loads was widened. 4070 if (IsExtLoad) 4071 B.buildTrunc(Dst, LoadDstReg); 4072 else { 4073 // Repack to original 16-bit vector result 4074 // FIXME: G_TRUNC should work, but legalization currently fails 4075 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4076 SmallVector<Register, 4> Repack; 4077 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4078 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4079 B.buildMerge(Dst, Repack); 4080 } 4081 } 4082 4083 MI.eraseFromParent(); 4084 return true; 4085 } 4086 4087 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4088 MachineIRBuilder &B, 4089 bool IsInc) const { 4090 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4091 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4092 B.buildInstr(Opc) 4093 .addDef(MI.getOperand(0).getReg()) 4094 .addUse(MI.getOperand(2).getReg()) 4095 .addUse(MI.getOperand(3).getReg()) 4096 .cloneMemRefs(MI); 4097 MI.eraseFromParent(); 4098 return true; 4099 } 4100 4101 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4102 switch (IntrID) { 4103 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4104 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4105 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4106 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4107 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4108 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4109 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4110 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4111 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4112 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4113 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4115 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4116 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4118 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4119 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4121 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4122 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4124 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4125 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4127 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4128 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4130 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4131 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4133 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4134 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4136 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4137 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4138 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4139 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4140 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4141 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4142 case Intrinsic::amdgcn_buffer_atomic_fadd: 4143 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4144 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4145 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4146 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4147 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4148 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4149 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4150 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4151 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4152 default: 4153 llvm_unreachable("unhandled atomic opcode"); 4154 } 4155 } 4156 4157 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4158 MachineIRBuilder &B, 4159 Intrinsic::ID IID) const { 4160 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4161 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4162 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4163 4164 Register Dst; 4165 4166 int OpOffset = 0; 4167 if (HasReturn) { 4168 // A few FP atomics do not support return values. 
4169 Dst = MI.getOperand(0).getReg(); 4170 } else { 4171 OpOffset = -1; 4172 } 4173 4174 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4175 Register CmpVal; 4176 4177 if (IsCmpSwap) { 4178 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4179 ++OpOffset; 4180 } 4181 4182 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4183 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4184 4185 // The struct intrinsic variants add one additional operand over raw. 4186 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4187 Register VIndex; 4188 if (HasVIndex) { 4189 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4190 ++OpOffset; 4191 } else { 4192 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4193 } 4194 4195 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4196 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4197 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4198 4199 MachineMemOperand *MMO = *MI.memoperands_begin(); 4200 4201 unsigned ImmOffset; 4202 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4203 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4204 4205 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4206 4207 if (HasReturn) 4208 MIB.addDef(Dst); 4209 4210 MIB.addUse(VData); // vdata 4211 4212 if (IsCmpSwap) 4213 MIB.addReg(CmpVal); 4214 4215 MIB.addUse(RSrc) // rsrc 4216 .addUse(VIndex) // vindex 4217 .addUse(VOffset) // voffset 4218 .addUse(SOffset) // soffset 4219 .addImm(ImmOffset) // offset(imm) 4220 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4221 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4222 .addMemOperand(MMO); 4223 4224 MI.eraseFromParent(); 4225 return true; 4226 } 4227 4228 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4229 /// vector with s16 typed elements. 4230 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4231 SmallVectorImpl<Register> &PackedAddrs, 4232 unsigned ArgOffset, 4233 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4234 bool IsA16, bool IsG16) { 4235 const LLT S16 = LLT::scalar(16); 4236 const LLT V2S16 = LLT::fixed_vector(2, 16); 4237 auto EndIdx = Intr->VAddrEnd; 4238 4239 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4240 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4241 if (!SrcOp.isReg()) 4242 continue; // _L to _LZ may have eliminated this. 4243 4244 Register AddrReg = SrcOp.getReg(); 4245 4246 if ((I < Intr->GradientStart) || 4247 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4248 (I >= Intr->CoordStart && !IsA16)) { 4249 // Handle any gradient or coordinate operands that should not be packed 4250 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4251 PackedAddrs.push_back(AddrReg); 4252 } else { 4253 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4254 // derivatives dx/dh and dx/dv are packed with undef. 
4255 if (((I + 1) >= EndIdx) || 4256 ((Intr->NumGradients / 2) % 2 == 1 && 4257 (I == static_cast<unsigned>(Intr->GradientStart + 4258 (Intr->NumGradients / 2) - 1) || 4259 I == static_cast<unsigned>(Intr->GradientStart + 4260 Intr->NumGradients - 1))) || 4261 // Check for _L to _LZ optimization 4262 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4263 PackedAddrs.push_back( 4264 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4265 .getReg(0)); 4266 } else { 4267 PackedAddrs.push_back( 4268 B.buildBuildVector( 4269 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4270 .getReg(0)); 4271 ++I; 4272 } 4273 } 4274 } 4275 } 4276 4277 /// Convert from separate vaddr components to a single vector address register, 4278 /// and replace the remaining operands with $noreg. 4279 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4280 int DimIdx, int NumVAddrs) { 4281 const LLT S32 = LLT::scalar(32); 4282 4283 SmallVector<Register, 8> AddrRegs; 4284 for (int I = 0; I != NumVAddrs; ++I) { 4285 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4286 if (SrcOp.isReg()) { 4287 AddrRegs.push_back(SrcOp.getReg()); 4288 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4289 } 4290 } 4291 4292 int NumAddrRegs = AddrRegs.size(); 4293 if (NumAddrRegs != 1) { 4294 // Above 8 elements round up to next power of 2 (i.e. 16). 4295 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4296 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4297 auto Undef = B.buildUndef(S32); 4298 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4299 NumAddrRegs = RoundedNumRegs; 4300 } 4301 4302 auto VAddr = 4303 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4304 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4305 } 4306 4307 for (int I = 1; I != NumVAddrs; ++I) { 4308 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4309 if (SrcOp.isReg()) 4310 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4311 } 4312 } 4313 4314 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4315 /// 4316 /// Depending on the subtarget, load/store with 16-bit element data need to be 4317 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4318 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4319 /// registers. 4320 /// 4321 /// We don't want to directly select image instructions just yet, but also want 4322 /// to exposes all register repacking to the legalizer/combiners. We also don't 4323 /// want a selected instrution entering RegBankSelect. In order to avoid 4324 /// defining a multitude of intermediate image instructions, directly hack on 4325 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4326 /// padding now unnecessary arguments with $noreg. 4327 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4328 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4329 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4330 4331 const unsigned NumDefs = MI.getNumExplicitDefs(); 4332 const unsigned ArgOffset = NumDefs + 1; 4333 bool IsTFE = NumDefs == 2; 4334 // We are only processing the operands of d16 image operations on subtargets 4335 // that use the unpacked register layout, or need to repack the TFE result. 4336 4337 // TODO: Do we need to guard against already legalized intrinsics? 
4338 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4339 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4340 4341 MachineRegisterInfo *MRI = B.getMRI(); 4342 const LLT S32 = LLT::scalar(32); 4343 const LLT S16 = LLT::scalar(16); 4344 const LLT V2S16 = LLT::fixed_vector(2, 16); 4345 4346 unsigned DMask = 0; 4347 4348 // Check for 16 bit addresses and pack if true. 4349 LLT GradTy = 4350 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4351 LLT AddrTy = 4352 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4353 const bool IsG16 = GradTy == S16; 4354 const bool IsA16 = AddrTy == S16; 4355 4356 int DMaskLanes = 0; 4357 if (!BaseOpcode->Atomic) { 4358 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4359 if (BaseOpcode->Gather4) { 4360 DMaskLanes = 4; 4361 } else if (DMask != 0) { 4362 DMaskLanes = countPopulation(DMask); 4363 } else if (!IsTFE && !BaseOpcode->Store) { 4364 // If dmask is 0, this is a no-op load. This can be eliminated. 4365 B.buildUndef(MI.getOperand(0)); 4366 MI.eraseFromParent(); 4367 return true; 4368 } 4369 } 4370 4371 Observer.changingInstr(MI); 4372 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4373 4374 unsigned NewOpcode = NumDefs == 0 ? 4375 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4376 4377 // Track that we legalized this 4378 MI.setDesc(B.getTII().get(NewOpcode)); 4379 4380 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4381 // dmask to be at least 1 otherwise the instruction will fail 4382 if (IsTFE && DMask == 0) { 4383 DMask = 0x1; 4384 DMaskLanes = 1; 4385 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4386 } 4387 4388 if (BaseOpcode->Atomic) { 4389 Register VData0 = MI.getOperand(2).getReg(); 4390 LLT Ty = MRI->getType(VData0); 4391 4392 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4393 if (Ty.isVector()) 4394 return false; 4395 4396 if (BaseOpcode->AtomicX2) { 4397 Register VData1 = MI.getOperand(3).getReg(); 4398 // The two values are packed in one register. 4399 LLT PackedTy = LLT::fixed_vector(2, Ty); 4400 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4401 MI.getOperand(2).setReg(Concat.getReg(0)); 4402 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4403 } 4404 } 4405 4406 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4407 4408 // Optimize _L to _LZ when _L is zero 4409 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4410 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4411 const ConstantFP *ConstantLod; 4412 4413 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4414 m_GFCst(ConstantLod))) { 4415 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4416 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4417 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4418 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4419 Intr->Dim); 4420 4421 // The starting indexes should remain in the same place. 
4422 --CorrectedNumVAddrs; 4423 4424 MI.getOperand(MI.getNumExplicitDefs()) 4425 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4426 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4427 Intr = NewImageDimIntr; 4428 } 4429 } 4430 } 4431 4432 // Optimize _mip away, when 'lod' is zero 4433 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4434 int64_t ConstantLod; 4435 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4436 m_ICst(ConstantLod))) { 4437 if (ConstantLod == 0) { 4438 // TODO: Change intrinsic opcode and remove operand instead or replacing 4439 // it with 0, as the _L to _LZ handling is done above. 4440 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4441 --CorrectedNumVAddrs; 4442 } 4443 } 4444 } 4445 4446 // Rewrite the addressing register layout before doing anything else. 4447 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4448 // 16 bit gradients are supported, but are tied to the A16 control 4449 // so both gradients and addresses must be 16 bit 4450 return false; 4451 } 4452 4453 if (IsA16 && !ST.hasA16()) { 4454 // A16 not supported 4455 return false; 4456 } 4457 4458 if (IsA16 || IsG16) { 4459 if (Intr->NumVAddrs > 1) { 4460 SmallVector<Register, 4> PackedRegs; 4461 4462 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4463 IsG16); 4464 4465 // See also below in the non-a16 branch 4466 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4467 PackedRegs.size() <= ST.getNSAMaxSize(); 4468 4469 if (!UseNSA && PackedRegs.size() > 1) { 4470 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4471 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4472 PackedRegs[0] = Concat.getReg(0); 4473 PackedRegs.resize(1); 4474 } 4475 4476 const unsigned NumPacked = PackedRegs.size(); 4477 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4478 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4479 if (!SrcOp.isReg()) { 4480 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4481 continue; 4482 } 4483 4484 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4485 4486 if (I - Intr->VAddrStart < NumPacked) 4487 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4488 else 4489 SrcOp.setReg(AMDGPU::NoRegister); 4490 } 4491 } 4492 } else { 4493 // If the register allocator cannot place the address registers contiguously 4494 // without introducing moves, then using the non-sequential address encoding 4495 // is always preferable, since it saves VALU instructions and is usually a 4496 // wash in terms of code size or even better. 4497 // 4498 // However, we currently have no way of hinting to the register allocator 4499 // that MIMG addresses should be placed contiguously when it is possible to 4500 // do so, so force non-NSA for the common 2-address case as a heuristic. 4501 // 4502 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4503 // allocation when possible. 4504 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4505 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4506 4507 if (!UseNSA && Intr->NumVAddrs > 1) 4508 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4509 Intr->NumVAddrs); 4510 } 4511 4512 int Flags = 0; 4513 if (IsA16) 4514 Flags |= 1; 4515 if (IsG16) 4516 Flags |= 2; 4517 MI.addOperand(MachineOperand::CreateImm(Flags)); 4518 4519 if (BaseOpcode->Store) { // No TFE for stores? 
4520 // TODO: Handle dmask trim 4521 Register VData = MI.getOperand(1).getReg(); 4522 LLT Ty = MRI->getType(VData); 4523 if (!Ty.isVector() || Ty.getElementType() != S16) 4524 return true; 4525 4526 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4527 if (RepackedReg != VData) { 4528 MI.getOperand(1).setReg(RepackedReg); 4529 } 4530 4531 return true; 4532 } 4533 4534 Register DstReg = MI.getOperand(0).getReg(); 4535 LLT Ty = MRI->getType(DstReg); 4536 const LLT EltTy = Ty.getScalarType(); 4537 const bool IsD16 = Ty.getScalarType() == S16; 4538 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4539 4540 // Confirm that the return type is large enough for the dmask specified 4541 if (NumElts < DMaskLanes) 4542 return false; 4543 4544 if (NumElts > 4 || DMaskLanes > 4) 4545 return false; 4546 4547 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4548 const LLT AdjustedTy = 4549 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4550 4551 // The raw dword aligned data component of the load. The only legal cases 4552 // where this matters should be when using the packed D16 format, for 4553 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4554 LLT RoundedTy; 4555 4556 // S32 vector to to cover all data, plus TFE result element. 4557 LLT TFETy; 4558 4559 // Register type to use for each loaded component. Will be S32 or V2S16. 4560 LLT RegTy; 4561 4562 if (IsD16 && ST.hasUnpackedD16VMem()) { 4563 RoundedTy = 4564 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4565 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4566 RegTy = S32; 4567 } else { 4568 unsigned EltSize = EltTy.getSizeInBits(); 4569 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4570 unsigned RoundedSize = 32 * RoundedElts; 4571 RoundedTy = LLT::scalarOrVector( 4572 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4573 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4574 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4575 } 4576 4577 // The return type does not need adjustment. 4578 // TODO: Should we change s16 case to s32 or <2 x s16>? 4579 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4580 return true; 4581 4582 Register Dst1Reg; 4583 4584 // Insert after the instruction. 4585 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4586 4587 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4588 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4589 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4590 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4591 4592 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4593 4594 MI.getOperand(0).setReg(NewResultReg); 4595 4596 // In the IR, TFE is supposed to be used with a 2 element struct return 4597 // type. The instruction really returns these two values in one contiguous 4598 // register, with one additional dword beyond the loaded data. Rewrite the 4599 // return type to use a single register result. 4600 4601 if (IsTFE) { 4602 Dst1Reg = MI.getOperand(1).getReg(); 4603 if (MRI->getType(Dst1Reg) != S32) 4604 return false; 4605 4606 // TODO: Make sure the TFE operand bit is set. 4607 MI.RemoveOperand(1); 4608 4609 // Handle the easy case that requires no repack instructions. 4610 if (Ty == S32) { 4611 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4612 return true; 4613 } 4614 } 4615 4616 // Now figure out how to copy the new result register back into the old 4617 // result. 
4618 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4619 4620 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4621 4622 if (ResultNumRegs == 1) { 4623 assert(!IsTFE); 4624 ResultRegs[0] = NewResultReg; 4625 } else { 4626 // We have to repack into a new vector of some kind. 4627 for (int I = 0; I != NumDataRegs; ++I) 4628 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4629 B.buildUnmerge(ResultRegs, NewResultReg); 4630 4631 // Drop the final TFE element to get the data part. The TFE result is 4632 // directly written to the right place already. 4633 if (IsTFE) 4634 ResultRegs.resize(NumDataRegs); 4635 } 4636 4637 // For an s16 scalar result, we form an s32 result with a truncate regardless 4638 // of packed vs. unpacked. 4639 if (IsD16 && !Ty.isVector()) { 4640 B.buildTrunc(DstReg, ResultRegs[0]); 4641 return true; 4642 } 4643 4644 // Avoid a build/concat_vector of 1 entry. 4645 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4646 B.buildBitcast(DstReg, ResultRegs[0]); 4647 return true; 4648 } 4649 4650 assert(Ty.isVector()); 4651 4652 if (IsD16) { 4653 // For packed D16 results with TFE enabled, all the data components are 4654 // S32. Cast back to the expected type. 4655 // 4656 // TODO: We don't really need to use load s32 elements. We would only need one 4657 // cast for the TFE result if a multiple of v2s16 was used. 4658 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4659 for (Register &Reg : ResultRegs) 4660 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4661 } else if (ST.hasUnpackedD16VMem()) { 4662 for (Register &Reg : ResultRegs) 4663 Reg = B.buildTrunc(S16, Reg).getReg(0); 4664 } 4665 } 4666 4667 auto padWithUndef = [&](LLT Ty, int NumElts) { 4668 if (NumElts == 0) 4669 return; 4670 Register Undef = B.buildUndef(Ty).getReg(0); 4671 for (int I = 0; I != NumElts; ++I) 4672 ResultRegs.push_back(Undef); 4673 }; 4674 4675 // Pad out any elements eliminated due to the dmask. 4676 LLT ResTy = MRI->getType(ResultRegs[0]); 4677 if (!ResTy.isVector()) { 4678 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4679 B.buildBuildVector(DstReg, ResultRegs); 4680 return true; 4681 } 4682 4683 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4684 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4685 4686 // Deal with the one annoying legal case. 4687 const LLT V3S16 = LLT::fixed_vector(3, 16); 4688 if (Ty == V3S16) { 4689 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4690 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4691 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4692 return true; 4693 } 4694 4695 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4696 B.buildConcatVectors(DstReg, ResultRegs); 4697 return true; 4698 } 4699 4700 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4701 LegalizerHelper &Helper, MachineInstr &MI) const { 4702 MachineIRBuilder &B = Helper.MIRBuilder; 4703 GISelChangeObserver &Observer = Helper.Observer; 4704 4705 Register Dst = MI.getOperand(0).getReg(); 4706 LLT Ty = B.getMRI()->getType(Dst); 4707 unsigned Size = Ty.getSizeInBits(); 4708 MachineFunction &MF = B.getMF(); 4709 4710 Observer.changingInstr(MI); 4711 4712 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4713 Ty = getBitcastRegisterType(Ty); 4714 Helper.bitcastDst(MI, Ty, 0); 4715 Dst = MI.getOperand(0).getReg(); 4716 B.setInsertPt(B.getMBB(), MI); 4717 } 4718 4719 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4720 // should be fixed to have a memory operand. Since it's readnone, we're not 4721 // allowed to add one. 4722 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4723 MI.RemoveOperand(1); // Remove intrinsic ID 4724 4725 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4726 // TODO: Should this use datalayout alignment? 4727 const unsigned MemSize = (Size + 7) / 8; 4728 const Align MemAlign(4); 4729 MachineMemOperand *MMO = MF.getMachineMemOperand( 4730 MachinePointerInfo(), 4731 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4732 MachineMemOperand::MOInvariant, 4733 MemSize, MemAlign); 4734 MI.addMemOperand(MF, MMO); 4735 4736 // There are no 96-bit result scalar loads, but widening to 128-bit should 4737 // always be legal. We may need to restore this to a 96-bit result if it turns 4738 // out this needs to be converted to a vector load during RegBankSelect. 4739 if (!isPowerOf2_32(Size)) { 4740 if (Ty.isVector()) 4741 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4742 else 4743 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4744 } 4745 4746 Observer.changedInstr(MI); 4747 return true; 4748 } 4749 4750 // TODO: Move to selection 4751 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4752 MachineRegisterInfo &MRI, 4753 MachineIRBuilder &B) const { 4754 if (!ST.isTrapHandlerEnabled() || 4755 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4756 return legalizeTrapEndpgm(MI, MRI, B); 4757 4758 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4759 switch (*HsaAbiVer) { 4760 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4761 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4762 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4763 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4764 return ST.supportsGetDoorbellID() ? 4765 legalizeTrapHsa(MI, MRI, B) : 4766 legalizeTrapHsaQueuePtr(MI, MRI, B); 4767 } 4768 } 4769 4770 llvm_unreachable("Unknown trap handler"); 4771 } 4772 4773 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4774 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4775 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4776 MI.eraseFromParent(); 4777 return true; 4778 } 4779 4780 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4781 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4782 // Pass queue pointer to trap handler as input, and insert trap instruction 4783 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4784 Register LiveIn = 4785 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4786 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4787 return false; 4788 4789 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4790 B.buildCopy(SGPR01, LiveIn); 4791 B.buildInstr(AMDGPU::S_TRAP) 4792 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4793 .addReg(SGPR01, RegState::Implicit); 4794 4795 MI.eraseFromParent(); 4796 return true; 4797 } 4798 4799 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4801 B.buildInstr(AMDGPU::S_TRAP) 4802 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4803 MI.eraseFromParent(); 4804 return true; 4805 } 4806 4807 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4808 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4809 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4810 // accordingly 4811 if (!ST.isTrapHandlerEnabled() || 4812 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4813 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4814 "debugtrap handler not supported", 4815 MI.getDebugLoc(), DS_Warning); 4816 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4817 Ctx.diagnose(NoTrap); 4818 } else { 4819 // Insert debug-trap instruction 4820 B.buildInstr(AMDGPU::S_TRAP) 4821 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4822 } 4823 4824 MI.eraseFromParent(); 4825 return true; 4826 } 4827 4828 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4829 MachineIRBuilder &B) const { 4830 MachineRegisterInfo &MRI = *B.getMRI(); 4831 const LLT S16 = LLT::scalar(16); 4832 const LLT S32 = LLT::scalar(32); 4833 4834 Register DstReg = MI.getOperand(0).getReg(); 4835 Register NodePtr = MI.getOperand(2).getReg(); 4836 Register RayExtent = MI.getOperand(3).getReg(); 4837 Register RayOrigin = MI.getOperand(4).getReg(); 4838 Register RayDir = MI.getOperand(5).getReg(); 4839 Register RayInvDir = MI.getOperand(6).getReg(); 4840 Register TDescr = MI.getOperand(7).getReg(); 4841 4842 if (!ST.hasGFX10_AEncoding()) { 4843 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4844 "intrinsic not supported on subtarget", 4845 MI.getDebugLoc()); 4846 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4847 return false; 4848 } 4849 4850 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4851 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4852 const unsigned NumVDataDwords = 4; 4853 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4854 const bool UseNSA = 4855 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4856 const unsigned BaseOpcodes[2][2] = { 4857 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4858 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4859 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4860 int Opcode; 4861 if (UseNSA) { 4862 Opcode = 4863 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4864 NumVDataDwords, NumVAddrDwords); 4865 } else { 4866 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4867 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4868 PowerOf2Ceil(NumVAddrDwords)); 4869 } 4870 assert(Opcode != -1); 4871 4872 SmallVector<Register, 12> Ops; 4873 if (Is64) { 4874 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4875 Ops.push_back(Unmerge.getReg(0)); 4876 Ops.push_back(Unmerge.getReg(1)); 4877 } else { 4878 Ops.push_back(NodePtr); 4879 } 4880 Ops.push_back(RayExtent); 4881 4882 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4883 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4884 Ops.push_back(Unmerge.getReg(0)); 4885 Ops.push_back(Unmerge.getReg(1)); 4886 Ops.push_back(Unmerge.getReg(2)); 4887 }; 4888 4889 packLanes(RayOrigin); 4890 if (IsA16) { 4891 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4892 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4893 Register R1 = MRI.createGenericVirtualRegister(S32); 4894 Register R2 = MRI.createGenericVirtualRegister(S32); 4895 Register R3 = MRI.createGenericVirtualRegister(S32); 4896 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4897 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4898 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4899 
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}