1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returs true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerIf(typeIs(1, S1)) 817 .customFor({{S32, S64}, {S64, S64}}); 818 if (ST.has16BitInsts()) 819 IToFP.legalFor({{S16, S16}}); 820 IToFP.clampScalar(1, S32, S64) 821 .minScalar(0, S32) 822 .scalarize(0) 823 .widenScalarToNextPow2(1); 824 825 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 826 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 827 .customFor({{S64, S32}, {S64, S64}}) 828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 829 if (ST.has16BitInsts()) 830 FPToI.legalFor({{S16, S16}}); 831 else 832 FPToI.minScalar(1, S32); 833 834 FPToI.minScalar(0, S32) 835 .widenScalarToNextPow2(0, 32) 836 .scalarize(0) 837 .lower(); 838 839 // Lower roundeven into G_FRINT 840 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 841 .scalarize(0) 842 .lower(); 843 844 if (ST.has16BitInsts()) { 845 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 846 .legalFor({S16, S32, S64}) 847 .clampScalar(0, S16, S64) 848 .scalarize(0); 849 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 850 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 851 .legalFor({S32, S64}) 852 .clampScalar(0, S32, S64) 853 .scalarize(0); 854 } else { 855 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 856 .legalFor({S32}) 857 .customFor({S64}) 858 .clampScalar(0, S32, S64) 859 .scalarize(0); 860 } 861 862 getActionDefinitionsBuilder(G_PTR_ADD) 863 .legalIf(all(isPointer(0), sameSize(0, 1))) 864 .scalarize(0) 865 .scalarSameSizeAs(1, 0); 866 867 getActionDefinitionsBuilder(G_PTRMASK) 868 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 869 .scalarSameSizeAs(1, 0) 870 .scalarize(0); 871 872 auto &CmpBuilder = 873 getActionDefinitionsBuilder(G_ICMP) 874 // The compare output type differs based on the register bank of the output, 875 // so make both s1 and s32 legal. 876 // 877 // Scalar compares producing output in scc will be promoted to s32, as that 878 // is the allocatable register type that will be needed for the copy from 879 // scc. 
This will be promoted during RegBankSelect, and we assume something 880 // before that won't try to use s32 result types. 881 // 882 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 883 // bank. 884 .legalForCartesianProduct( 885 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 886 .legalForCartesianProduct( 887 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 888 if (ST.has16BitInsts()) { 889 CmpBuilder.legalFor({{S1, S16}}); 890 } 891 892 CmpBuilder 893 .widenScalarToNextPow2(1) 894 .clampScalar(1, S32, S64) 895 .scalarize(0) 896 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 897 898 getActionDefinitionsBuilder(G_FCMP) 899 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 900 .widenScalarToNextPow2(1) 901 .clampScalar(1, S32, S64) 902 .scalarize(0); 903 904 // FIXME: fpow has a selection pattern that should move to custom lowering. 905 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 906 if (ST.has16BitInsts()) 907 Exp2Ops.legalFor({S32, S16}); 908 else 909 Exp2Ops.legalFor({S32}); 910 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 911 Exp2Ops.scalarize(0); 912 913 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 914 if (ST.has16BitInsts()) 915 ExpOps.customFor({{S32}, {S16}}); 916 else 917 ExpOps.customFor({S32}); 918 ExpOps.clampScalar(0, MinScalarFPTy, S32) 919 .scalarize(0); 920 921 getActionDefinitionsBuilder(G_FPOWI) 922 .clampScalar(0, MinScalarFPTy, S32) 923 .lower(); 924 925 // The 64-bit versions produce 32-bit results, but only on the SALU. 926 getActionDefinitionsBuilder(G_CTPOP) 927 .legalFor({{S32, S32}, {S32, S64}}) 928 .clampScalar(0, S32, S32) 929 .clampScalar(1, S32, S64) 930 .scalarize(0) 931 .widenScalarToNextPow2(0, 32) 932 .widenScalarToNextPow2(1, 32); 933 934 // The hardware instructions return a different result on 0 than the generic 935 // instructions expect. The hardware produces -1, but these produce the 936 // bitwidth. 937 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 938 .scalarize(0) 939 .clampScalar(0, S32, S32) 940 .clampScalar(1, S32, S64) 941 .widenScalarToNextPow2(0, 32) 942 .widenScalarToNextPow2(1, 32) 943 .custom(); 944 945 // The 64-bit versions produce 32-bit results, but only on the SALU. 946 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 947 .legalFor({{S32, S32}, {S32, S64}}) 948 .clampScalar(0, S32, S32) 949 .clampScalar(1, S32, S64) 950 .scalarize(0) 951 .widenScalarToNextPow2(0, 32) 952 .widenScalarToNextPow2(1, 32); 953 954 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 955 // RegBankSelect. 956 getActionDefinitionsBuilder(G_BITREVERSE) 957 .legalFor({S32, S64}) 958 .clampScalar(0, S32, S64) 959 .scalarize(0) 960 .widenScalarToNextPow2(0); 961 962 if (ST.has16BitInsts()) { 963 getActionDefinitionsBuilder(G_BSWAP) 964 .legalFor({S16, S32, V2S16}) 965 .clampMaxNumElements(0, S16, 2) 966 // FIXME: Fixing non-power-of-2 before clamp is workaround for 967 // narrowScalar limitation. 
968 .widenScalarToNextPow2(0) 969 .clampScalar(0, S16, S32) 970 .scalarize(0); 971 972 if (ST.hasVOP3PInsts()) { 973 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 974 .legalFor({S32, S16, V2S16}) 975 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 976 .clampMaxNumElements(0, S16, 2) 977 .minScalar(0, S16) 978 .widenScalarToNextPow2(0) 979 .scalarize(0) 980 .lower(); 981 } else { 982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 983 .legalFor({S32, S16}) 984 .widenScalarToNextPow2(0) 985 .minScalar(0, S16) 986 .scalarize(0) 987 .lower(); 988 } 989 } else { 990 // TODO: Should have same legality without v_perm_b32 991 getActionDefinitionsBuilder(G_BSWAP) 992 .legalFor({S32}) 993 .lowerIf(scalarNarrowerThan(0, 32)) 994 // FIXME: Fixing non-power-of-2 before clamp is workaround for 995 // narrowScalar limitation. 996 .widenScalarToNextPow2(0) 997 .maxScalar(0, S32) 998 .scalarize(0) 999 .lower(); 1000 1001 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1002 .legalFor({S32}) 1003 .minScalar(0, S32) 1004 .widenScalarToNextPow2(0) 1005 .scalarize(0) 1006 .lower(); 1007 } 1008 1009 getActionDefinitionsBuilder(G_INTTOPTR) 1010 // List the common cases 1011 .legalForCartesianProduct(AddrSpaces64, {S64}) 1012 .legalForCartesianProduct(AddrSpaces32, {S32}) 1013 .scalarize(0) 1014 // Accept any address space as long as the size matches 1015 .legalIf(sameSize(0, 1)) 1016 .widenScalarIf(smallerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }) 1020 .narrowScalarIf(largerThan(1, 0), 1021 [](const LegalityQuery &Query) { 1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1023 }); 1024 1025 getActionDefinitionsBuilder(G_PTRTOINT) 1026 // List the common cases 1027 .legalForCartesianProduct(AddrSpaces64, {S64}) 1028 .legalForCartesianProduct(AddrSpaces32, {S32}) 1029 .scalarize(0) 1030 // Accept any address space as long as the size matches 1031 .legalIf(sameSize(0, 1)) 1032 .widenScalarIf(smallerThan(0, 1), 1033 [](const LegalityQuery &Query) { 1034 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1035 }) 1036 .narrowScalarIf( 1037 largerThan(0, 1), 1038 [](const LegalityQuery &Query) { 1039 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1040 }); 1041 1042 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1043 .scalarize(0) 1044 .custom(); 1045 1046 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1047 bool IsLoad) -> bool { 1048 const LLT DstTy = Query.Types[0]; 1049 1050 // Split vector extloads. 1051 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1052 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1053 1054 if (MemSize < DstTy.getSizeInBits()) 1055 MemSize = std::max(MemSize, AlignBits); 1056 1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1058 return true; 1059 1060 const LLT PtrTy = Query.Types[1]; 1061 unsigned AS = PtrTy.getAddressSpace(); 1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1063 return true; 1064 1065 // Catch weird sized loads that don't evenly divide into the access sizes 1066 // TODO: May be able to widen depending on alignment etc. 1067 unsigned NumRegs = (MemSize + 31) / 32; 1068 if (NumRegs == 3) { 1069 if (!ST.hasDwordx3LoadStores()) 1070 return true; 1071 } else { 1072 // If the alignment allows, these should have been widened. 
1073 if (!isPowerOf2_32(NumRegs)) 1074 return true; 1075 } 1076 1077 if (AlignBits < MemSize) { 1078 const SITargetLowering *TLI = ST.getTargetLowering(); 1079 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1080 Align(AlignBits / 8)); 1081 } 1082 1083 return false; 1084 }; 1085 1086 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1087 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1088 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1089 1090 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1091 // LDS 1092 // TODO: Unsupported flat for SI. 1093 1094 for (unsigned Op : {G_LOAD, G_STORE}) { 1095 const bool IsStore = Op == G_STORE; 1096 1097 auto &Actions = getActionDefinitionsBuilder(Op); 1098 // Explicitly list some common cases. 1099 // TODO: Does this help compile time at all? 1100 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1101 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1102 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1103 {S64, GlobalPtr, S64, GlobalAlign32}, 1104 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1105 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1106 {S32, GlobalPtr, S8, GlobalAlign8}, 1107 {S32, GlobalPtr, S16, GlobalAlign16}, 1108 1109 {S32, LocalPtr, S32, 32}, 1110 {S64, LocalPtr, S64, 32}, 1111 {V2S32, LocalPtr, V2S32, 32}, 1112 {S32, LocalPtr, S8, 8}, 1113 {S32, LocalPtr, S16, 16}, 1114 {V2S16, LocalPtr, S32, 32}, 1115 1116 {S32, PrivatePtr, S32, 32}, 1117 {S32, PrivatePtr, S8, 8}, 1118 {S32, PrivatePtr, S16, 16}, 1119 {V2S16, PrivatePtr, S32, 32}, 1120 1121 {S32, ConstantPtr, S32, GlobalAlign32}, 1122 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1123 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1124 {S64, ConstantPtr, S64, GlobalAlign32}, 1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1126 Actions.legalIf( 1127 [=](const LegalityQuery &Query) -> bool { 1128 return isLoadStoreLegal(ST, Query); 1129 }); 1130 1131 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1132 // 64-bits. 1133 // 1134 // TODO: Should generalize bitcast action into coerce, which will also cover 1135 // inserting addrspacecasts. 1136 Actions.customIf(typeIs(1, Constant32Ptr)); 1137 1138 // Turn any illegal element vectors into something easier to deal 1139 // with. These will ultimately produce 32-bit scalar shifts to extract the 1140 // parts anyway. 1141 // 1142 // For odd 16-bit element vectors, prefer to split those into pieces with 1143 // 16-bit vector parts. 1144 Actions.bitcastIf( 1145 [=](const LegalityQuery &Query) -> bool { 1146 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1147 Query.MMODescrs[0].MemoryTy); 1148 }, bitcastToRegisterType(0)); 1149 1150 if (!IsStore) { 1151 // Widen suitably aligned loads by loading extra bytes. The standard 1152 // legalization actions can't properly express widening memory operands. 
1153 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1154 return shouldWidenLoad(ST, Query, G_LOAD); 1155 }); 1156 } 1157 1158 // FIXME: load/store narrowing should be moved to lower action 1159 Actions 1160 .narrowScalarIf( 1161 [=](const LegalityQuery &Query) -> bool { 1162 return !Query.Types[0].isVector() && 1163 needToSplitMemOp(Query, Op == G_LOAD); 1164 }, 1165 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1166 const LLT DstTy = Query.Types[0]; 1167 const LLT PtrTy = Query.Types[1]; 1168 1169 const unsigned DstSize = DstTy.getSizeInBits(); 1170 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1171 1172 // Split extloads. 1173 if (DstSize > MemSize) 1174 return std::make_pair(0, LLT::scalar(MemSize)); 1175 1176 if (!isPowerOf2_32(DstSize)) { 1177 // We're probably decomposing an odd sized store. Try to split 1178 // to the widest type. TODO: Account for alignment. As-is it 1179 // should be OK, since the new parts will be further legalized. 1180 unsigned FloorSize = PowerOf2Floor(DstSize); 1181 return std::make_pair(0, LLT::scalar(FloorSize)); 1182 } 1183 1184 if (DstSize > 32 && (DstSize % 32 != 0)) { 1185 // FIXME: Need a way to specify non-extload of larger size if 1186 // suitably aligned. 1187 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1188 } 1189 1190 unsigned MaxSize = maxSizeForAddrSpace(ST, 1191 PtrTy.getAddressSpace(), 1192 Op == G_LOAD); 1193 if (MemSize > MaxSize) 1194 return std::make_pair(0, LLT::scalar(MaxSize)); 1195 1196 unsigned Align = Query.MMODescrs[0].AlignInBits; 1197 return std::make_pair(0, LLT::scalar(Align)); 1198 }) 1199 .fewerElementsIf( 1200 [=](const LegalityQuery &Query) -> bool { 1201 return Query.Types[0].isVector() && 1202 needToSplitMemOp(Query, Op == G_LOAD); 1203 }, 1204 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1205 const LLT DstTy = Query.Types[0]; 1206 const LLT PtrTy = Query.Types[1]; 1207 1208 LLT EltTy = DstTy.getElementType(); 1209 unsigned MaxSize = maxSizeForAddrSpace(ST, 1210 PtrTy.getAddressSpace(), 1211 Op == G_LOAD); 1212 1213 // FIXME: Handle widened to power of 2 results better. This ends 1214 // up scalarizing. 1215 // FIXME: 3 element stores scalarized on SI 1216 1217 // Split if it's too large for the address space. 1218 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1219 if (MemSize > MaxSize) { 1220 unsigned NumElts = DstTy.getNumElements(); 1221 unsigned EltSize = EltTy.getSizeInBits(); 1222 1223 if (MaxSize % EltSize == 0) { 1224 return std::make_pair( 1225 0, LLT::scalarOrVector( 1226 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1227 } 1228 1229 unsigned NumPieces = MemSize / MaxSize; 1230 1231 // FIXME: Refine when odd breakdowns handled 1232 // The scalars will need to be re-legalized. 1233 if (NumPieces == 1 || NumPieces >= NumElts || 1234 NumElts % NumPieces != 0) 1235 return std::make_pair(0, EltTy); 1236 1237 return std::make_pair( 1238 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1239 } 1240 1241 // FIXME: We could probably handle weird extending loads better. 1242 if (DstTy.getSizeInBits() > MemSize) 1243 return std::make_pair(0, EltTy); 1244 1245 unsigned EltSize = EltTy.getSizeInBits(); 1246 unsigned DstSize = DstTy.getSizeInBits(); 1247 if (!isPowerOf2_32(DstSize)) { 1248 // We're probably decomposing an odd sized store. Try to split 1249 // to the widest type. TODO: Account for alignment. As-is it 1250 // should be OK, since the new parts will be further legalized. 
1251 unsigned FloorSize = PowerOf2Floor(DstSize); 1252 return std::make_pair( 1253 0, LLT::scalarOrVector( 1254 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1255 } 1256 1257 // Need to split because of alignment. 1258 unsigned Align = Query.MMODescrs[0].AlignInBits; 1259 if (EltSize > Align && 1260 (EltSize / Align < DstTy.getNumElements())) { 1261 return std::make_pair( 1262 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1263 } 1264 1265 // May need relegalization for the scalars. 1266 return std::make_pair(0, EltTy); 1267 }) 1268 .minScalar(0, S32) 1269 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1270 .widenScalarToNextPow2(0) 1271 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1272 .lower(); 1273 } 1274 1275 // FIXME: Unaligned accesses not lowered. 1276 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1277 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1278 {S32, GlobalPtr, S16, 2 * 8}, 1279 {S32, LocalPtr, S8, 8}, 1280 {S32, LocalPtr, S16, 16}, 1281 {S32, PrivatePtr, S8, 8}, 1282 {S32, PrivatePtr, S16, 16}, 1283 {S32, ConstantPtr, S8, 8}, 1284 {S32, ConstantPtr, S16, 2 * 8}}) 1285 .legalIf( 1286 [=](const LegalityQuery &Query) -> bool { 1287 return isLoadStoreLegal(ST, Query); 1288 }); 1289 1290 if (ST.hasFlatAddressSpace()) { 1291 ExtLoads.legalForTypesWithMemDesc( 1292 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1293 } 1294 1295 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1296 // 64-bits. 1297 // 1298 // TODO: Should generalize bitcast action into coerce, which will also cover 1299 // inserting addrspacecasts. 1300 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1301 1302 ExtLoads.clampScalar(0, S32, S32) 1303 .widenScalarToNextPow2(0) 1304 .lower(); 1305 1306 auto &Atomics = getActionDefinitionsBuilder( 1307 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1308 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1309 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1310 G_ATOMICRMW_UMIN}) 1311 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1312 {S64, GlobalPtr}, {S64, LocalPtr}, 1313 {S32, RegionPtr}, {S64, RegionPtr}}); 1314 if (ST.hasFlatAddressSpace()) { 1315 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1316 } 1317 1318 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1319 if (ST.hasLDSFPAtomics()) { 1320 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1321 if (ST.hasGFX90AInsts()) 1322 Atomic.legalFor({{S64, LocalPtr}}); 1323 } 1324 if (ST.hasAtomicFaddInsts()) 1325 Atomic.legalFor({{S32, GlobalPtr}}); 1326 1327 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1328 // demarshalling 1329 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1330 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1331 {S32, FlatPtr}, {S64, FlatPtr}}) 1332 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1333 {S32, RegionPtr}, {S64, RegionPtr}}); 1334 // TODO: Pointer types, any 32-bit or 64-bit vector 1335 1336 // Condition should be s32 for scalar, s1 for vector. 
1337 getActionDefinitionsBuilder(G_SELECT) 1338 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1339 LocalPtr, FlatPtr, PrivatePtr, 1340 LLT::fixed_vector(2, LocalPtr), 1341 LLT::fixed_vector(2, PrivatePtr)}, 1342 {S1, S32}) 1343 .clampScalar(0, S16, S64) 1344 .scalarize(1) 1345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1346 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1347 .clampMaxNumElements(0, S32, 2) 1348 .clampMaxNumElements(0, LocalPtr, 2) 1349 .clampMaxNumElements(0, PrivatePtr, 2) 1350 .scalarize(0) 1351 .widenScalarToNextPow2(0) 1352 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1353 1354 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1355 // be more flexible with the shift amount type. 1356 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1357 .legalFor({{S32, S32}, {S64, S32}}); 1358 if (ST.has16BitInsts()) { 1359 if (ST.hasVOP3PInsts()) { 1360 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1361 .clampMaxNumElements(0, S16, 2); 1362 } else 1363 Shifts.legalFor({{S16, S16}}); 1364 1365 // TODO: Support 16-bit shift amounts for all types 1366 Shifts.widenScalarIf( 1367 [=](const LegalityQuery &Query) { 1368 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1369 // 32-bit amount. 1370 const LLT ValTy = Query.Types[0]; 1371 const LLT AmountTy = Query.Types[1]; 1372 return ValTy.getSizeInBits() <= 16 && 1373 AmountTy.getSizeInBits() < 16; 1374 }, changeTo(1, S16)); 1375 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1376 Shifts.clampScalar(1, S32, S32); 1377 Shifts.clampScalar(0, S16, S64); 1378 Shifts.widenScalarToNextPow2(0, 16); 1379 1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1381 .minScalar(0, S16) 1382 .scalarize(0) 1383 .lower(); 1384 } else { 1385 // Make sure we legalize the shift amount type first, as the general 1386 // expansion for the shifted type will produce much worse code if it hasn't 1387 // been truncated already. 1388 Shifts.clampScalar(1, S32, S32); 1389 Shifts.clampScalar(0, S32, S64); 1390 Shifts.widenScalarToNextPow2(0, 32); 1391 1392 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1393 .minScalar(0, S32) 1394 .scalarize(0) 1395 .lower(); 1396 } 1397 Shifts.scalarize(0); 1398 1399 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1400 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1401 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1402 unsigned IdxTypeIdx = 2; 1403 1404 getActionDefinitionsBuilder(Op) 1405 .customIf([=](const LegalityQuery &Query) { 1406 const LLT EltTy = Query.Types[EltTypeIdx]; 1407 const LLT VecTy = Query.Types[VecTypeIdx]; 1408 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1409 const unsigned EltSize = EltTy.getSizeInBits(); 1410 return (EltSize == 32 || EltSize == 64) && 1411 VecTy.getSizeInBits() % 32 == 0 && 1412 VecTy.getSizeInBits() <= MaxRegisterSize && 1413 IdxTy.getSizeInBits() == 32; 1414 }) 1415 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1416 bitcastToVectorElement32(VecTypeIdx)) 1417 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1418 .bitcastIf( 1419 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1420 [=](const LegalityQuery &Query) { 1421 // For > 64-bit element types, try to turn this into a 64-bit 1422 // element vector since we may be able to do better indexing 1423 // if this is scalar. If not, fall back to 32. 
1424 const LLT EltTy = Query.Types[EltTypeIdx]; 1425 const LLT VecTy = Query.Types[VecTypeIdx]; 1426 const unsigned DstEltSize = EltTy.getSizeInBits(); 1427 const unsigned VecSize = VecTy.getSizeInBits(); 1428 1429 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1430 return std::make_pair( 1431 VecTypeIdx, 1432 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1433 }) 1434 .clampScalar(EltTypeIdx, S32, S64) 1435 .clampScalar(VecTypeIdx, S32, S64) 1436 .clampScalar(IdxTypeIdx, S32, S32) 1437 .clampMaxNumElements(VecTypeIdx, S32, 32) 1438 // TODO: Clamp elements for 64-bit vectors? 1439 // It should only be necessary with variable indexes. 1440 // As a last resort, lower to the stack 1441 .lower(); 1442 } 1443 1444 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1445 .unsupportedIf([=](const LegalityQuery &Query) { 1446 const LLT &EltTy = Query.Types[1].getElementType(); 1447 return Query.Types[0] != EltTy; 1448 }); 1449 1450 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1451 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1452 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1453 1454 // FIXME: Doesn't handle extract of illegal sizes. 1455 getActionDefinitionsBuilder(Op) 1456 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1457 // FIXME: Multiples of 16 should not be legal. 1458 .legalIf([=](const LegalityQuery &Query) { 1459 const LLT BigTy = Query.Types[BigTyIdx]; 1460 const LLT LitTy = Query.Types[LitTyIdx]; 1461 return (BigTy.getSizeInBits() % 32 == 0) && 1462 (LitTy.getSizeInBits() % 16 == 0); 1463 }) 1464 .widenScalarIf( 1465 [=](const LegalityQuery &Query) { 1466 const LLT BigTy = Query.Types[BigTyIdx]; 1467 return (BigTy.getScalarSizeInBits() < 16); 1468 }, 1469 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1470 .widenScalarIf( 1471 [=](const LegalityQuery &Query) { 1472 const LLT LitTy = Query.Types[LitTyIdx]; 1473 return (LitTy.getScalarSizeInBits() < 16); 1474 }, 1475 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1476 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1477 .widenScalarToNextPow2(BigTyIdx, 32); 1478 1479 } 1480 1481 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1482 .legalForCartesianProduct(AllS32Vectors, {S32}) 1483 .legalForCartesianProduct(AllS64Vectors, {S64}) 1484 .clampNumElements(0, V16S32, V32S32) 1485 .clampNumElements(0, V2S64, V16S64) 1486 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1487 1488 if (ST.hasScalarPackInsts()) { 1489 BuildVector 1490 // FIXME: Should probably widen s1 vectors straight to s32 1491 .minScalarOrElt(0, S16) 1492 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1493 .minScalar(1, S32); 1494 1495 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1496 .legalFor({V2S16, S32}) 1497 .lower(); 1498 BuildVector.minScalarOrElt(0, S32); 1499 } else { 1500 BuildVector.customFor({V2S16, S16}); 1501 BuildVector.minScalarOrElt(0, S32); 1502 1503 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1504 .customFor({V2S16, S32}) 1505 .lower(); 1506 } 1507 1508 BuildVector.legalIf(isRegisterType(0)); 1509 1510 // FIXME: Clamp maximum size 1511 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1512 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1513 .clampMaxNumElements(0, S32, 32) 1514 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1515 .clampMaxNumElements(0, S16, 64); 1516 1517 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1518 // pre-legalize. 
1519 if (ST.hasVOP3PInsts()) { 1520 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1521 .customFor({V2S16, V2S16}) 1522 .lower(); 1523 } else 1524 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1525 1526 // Merge/Unmerge 1527 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1528 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1529 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1530 1531 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1532 const LLT Ty = Query.Types[TypeIdx]; 1533 if (Ty.isVector()) { 1534 const LLT &EltTy = Ty.getElementType(); 1535 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1536 return true; 1537 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1538 return true; 1539 } 1540 return false; 1541 }; 1542 1543 auto &Builder = getActionDefinitionsBuilder(Op) 1544 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1545 .lowerFor({{S16, V2S16}}) 1546 .lowerIf([=](const LegalityQuery &Query) { 1547 const LLT BigTy = Query.Types[BigTyIdx]; 1548 return BigTy.getSizeInBits() == 32; 1549 }) 1550 // Try to widen to s16 first for small types. 1551 // TODO: Only do this on targets with legal s16 shifts 1552 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1554 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1555 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1556 elementTypeIs(1, S16)), 1557 changeTo(1, V2S16)) 1558 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1559 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1560 // valid. 1561 .clampScalar(LitTyIdx, S32, S512) 1562 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1563 // Break up vectors with weird elements into scalars 1564 .fewerElementsIf( 1565 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1566 scalarize(0)) 1567 .fewerElementsIf( 1568 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1569 scalarize(1)) 1570 .clampScalar(BigTyIdx, S32, MaxScalar); 1571 1572 if (Op == G_MERGE_VALUES) { 1573 Builder.widenScalarIf( 1574 // TODO: Use 16-bit shifts if legal for 8-bit values? 1575 [=](const LegalityQuery &Query) { 1576 const LLT Ty = Query.Types[LitTyIdx]; 1577 return Ty.getSizeInBits() < 32; 1578 }, 1579 changeTo(LitTyIdx, S32)); 1580 } 1581 1582 Builder.widenScalarIf( 1583 [=](const LegalityQuery &Query) { 1584 const LLT Ty = Query.Types[BigTyIdx]; 1585 return !isPowerOf2_32(Ty.getSizeInBits()) && 1586 Ty.getSizeInBits() % 16 != 0; 1587 }, 1588 [=](const LegalityQuery &Query) { 1589 // Pick the next power of 2, or a multiple of 64 over 128. 1590 // Whichever is smaller. 1591 const LLT &Ty = Query.Types[BigTyIdx]; 1592 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1593 if (NewSizeInBits >= 256) { 1594 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1595 if (RoundedTo < NewSizeInBits) 1596 NewSizeInBits = RoundedTo; 1597 } 1598 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1599 }) 1600 // Any vectors left are the wrong size. Scalarize them. 1601 .scalarize(0) 1602 .scalarize(1); 1603 } 1604 1605 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1606 // RegBankSelect. 
1607 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1608 .legalFor({{S32}, {S64}}); 1609 1610 if (ST.hasVOP3PInsts()) { 1611 SextInReg.lowerFor({{V2S16}}) 1612 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1613 // get more vector shift opportunities, since we'll get those when 1614 // expanded. 1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1616 } else if (ST.has16BitInsts()) { 1617 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1618 } else { 1619 // Prefer to promote to s32 before lowering if we don't have 16-bit 1620 // shifts. This avoid a lot of intermediate truncate and extend operations. 1621 SextInReg.lowerFor({{S32}, {S64}}); 1622 } 1623 1624 SextInReg 1625 .scalarize(0) 1626 .clampScalar(0, S32, S64) 1627 .lower(); 1628 1629 // TODO: Only Try to form v2s16 with legal packed instructions. 1630 getActionDefinitionsBuilder(G_FSHR) 1631 .legalFor({{S32, S32}}) 1632 .lowerFor({{V2S16, V2S16}}) 1633 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1634 .scalarize(0) 1635 .lower(); 1636 1637 if (ST.hasVOP3PInsts()) { 1638 getActionDefinitionsBuilder(G_FSHL) 1639 .lowerFor({{V2S16, V2S16}}) 1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1641 .scalarize(0) 1642 .lower(); 1643 } else { 1644 getActionDefinitionsBuilder(G_FSHL) 1645 .scalarize(0) 1646 .lower(); 1647 } 1648 1649 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1650 .legalFor({S64}); 1651 1652 getActionDefinitionsBuilder(G_FENCE) 1653 .alwaysLegal(); 1654 1655 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1656 .scalarize(0) 1657 .minScalar(0, S32) 1658 .lower(); 1659 1660 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1661 .legalFor({{S32, S32}, {S64, S32}}) 1662 .clampScalar(1, S32, S32) 1663 .clampScalar(0, S32, S64) 1664 .widenScalarToNextPow2(0) 1665 .scalarize(0); 1666 1667 getActionDefinitionsBuilder({ 1668 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1669 G_FCOPYSIGN, 1670 1671 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1672 G_ATOMICRMW_NAND, 1673 G_ATOMICRMW_FSUB, 1674 G_READ_REGISTER, 1675 G_WRITE_REGISTER, 1676 1677 G_SADDO, G_SSUBO, 1678 1679 // TODO: Implement 1680 G_FMINIMUM, G_FMAXIMUM}).lower(); 1681 1682 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1683 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1684 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1685 .unsupported(); 1686 1687 getLegacyLegalizerInfo().computeTables(); 1688 verify(*ST.getInstrInfo()); 1689 } 1690 1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1692 MachineInstr &MI) const { 1693 MachineIRBuilder &B = Helper.MIRBuilder; 1694 MachineRegisterInfo &MRI = *B.getMRI(); 1695 1696 switch (MI.getOpcode()) { 1697 case TargetOpcode::G_ADDRSPACE_CAST: 1698 return legalizeAddrSpaceCast(MI, MRI, B); 1699 case TargetOpcode::G_FRINT: 1700 return legalizeFrint(MI, MRI, B); 1701 case TargetOpcode::G_FCEIL: 1702 return legalizeFceil(MI, MRI, B); 1703 case TargetOpcode::G_FREM: 1704 return legalizeFrem(MI, MRI, B); 1705 case TargetOpcode::G_INTRINSIC_TRUNC: 1706 return legalizeIntrinsicTrunc(MI, MRI, B); 1707 case TargetOpcode::G_SITOFP: 1708 return legalizeITOFP(MI, MRI, B, true); 1709 case TargetOpcode::G_UITOFP: 1710 return legalizeITOFP(MI, MRI, B, false); 1711 case TargetOpcode::G_FPTOSI: 1712 return legalizeFPTOI(MI, MRI, B, true); 1713 case TargetOpcode::G_FPTOUI: 1714 return legalizeFPTOI(MI, MRI, B, false); 1715 case TargetOpcode::G_FMINNUM: 1716 case TargetOpcode::G_FMAXNUM: 1717 case TargetOpcode::G_FMINNUM_IEEE: 1718 case 
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
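  // For example, a cast from the LOCAL (group) segment reads the 32-bit
  // aperture value at queue_ptr + 0x40 below, while the PRIVATE (scratch)
  // segment uses the field at +0x44.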
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
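  // i.e. the 64-bit flat pointer is assembled as merge_values(SrcAsInt,
  // ApertureReg), with the segment offset in the low half and the aperture in
  // the high half, unless Src was the segment null value, in which case the
  // flat null value is selected instead.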
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed
                     ? B.buildSITOFP(S64, Unmerge.getReg(1))
                     : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto Zero = B.buildConstant(S32, 0);
  auto One = B.buildConstant(S32, 1);
  auto AllOnes = B.buildConstant(S32, -1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyThree = B.buildConstant(S32, 33);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero);
    auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
      .addUse(Unmerge.getReg(1));
    auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes);
    auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt);
    ShAmt = B.buildSub(S32, LS2, One);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto NotAllZeros =
      B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero);
  auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero);
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
    .addUse(FVal.getReg(0))
    .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
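    // As a rough illustration: for a negative input we form the {lo, hi}
    // pair from |trunc(val)| and then restore the sign at the end with
    //   r := ({lo, hi} ^ sign) - sign
    // where sign (computed below) is all ones for negative inputs and zero
    // otherwise.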
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  if (IdxVal < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  if (IdxVal < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal
  //   constant, which is a 64-bit pc-relative offset from the encoding of the
  //   $symbol operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(), DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions
      // are not eliminated, we don't want a compile time error. Just emit a
      // warning and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
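    // In the supported cases below, an LDS global without a meaningful
    // initializer is lowered to a plain 32-bit constant: the offset assigned
    // to it within the group segment. The externally visible zero-sized case
    // (dynamic shared memory) instead resolves to the total static LDS size.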
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. It will be
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value,
        // unmerge from a widened register (e.g. <3 x s16> -> <4 x s16>).
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul =
        B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
          .addUse(Log.getReg(0))
          .addUse(Src1)
          .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
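  // 0x3fefffffffffffff is the largest double strictly less than 1.0, so the
  // min below clamps the buggy V_FRACT result back into [0.0, 1.0).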
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget,
                  bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

bool AMDGPULegalizerInfo::loadInputValue(
  Register DstReg, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
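
  // The adds below fold in one more refinement step of the reciprocal
  // estimate, roughly rcp' = rcp + umulh(rcp, -denom * rcp), carried out on
  // split 32-bit halves with explicit carries.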
  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
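  // (C3 and C6 below are sign-extended compare results: all ones when a
  // correction of the quotient/remainder estimate is needed, zero otherwise.
  // They feed the final selects in place of real control flow.)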
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}

bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(RHS)
    .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
false) 3309 .addUse(Y) 3310 .setMIFlags(Flags); 3311 3312 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3313 R = B.buildFMA(ResTy, Tmp0, R, R); 3314 3315 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3316 R = B.buildFMA(ResTy, Tmp1, R, R); 3317 3318 auto Ret = B.buildFMul(ResTy, X, R); 3319 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3320 3321 B.buildFMA(Res, Tmp2, R, Ret); 3322 MI.eraseFromParent(); 3323 return true; 3324 } 3325 3326 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3327 MachineRegisterInfo &MRI, 3328 MachineIRBuilder &B) const { 3329 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3330 return true; 3331 3332 Register Res = MI.getOperand(0).getReg(); 3333 Register LHS = MI.getOperand(1).getReg(); 3334 Register RHS = MI.getOperand(2).getReg(); 3335 3336 uint16_t Flags = MI.getFlags(); 3337 3338 LLT S16 = LLT::scalar(16); 3339 LLT S32 = LLT::scalar(32); 3340 3341 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3342 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3343 3344 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3345 .addUse(RHSExt.getReg(0)) 3346 .setMIFlags(Flags); 3347 3348 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3349 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3350 3351 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3352 .addUse(RDst.getReg(0)) 3353 .addUse(RHS) 3354 .addUse(LHS) 3355 .setMIFlags(Flags); 3356 3357 MI.eraseFromParent(); 3358 return true; 3359 } 3360 3361 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3362 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3363 static void toggleSPDenormMode(bool Enable, 3364 MachineIRBuilder &B, 3365 const GCNSubtarget &ST, 3366 AMDGPU::SIModeRegisterDefaults Mode) { 3367 // Set SP denorm mode to this value. 3368 unsigned SPDenormMode = 3369 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3370 3371 if (ST.hasDenormModeInst()) { 3372 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3373 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3374 3375 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3376 B.buildInstr(AMDGPU::S_DENORM_MODE) 3377 .addImm(NewDenormModeValue); 3378 3379 } else { 3380 // Select FP32 bit field in mode register. 
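// The FP32 denorm controls live in bits [5:4] of the MODE register, so the
// hwreg descriptor below encodes ID_MODE with offset 4 and width 2
// (WIDTH_M1 = 1).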
3381 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3382 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3383 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3384 3385 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3386 .addImm(SPDenormMode) 3387 .addImm(SPDenormModeBitField); 3388 } 3389 } 3390 3391 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3392 MachineRegisterInfo &MRI, 3393 MachineIRBuilder &B) const { 3394 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3395 return true; 3396 3397 Register Res = MI.getOperand(0).getReg(); 3398 Register LHS = MI.getOperand(1).getReg(); 3399 Register RHS = MI.getOperand(2).getReg(); 3400 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3401 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3402 3403 uint16_t Flags = MI.getFlags(); 3404 3405 LLT S32 = LLT::scalar(32); 3406 LLT S1 = LLT::scalar(1); 3407 3408 auto One = B.buildFConstant(S32, 1.0f); 3409 3410 auto DenominatorScaled = 3411 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3412 .addUse(LHS) 3413 .addUse(RHS) 3414 .addImm(0) 3415 .setMIFlags(Flags); 3416 auto NumeratorScaled = 3417 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3418 .addUse(LHS) 3419 .addUse(RHS) 3420 .addImm(1) 3421 .setMIFlags(Flags); 3422 3423 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3424 .addUse(DenominatorScaled.getReg(0)) 3425 .setMIFlags(Flags); 3426 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3427 3428 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3429 // aren't modeled as reading it. 3430 if (!Mode.allFP32Denormals()) 3431 toggleSPDenormMode(true, B, ST, Mode); 3432 3433 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3434 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3435 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3436 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3437 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3438 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3439 3440 if (!Mode.allFP32Denormals()) 3441 toggleSPDenormMode(false, B, ST, Mode); 3442 3443 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3444 .addUse(Fma4.getReg(0)) 3445 .addUse(Fma1.getReg(0)) 3446 .addUse(Fma3.getReg(0)) 3447 .addUse(NumeratorScaled.getReg(1)) 3448 .setMIFlags(Flags); 3449 3450 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3451 .addUse(Fmas.getReg(0)) 3452 .addUse(RHS) 3453 .addUse(LHS) 3454 .setMIFlags(Flags); 3455 3456 MI.eraseFromParent(); 3457 return true; 3458 } 3459 3460 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3461 MachineRegisterInfo &MRI, 3462 MachineIRBuilder &B) const { 3463 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3464 return true; 3465 3466 Register Res = MI.getOperand(0).getReg(); 3467 Register LHS = MI.getOperand(1).getReg(); 3468 Register RHS = MI.getOperand(2).getReg(); 3469 3470 uint16_t Flags = MI.getFlags(); 3471 3472 LLT S64 = LLT::scalar(64); 3473 LLT S1 = LLT::scalar(1); 3474 3475 auto One = B.buildFConstant(S64, 1.0); 3476 3477 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3478 .addUse(LHS) 3479 .addUse(RHS) 3480 .addImm(0) 3481 .setMIFlags(Flags); 3482 3483 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3484 3485 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3486 .addUse(DivScale0.getReg(0)) 3487 .setMIFlags(Flags); 3488 3489 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3490 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3491 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3492 3493 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3494 .addUse(LHS) 3495 .addUse(RHS) 3496 .addImm(1) 3497 .setMIFlags(Flags); 3498 3499 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3500 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3501 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3502 3503 Register Scale; 3504 if (!ST.hasUsableDivScaleConditionOutput()) { 3505 // Workaround a hardware bug on SI where the condition output from div_scale 3506 // is not usable. 3507 3508 LLT S32 = LLT::scalar(32); 3509 3510 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3511 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3512 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3513 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3514 3515 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3516 Scale1Unmerge.getReg(1)); 3517 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3518 Scale0Unmerge.getReg(1)); 3519 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3520 } else { 3521 Scale = DivScale1.getReg(1); 3522 } 3523 3524 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3525 .addUse(Fma4.getReg(0)) 3526 .addUse(Fma3.getReg(0)) 3527 .addUse(Mul.getReg(0)) 3528 .addUse(Scale) 3529 .setMIFlags(Flags); 3530 3531 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3532 .addUse(Fmas.getReg(0)) 3533 .addUse(RHS) 3534 .addUse(LHS) 3535 .setMIFlags(Flags); 3536 3537 MI.eraseFromParent(); 3538 return true; 3539 } 3540 3541 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3542 MachineRegisterInfo &MRI, 3543 MachineIRBuilder &B) const { 3544 Register Res = MI.getOperand(0).getReg(); 3545 Register LHS = MI.getOperand(2).getReg(); 3546 Register RHS = MI.getOperand(3).getReg(); 3547 uint16_t Flags = MI.getFlags(); 3548 3549 LLT S32 = LLT::scalar(32); 3550 LLT S1 = LLT::scalar(1); 3551 3552 auto Abs = B.buildFAbs(S32, RHS, Flags); 3553 const APFloat C0Val(1.0f); 3554 3555 auto C0 = B.buildConstant(S32, 0x6f800000); 3556 auto C1 = B.buildConstant(S32, 0x2f800000); 3557 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3558 3559 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3560 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3561 3562 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3563 3564 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3565 .addUse(Mul0.getReg(0)) 3566 .setMIFlags(Flags); 3567 3568 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3569 3570 B.buildFMul(Res, Sel, Mul1, Flags); 3571 3572 MI.eraseFromParent(); 3573 return true; 3574 } 3575 3576 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3577 // FIXME: Why do we handle this one but not other removed instructions? 3578 // 3579 // Reciprocal square root. The clamp prevents infinite results, clamping 3580 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3581 // +-max_float. 
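// The expansion emits a plain amdgcn.rsq and then clamps the result against
// +/-max_float with fminnum/fmaxnum (or their IEEE variants, depending on the
// function's FP mode).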
3582 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3583 MachineRegisterInfo &MRI, 3584 MachineIRBuilder &B) const { 3585 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3586 return true; 3587 3588 Register Dst = MI.getOperand(0).getReg(); 3589 Register Src = MI.getOperand(2).getReg(); 3590 auto Flags = MI.getFlags(); 3591 3592 LLT Ty = MRI.getType(Dst); 3593 3594 const fltSemantics *FltSemantics; 3595 if (Ty == LLT::scalar(32)) 3596 FltSemantics = &APFloat::IEEEsingle(); 3597 else if (Ty == LLT::scalar(64)) 3598 FltSemantics = &APFloat::IEEEdouble(); 3599 else 3600 return false; 3601 3602 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3603 .addUse(Src) 3604 .setMIFlags(Flags); 3605 3606 // We don't need to concern ourselves with the snan handling difference, since 3607 // the rsq result is already quieted (or not); use the variant that will select directly. 3608 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3609 const bool UseIEEE = MFI->getMode().IEEE; 3610 3611 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3612 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3613 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3614 3615 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3616 3617 if (UseIEEE) 3618 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3619 else 3620 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3621 MI.eraseFromParent(); 3622 return true; 3623 } 3624 3625 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3626 switch (IID) { 3627 case Intrinsic::amdgcn_ds_fadd: 3628 return AMDGPU::G_ATOMICRMW_FADD; 3629 case Intrinsic::amdgcn_ds_fmin: 3630 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3631 case Intrinsic::amdgcn_ds_fmax: 3632 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3633 default: 3634 llvm_unreachable("not a DS FP intrinsic"); 3635 } 3636 } 3637 3638 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3639 MachineInstr &MI, 3640 Intrinsic::ID IID) const { 3641 GISelChangeObserver &Observer = Helper.Observer; 3642 Observer.changingInstr(MI); 3643 3644 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3645 3646 // The remaining operands were used to set fields in the MemOperand on 3647 // construction. 3648 for (int I = 6; I > 3; --I) 3649 MI.RemoveOperand(I); 3650 3651 MI.RemoveOperand(1); // Remove the intrinsic ID.
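// The operands that remain (result, pointer, value) match what the
// G_ATOMICRMW-style pseudo expects.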
3652 Observer.changedInstr(MI); 3653 return true; 3654 } 3655 3656 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3657 MachineRegisterInfo &MRI, 3658 MachineIRBuilder &B) const { 3659 uint64_t Offset = 3660 ST.getTargetLowering()->getImplicitParameterOffset( 3661 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3662 LLT DstTy = MRI.getType(DstReg); 3663 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3664 3665 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3666 if (!loadInputValue(KernargPtrReg, B, 3667 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3668 return false; 3669 3670 // FIXME: This should be nuw 3671 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3672 return true; 3673 } 3674 3675 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3676 MachineRegisterInfo &MRI, 3677 MachineIRBuilder &B) const { 3678 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3679 if (!MFI->isEntryFunction()) { 3680 return legalizePreloadedArgIntrin(MI, MRI, B, 3681 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3682 } 3683 3684 Register DstReg = MI.getOperand(0).getReg(); 3685 if (!getImplicitArgPtr(DstReg, MRI, B)) 3686 return false; 3687 3688 MI.eraseFromParent(); 3689 return true; 3690 } 3691 3692 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3693 MachineRegisterInfo &MRI, 3694 MachineIRBuilder &B, 3695 unsigned AddrSpace) const { 3696 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3697 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3698 Register Hi32 = Unmerge.getReg(1); 3699 3700 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3701 MI.eraseFromParent(); 3702 return true; 3703 } 3704 3705 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3706 // offset (the offset that is included in bounds checking and swizzling, to be 3707 // split between the instruction's voffset and immoffset fields) and soffset 3708 // (the offset that is excluded from bounds checking and swizzling, to go in 3709 // the instruction's soffset field). This function takes the first kind of 3710 // offset and figures out how to split it between voffset and immoffset. 3711 std::pair<Register, unsigned> 3712 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3713 Register OrigOffset) const { 3714 const unsigned MaxImm = 4095; 3715 Register BaseReg; 3716 unsigned ImmOffset; 3717 const LLT S32 = LLT::scalar(32); 3718 MachineRegisterInfo &MRI = *B.getMRI(); 3719 3720 std::tie(BaseReg, ImmOffset) = 3721 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3722 3723 // If BaseReg is a pointer, convert it to int. 3724 if (MRI.getType(BaseReg).isPointer()) 3725 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3726 3727 // If the immediate value is too big for the immoffset field, put the value 3728 // and -4096 into the immoffset field so that the value that is copied/added 3729 // for the voffset field is a multiple of 4096, and it stands more chance 3730 // of being CSEd with the copy/add for another similar load/store. 3731 // However, do not do that rounding down to a multiple of 4096 if that is a 3732 // negative number, as it appears to be illegal to have a negative offset 3733 // in the vgpr, even if adding the immediate offset makes it positive. 
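// For example, a constant offset of 8195 is split into Overflow = 8192 (added
// to the voffset) and ImmOffset = 3, keeping the voffset addend a multiple of
// 4096 as described above.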
3734 unsigned Overflow = ImmOffset & ~MaxImm; 3735 ImmOffset -= Overflow; 3736 if ((int32_t)Overflow < 0) { 3737 Overflow += ImmOffset; 3738 ImmOffset = 0; 3739 } 3740 3741 if (Overflow != 0) { 3742 if (!BaseReg) { 3743 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3744 } else { 3745 auto OverflowVal = B.buildConstant(S32, Overflow); 3746 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3747 } 3748 } 3749 3750 if (!BaseReg) 3751 BaseReg = B.buildConstant(S32, 0).getReg(0); 3752 3753 return std::make_pair(BaseReg, ImmOffset); 3754 } 3755 3756 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3757 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3758 Register VOffset, Register SOffset, 3759 unsigned ImmOffset, Register VIndex, 3760 MachineRegisterInfo &MRI) const { 3761 Optional<ValueAndVReg> MaybeVOffsetVal = 3762 getConstantVRegValWithLookThrough(VOffset, MRI); 3763 Optional<ValueAndVReg> MaybeSOffsetVal = 3764 getConstantVRegValWithLookThrough(SOffset, MRI); 3765 Optional<ValueAndVReg> MaybeVIndexVal = 3766 getConstantVRegValWithLookThrough(VIndex, MRI); 3767 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3768 // update the MMO with that offset. The stride is unknown so we can only do 3769 // this if VIndex is constant 0. 3770 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3771 MaybeVIndexVal->Value == 0) { 3772 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3773 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3774 MMO->setOffset(TotalOffset); 3775 } else { 3776 // We don't have a constant combined offset to use in the MMO. Give up. 3777 MMO->setValue((Value *)nullptr); 3778 } 3779 } 3780 3781 /// Handle register layout difference for f16 images for some subtargets. 
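/// With unpacked D16 memory instructions each 16-bit element occupies the low
/// half of a 32-bit register, so the value is unmerged and any-extended to a
/// vector of s32. Packed subtargets can use the value unchanged, except for
/// the image-store D16 bug workaround handled below.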
3782 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3783 MachineRegisterInfo &MRI, 3784 Register Reg, 3785 bool ImageStore) const { 3786 const LLT S16 = LLT::scalar(16); 3787 const LLT S32 = LLT::scalar(32); 3788 LLT StoreVT = MRI.getType(Reg); 3789 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3790 3791 if (ST.hasUnpackedD16VMem()) { 3792 auto Unmerge = B.buildUnmerge(S16, Reg); 3793 3794 SmallVector<Register, 4> WideRegs; 3795 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3796 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3797 3798 int NumElts = StoreVT.getNumElements(); 3799 3800 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3801 .getReg(0); 3802 } 3803 3804 if (ImageStore && ST.hasImageStoreD16Bug()) { 3805 if (StoreVT.getNumElements() == 2) { 3806 SmallVector<Register, 4> PackedRegs; 3807 Reg = B.buildBitcast(S32, Reg).getReg(0); 3808 PackedRegs.push_back(Reg); 3809 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3810 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3811 .getReg(0); 3812 } 3813 3814 if (StoreVT.getNumElements() == 3) { 3815 SmallVector<Register, 4> PackedRegs; 3816 auto Unmerge = B.buildUnmerge(S16, Reg); 3817 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3818 PackedRegs.push_back(Unmerge.getReg(I)); 3819 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3820 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3821 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3822 } 3823 3824 if (StoreVT.getNumElements() == 4) { 3825 SmallVector<Register, 4> PackedRegs; 3826 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3827 auto Unmerge = B.buildUnmerge(S32, Reg); 3828 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3829 PackedRegs.push_back(Unmerge.getReg(I)); 3830 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3831 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3832 .getReg(0); 3833 } 3834 3835 llvm_unreachable("invalid data type"); 3836 } 3837 3838 return Reg; 3839 } 3840 3841 Register AMDGPULegalizerInfo::fixStoreSourceType( 3842 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3843 MachineRegisterInfo *MRI = B.getMRI(); 3844 LLT Ty = MRI->getType(VData); 3845 3846 const LLT S16 = LLT::scalar(16); 3847 3848 // Fixup illegal register types for i8 stores. 3849 if (Ty == LLT::scalar(8) || Ty == S16) { 3850 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3851 return AnyExt; 3852 } 3853 3854 if (Ty.isVector()) { 3855 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3856 if (IsFormat) 3857 return handleD16VData(B, *MRI, VData); 3858 } 3859 } 3860 3861 return VData; 3862 } 3863 3864 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3865 MachineRegisterInfo &MRI, 3866 MachineIRBuilder &B, 3867 bool IsTyped, 3868 bool IsFormat) const { 3869 Register VData = MI.getOperand(1).getReg(); 3870 LLT Ty = MRI.getType(VData); 3871 LLT EltTy = Ty.getScalarType(); 3872 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3873 const LLT S32 = LLT::scalar(32); 3874 3875 VData = fixStoreSourceType(B, VData, IsFormat); 3876 Register RSrc = MI.getOperand(2).getReg(); 3877 3878 MachineMemOperand *MMO = *MI.memoperands_begin(); 3879 const int MemSize = MMO->getSize(); 3880 3881 unsigned ImmOffset; 3882 3883 // The typed intrinsics add an immediate after the registers. 
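// With the vindex operand present, a struct.buffer store therefore has 7
// operands in total (8 for the typed/tbuffer form); the raw variants have one
// fewer, which is how HasVIndex is derived below.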
3884 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3885 3886 // The struct intrinsic variants add one additional operand over raw. 3887 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3888 Register VIndex; 3889 int OpOffset = 0; 3890 if (HasVIndex) { 3891 VIndex = MI.getOperand(3).getReg(); 3892 OpOffset = 1; 3893 } else { 3894 VIndex = B.buildConstant(S32, 0).getReg(0); 3895 } 3896 3897 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3898 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3899 3900 unsigned Format = 0; 3901 if (IsTyped) { 3902 Format = MI.getOperand(5 + OpOffset).getImm(); 3903 ++OpOffset; 3904 } 3905 3906 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3907 3908 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3909 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3910 3911 unsigned Opc; 3912 if (IsTyped) { 3913 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3914 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3915 } else if (IsFormat) { 3916 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3917 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3918 } else { 3919 switch (MemSize) { 3920 case 1: 3921 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3922 break; 3923 case 2: 3924 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3925 break; 3926 default: 3927 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3928 break; 3929 } 3930 } 3931 3932 auto MIB = B.buildInstr(Opc) 3933 .addUse(VData) // vdata 3934 .addUse(RSrc) // rsrc 3935 .addUse(VIndex) // vindex 3936 .addUse(VOffset) // voffset 3937 .addUse(SOffset) // soffset 3938 .addImm(ImmOffset); // offset(imm) 3939 3940 if (IsTyped) 3941 MIB.addImm(Format); 3942 3943 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3944 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3945 .addMemOperand(MMO); 3946 3947 MI.eraseFromParent(); 3948 return true; 3949 } 3950 3951 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3952 MachineRegisterInfo &MRI, 3953 MachineIRBuilder &B, 3954 bool IsFormat, 3955 bool IsTyped) const { 3956 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3957 MachineMemOperand *MMO = *MI.memoperands_begin(); 3958 const LLT MemTy = MMO->getMemoryType(); 3959 const LLT S32 = LLT::scalar(32); 3960 3961 Register Dst = MI.getOperand(0).getReg(); 3962 Register RSrc = MI.getOperand(2).getReg(); 3963 3964 // The typed intrinsics add an immediate after the registers. 3965 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3966 3967 // The struct intrinsic variants add one additional operand over raw. 
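// As in the store lowering above, the struct form is detected purely by
// operand count; the extra result def on loads is balanced by the missing
// vdata operand, so the same counts apply.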
3968 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3969 Register VIndex; 3970 int OpOffset = 0; 3971 if (HasVIndex) { 3972 VIndex = MI.getOperand(3).getReg(); 3973 OpOffset = 1; 3974 } else { 3975 VIndex = B.buildConstant(S32, 0).getReg(0); 3976 } 3977 3978 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3979 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3980 3981 unsigned Format = 0; 3982 if (IsTyped) { 3983 Format = MI.getOperand(5 + OpOffset).getImm(); 3984 ++OpOffset; 3985 } 3986 3987 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3988 unsigned ImmOffset; 3989 3990 LLT Ty = MRI.getType(Dst); 3991 LLT EltTy = Ty.getScalarType(); 3992 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3993 const bool Unpacked = ST.hasUnpackedD16VMem(); 3994 3995 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3996 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3997 3998 unsigned Opc; 3999 4000 if (IsTyped) { 4001 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4002 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4003 } else if (IsFormat) { 4004 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4005 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4006 } else { 4007 switch (MemTy.getSizeInBits()) { 4008 case 8: 4009 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4010 break; 4011 case 16: 4012 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4013 break; 4014 default: 4015 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4016 break; 4017 } 4018 } 4019 4020 Register LoadDstReg; 4021 4022 bool IsExtLoad = 4023 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4024 LLT UnpackedTy = Ty.changeElementSize(32); 4025 4026 if (IsExtLoad) 4027 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4028 else if (Unpacked && IsD16 && Ty.isVector()) 4029 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4030 else 4031 LoadDstReg = Dst; 4032 4033 auto MIB = B.buildInstr(Opc) 4034 .addDef(LoadDstReg) // vdata 4035 .addUse(RSrc) // rsrc 4036 .addUse(VIndex) // vindex 4037 .addUse(VOffset) // voffset 4038 .addUse(SOffset) // soffset 4039 .addImm(ImmOffset); // offset(imm) 4040 4041 if (IsTyped) 4042 MIB.addImm(Format); 4043 4044 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4045 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4046 .addMemOperand(MMO); 4047 4048 if (LoadDstReg != Dst) { 4049 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4050 4051 // The result was widened for an extending load; truncate back to the original type. 4052 if (IsExtLoad) 4053 B.buildTrunc(Dst, LoadDstReg); 4054 else { 4055 // Repack to original 16-bit vector result 4056 // FIXME: G_TRUNC should work, but legalization currently fails 4057 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4058 SmallVector<Register, 4> Repack; 4059 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4060 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4061 B.buildMerge(Dst, Repack); 4062 } 4063 } 4064 4065 MI.eraseFromParent(); 4066 return true; 4067 } 4068 4069 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4070 MachineIRBuilder &B, 4071 bool IsInc) const { 4072 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4073 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4074 B.buildInstr(Opc) 4075 .addDef(MI.getOperand(0).getReg()) 4076 .addUse(MI.getOperand(2).getReg()) 4077 .addUse(MI.getOperand(3).getReg()) 4078 .cloneMemRefs(MI); 4079 MI.eraseFromParent(); 4080 return true; 4081 } 4082 4083 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4084 switch (IntrID) { 4085 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4086 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4088 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4089 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4091 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4092 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4094 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4095 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4097 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4098 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4100 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4101 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4102 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4103 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4104 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4105 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4106 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4107 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4108 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4109 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4110 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4111 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4112 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4113 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4115 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4116 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4118 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4119 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4121 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4124 case Intrinsic::amdgcn_buffer_atomic_fadd: 4125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4126 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4127 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4128 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4129 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4130 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4131 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4132 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4133 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4134 default: 4135 llvm_unreachable("unhandled atomic opcode"); 4136 } 4137 } 4138 4139 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4140 MachineIRBuilder &B, 4141 Intrinsic::ID IID) const { 4142 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4143 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4144 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4145 4146 Register Dst; 4147 4148 int OpOffset = 0; 4149 if (HasReturn) { 4150 // A few FP atomics do not support return values. 
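// When a result is present it is always operand 0; otherwise every source
// operand index below is shifted down by one via OpOffset.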
4151 Dst = MI.getOperand(0).getReg(); 4152 } else { 4153 OpOffset = -1; 4154 } 4155 4156 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4157 Register CmpVal; 4158 4159 if (IsCmpSwap) { 4160 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4161 ++OpOffset; 4162 } 4163 4164 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4165 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4166 4167 // The struct intrinsic variants add one additional operand over raw. 4168 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4169 Register VIndex; 4170 if (HasVIndex) { 4171 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4172 ++OpOffset; 4173 } else { 4174 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4175 } 4176 4177 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4178 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4179 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4180 4181 MachineMemOperand *MMO = *MI.memoperands_begin(); 4182 4183 unsigned ImmOffset; 4184 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4185 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4186 4187 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4188 4189 if (HasReturn) 4190 MIB.addDef(Dst); 4191 4192 MIB.addUse(VData); // vdata 4193 4194 if (IsCmpSwap) 4195 MIB.addReg(CmpVal); 4196 4197 MIB.addUse(RSrc) // rsrc 4198 .addUse(VIndex) // vindex 4199 .addUse(VOffset) // voffset 4200 .addUse(SOffset) // soffset 4201 .addImm(ImmOffset) // offset(imm) 4202 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4203 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4204 .addMemOperand(MMO); 4205 4206 MI.eraseFromParent(); 4207 return true; 4208 } 4209 4210 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4211 /// vector with s16 typed elements. 4212 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4213 SmallVectorImpl<Register> &PackedAddrs, 4214 unsigned ArgOffset, 4215 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4216 bool IsA16, bool IsG16) { 4217 const LLT S16 = LLT::scalar(16); 4218 const LLT V2S16 = LLT::fixed_vector(2, 16); 4219 auto EndIdx = Intr->VAddrEnd; 4220 4221 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4222 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4223 if (!SrcOp.isReg()) 4224 continue; // _L to _LZ may have eliminated this. 4225 4226 Register AddrReg = SrcOp.getReg(); 4227 4228 if ((I < Intr->GradientStart) || 4229 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4230 (I >= Intr->CoordStart && !IsA16)) { 4231 // Handle any gradient or coordinate operands that should not be packed 4232 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4233 PackedAddrs.push_back(AddrReg); 4234 } else { 4235 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4236 // derivatives dx/dh and dx/dv are packed with undef. 
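// The undef-padding case below triggers when this is the final vaddr operand,
// when the per-direction gradient count is odd and this is the last dh or dv
// gradient, or when the next operand has already been folded to an immediate
// by the _L to _LZ optimization.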
4237 if (((I + 1) >= EndIdx) || 4238 ((Intr->NumGradients / 2) % 2 == 1 && 4239 (I == static_cast<unsigned>(Intr->GradientStart + 4240 (Intr->NumGradients / 2) - 1) || 4241 I == static_cast<unsigned>(Intr->GradientStart + 4242 Intr->NumGradients - 1))) || 4243 // Check for _L to _LZ optimization 4244 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4245 PackedAddrs.push_back( 4246 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4247 .getReg(0)); 4248 } else { 4249 PackedAddrs.push_back( 4250 B.buildBuildVector( 4251 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4252 .getReg(0)); 4253 ++I; 4254 } 4255 } 4256 } 4257 } 4258 4259 /// Convert from separate vaddr components to a single vector address register, 4260 /// and replace the remaining operands with $noreg. 4261 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4262 int DimIdx, int NumVAddrs) { 4263 const LLT S32 = LLT::scalar(32); 4264 4265 SmallVector<Register, 8> AddrRegs; 4266 for (int I = 0; I != NumVAddrs; ++I) { 4267 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4268 if (SrcOp.isReg()) { 4269 AddrRegs.push_back(SrcOp.getReg()); 4270 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4271 } 4272 } 4273 4274 int NumAddrRegs = AddrRegs.size(); 4275 if (NumAddrRegs != 1) { 4276 // Above 8 elements round up to next power of 2 (i.e. 16). 4277 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4278 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4279 auto Undef = B.buildUndef(S32); 4280 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4281 NumAddrRegs = RoundedNumRegs; 4282 } 4283 4284 auto VAddr = 4285 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4286 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4287 } 4288 4289 for (int I = 1; I != NumVAddrs; ++I) { 4290 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4291 if (SrcOp.isReg()) 4292 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4293 } 4294 } 4295 4296 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4297 /// 4298 /// Depending on the subtarget, loads and stores with 16-bit element data need to be 4299 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4300 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4301 /// registers. 4302 /// 4303 /// We don't want to directly select image instructions just yet, but also want 4304 /// to expose all register repacking to the legalizer/combiners. We also don't 4305 /// want a selected instruction entering RegBankSelect. In order to avoid 4306 /// defining a multitude of intermediate image instructions, directly hack on 4307 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4308 /// now-unnecessary arguments with $noreg. 4309 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4310 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4311 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4312 4313 const unsigned NumDefs = MI.getNumExplicitDefs(); 4314 const unsigned ArgOffset = NumDefs + 1; 4315 bool IsTFE = NumDefs == 2; 4316 // We are only processing the operands of d16 image operations on subtargets 4317 // that use the unpacked register layout, or need to repack the TFE result. 4318 4319 // TODO: Do we need to guard against already legalized intrinsics?
4320 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4321 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4322 4323 MachineRegisterInfo *MRI = B.getMRI(); 4324 const LLT S32 = LLT::scalar(32); 4325 const LLT S16 = LLT::scalar(16); 4326 const LLT V2S16 = LLT::fixed_vector(2, 16); 4327 4328 unsigned DMask = 0; 4329 4330 // Check for 16 bit addresses and pack if true. 4331 LLT GradTy = 4332 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4333 LLT AddrTy = 4334 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4335 const bool IsG16 = GradTy == S16; 4336 const bool IsA16 = AddrTy == S16; 4337 4338 int DMaskLanes = 0; 4339 if (!BaseOpcode->Atomic) { 4340 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4341 if (BaseOpcode->Gather4) { 4342 DMaskLanes = 4; 4343 } else if (DMask != 0) { 4344 DMaskLanes = countPopulation(DMask); 4345 } else if (!IsTFE && !BaseOpcode->Store) { 4346 // If dmask is 0, this is a no-op load. This can be eliminated. 4347 B.buildUndef(MI.getOperand(0)); 4348 MI.eraseFromParent(); 4349 return true; 4350 } 4351 } 4352 4353 Observer.changingInstr(MI); 4354 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4355 4356 unsigned NewOpcode = NumDefs == 0 ? 4357 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4358 4359 // Track that we legalized this 4360 MI.setDesc(B.getTII().get(NewOpcode)); 4361 4362 // Expecting to get an error flag since TFC is on and dmask is 0. Force 4363 // dmask to be at least 1, otherwise the instruction will fail. 4364 if (IsTFE && DMask == 0) { 4365 DMask = 0x1; 4366 DMaskLanes = 1; 4367 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4368 } 4369 4370 if (BaseOpcode->Atomic) { 4371 Register VData0 = MI.getOperand(2).getReg(); 4372 LLT Ty = MRI->getType(VData0); 4373 4374 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4375 if (Ty.isVector()) 4376 return false; 4377 4378 if (BaseOpcode->AtomicX2) { 4379 Register VData1 = MI.getOperand(3).getReg(); 4380 // The two values are packed in one register. 4381 LLT PackedTy = LLT::fixed_vector(2, Ty); 4382 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4383 MI.getOperand(2).setReg(Concat.getReg(0)); 4384 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4385 } 4386 } 4387 4388 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4389 4390 // Optimize _L to _LZ when _L is zero 4391 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4392 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4393 const ConstantFP *ConstantLod; 4394 4395 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4396 m_GFCst(ConstantLod))) { 4397 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4398 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4399 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4400 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4401 Intr->Dim); 4402 4403 // The starting indexes should remain in the same place.
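// Dropping the LOD operand shrinks the vaddr count by one; the _lz form
// otherwise uses the same operand layout.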
4404 --CorrectedNumVAddrs; 4405 4406 MI.getOperand(MI.getNumExplicitDefs()) 4407 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4408 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4409 Intr = NewImageDimIntr; 4410 } 4411 } 4412 } 4413 4414 // Optimize _mip away when 'lod' is zero 4415 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4416 int64_t ConstantLod; 4417 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4418 m_ICst(ConstantLod))) { 4419 if (ConstantLod == 0) { 4420 // TODO: Change intrinsic opcode and remove operand instead of replacing 4421 // it with 0, as the _L to _LZ handling is done above. 4422 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4423 --CorrectedNumVAddrs; 4424 } 4425 } 4426 } 4427 4428 // Rewrite the addressing register layout before doing anything else. 4429 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4430 // 16 bit gradients are supported, but are tied to the A16 control 4431 // so both gradients and addresses must be 16 bit. 4432 return false; 4433 } 4434 4435 if (IsA16 && !ST.hasA16()) { 4436 // A16 not supported 4437 return false; 4438 } 4439 4440 if (IsA16 || IsG16) { 4441 if (Intr->NumVAddrs > 1) { 4442 SmallVector<Register, 4> PackedRegs; 4443 4444 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4445 IsG16); 4446 4447 // See also below in the non-a16 branch 4448 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4449 PackedRegs.size() <= ST.getNSAMaxSize(); 4450 4451 if (!UseNSA && PackedRegs.size() > 1) { 4452 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4453 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4454 PackedRegs[0] = Concat.getReg(0); 4455 PackedRegs.resize(1); 4456 } 4457 4458 const unsigned NumPacked = PackedRegs.size(); 4459 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4460 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4461 if (!SrcOp.isReg()) { 4462 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4463 continue; 4464 } 4465 4466 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4467 4468 if (I - Intr->VAddrStart < NumPacked) 4469 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4470 else 4471 SrcOp.setReg(AMDGPU::NoRegister); 4472 } 4473 } 4474 } else { 4475 // If the register allocator cannot place the address registers contiguously 4476 // without introducing moves, then using the non-sequential address encoding 4477 // is always preferable, since it saves VALU instructions and is usually a 4478 // wash in terms of code size or even better. 4479 // 4480 // However, we currently have no way of hinting to the register allocator 4481 // that MIMG addresses should be placed contiguously when it is possible to 4482 // do so, so force non-NSA for the common 2-address case as a heuristic. 4483 // 4484 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4485 // allocation when possible. 4486 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4487 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4488 4489 if (!UseNSA && Intr->NumVAddrs > 1) 4490 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4491 Intr->NumVAddrs); 4492 } 4493 4494 int Flags = 0; 4495 if (IsA16) 4496 Flags |= 1; 4497 if (IsG16) 4498 Flags |= 2; 4499 MI.addOperand(MachineOperand::CreateImm(Flags)); 4500 4501 if (BaseOpcode->Store) { // No TFE for stores?
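// Stores only need the D16 data repacking; everything after this block deals
// with fixing up the load result.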
4502 // TODO: Handle dmask trim 4503 Register VData = MI.getOperand(1).getReg(); 4504 LLT Ty = MRI->getType(VData); 4505 if (!Ty.isVector() || Ty.getElementType() != S16) 4506 return true; 4507 4508 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4509 if (RepackedReg != VData) { 4510 MI.getOperand(1).setReg(RepackedReg); 4511 } 4512 4513 return true; 4514 } 4515 4516 Register DstReg = MI.getOperand(0).getReg(); 4517 LLT Ty = MRI->getType(DstReg); 4518 const LLT EltTy = Ty.getScalarType(); 4519 const bool IsD16 = Ty.getScalarType() == S16; 4520 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4521 4522 // Confirm that the return type is large enough for the dmask specified 4523 if (NumElts < DMaskLanes) 4524 return false; 4525 4526 if (NumElts > 4 || DMaskLanes > 4) 4527 return false; 4528 4529 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4530 const LLT AdjustedTy = 4531 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4532 4533 // The raw dword aligned data component of the load. The only legal cases 4534 // where this matters should be when using the packed D16 format, for 4535 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4536 LLT RoundedTy; 4537 4538 // S32 vector to cover all data, plus TFE result element. 4539 LLT TFETy; 4540 4541 // Register type to use for each loaded component. Will be S32 or V2S16. 4542 LLT RegTy; 4543 4544 if (IsD16 && ST.hasUnpackedD16VMem()) { 4545 RoundedTy = 4546 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4547 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4548 RegTy = S32; 4549 } else { 4550 unsigned EltSize = EltTy.getSizeInBits(); 4551 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4552 unsigned RoundedSize = 32 * RoundedElts; 4553 RoundedTy = LLT::scalarOrVector( 4554 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4555 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4556 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4557 } 4558 4559 // The return type does not need adjustment. 4560 // TODO: Should we change s16 case to s32 or <2 x s16>? 4561 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4562 return true; 4563 4564 Register Dst1Reg; 4565 4566 // Insert after the instruction. 4567 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4568 4569 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4570 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4571 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4572 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4573 4574 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4575 4576 MI.getOperand(0).setReg(NewResultReg); 4577 4578 // In the IR, TFE is supposed to be used with a 2 element struct return 4579 // type. The instruction really returns these two values in one contiguous 4580 // register, with one additional dword beyond the loaded data. Rewrite the 4581 // return type to use a single register result. 4582 4583 if (IsTFE) { 4584 Dst1Reg = MI.getOperand(1).getReg(); 4585 if (MRI->getType(Dst1Reg) != S32) 4586 return false; 4587 4588 // TODO: Make sure the TFE operand bit is set. 4589 MI.RemoveOperand(1); 4590 4591 // Handle the easy case that requires no repack instructions. 4592 if (Ty == S32) { 4593 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4594 return true; 4595 } 4596 } 4597 4598 // Now figure out how to copy the new result register back into the old 4599 // result.
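// The widened load result is unmerged into 32-bit (or <2 x s16>) pieces
// below, the trailing TFE dword is peeled off if present, and the data pieces
// are recombined to match the original result type.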
4600 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4601 4602 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4603 4604 if (ResultNumRegs == 1) { 4605 assert(!IsTFE); 4606 ResultRegs[0] = NewResultReg; 4607 } else { 4608 // We have to repack into a new vector of some kind. 4609 for (int I = 0; I != NumDataRegs; ++I) 4610 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4611 B.buildUnmerge(ResultRegs, NewResultReg); 4612 4613 // Drop the final TFE element to get the data part. The TFE result is 4614 // directly written to the right place already. 4615 if (IsTFE) 4616 ResultRegs.resize(NumDataRegs); 4617 } 4618 4619 // For an s16 scalar result, we form an s32 result with a truncate regardless 4620 // of packed vs. unpacked. 4621 if (IsD16 && !Ty.isVector()) { 4622 B.buildTrunc(DstReg, ResultRegs[0]); 4623 return true; 4624 } 4625 4626 // Avoid a build/concat_vector of 1 entry. 4627 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4628 B.buildBitcast(DstReg, ResultRegs[0]); 4629 return true; 4630 } 4631 4632 assert(Ty.isVector()); 4633 4634 if (IsD16) { 4635 // For packed D16 results with TFE enabled, all the data components are 4636 // S32. Cast back to the expected type. 4637 // 4638 // TODO: We don't really need to use load s32 elements. We would only need one 4639 // cast for the TFE result if a multiple of v2s16 was used. 4640 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4641 for (Register &Reg : ResultRegs) 4642 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4643 } else if (ST.hasUnpackedD16VMem()) { 4644 for (Register &Reg : ResultRegs) 4645 Reg = B.buildTrunc(S16, Reg).getReg(0); 4646 } 4647 } 4648 4649 auto padWithUndef = [&](LLT Ty, int NumElts) { 4650 if (NumElts == 0) 4651 return; 4652 Register Undef = B.buildUndef(Ty).getReg(0); 4653 for (int I = 0; I != NumElts; ++I) 4654 ResultRegs.push_back(Undef); 4655 }; 4656 4657 // Pad out any elements eliminated due to the dmask. 4658 LLT ResTy = MRI->getType(ResultRegs[0]); 4659 if (!ResTy.isVector()) { 4660 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4661 B.buildBuildVector(DstReg, ResultRegs); 4662 return true; 4663 } 4664 4665 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4666 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4667 4668 // Deal with the one annoying legal case. 4669 const LLT V3S16 = LLT::fixed_vector(3, 16); 4670 if (Ty == V3S16) { 4671 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4672 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4673 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4674 return true; 4675 } 4676 4677 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4678 B.buildConcatVectors(DstReg, ResultRegs); 4679 return true; 4680 } 4681 4682 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4683 LegalizerHelper &Helper, MachineInstr &MI) const { 4684 MachineIRBuilder &B = Helper.MIRBuilder; 4685 GISelChangeObserver &Observer = Helper.Observer; 4686 4687 Register Dst = MI.getOperand(0).getReg(); 4688 LLT Ty = B.getMRI()->getType(Dst); 4689 unsigned Size = Ty.getSizeInBits(); 4690 MachineFunction &MF = B.getMF(); 4691 4692 Observer.changingInstr(MI); 4693 4694 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4695 Ty = getBitcastRegisterType(Ty); 4696 Helper.bitcastDst(MI, Ty, 0); 4697 Dst = MI.getOperand(0).getReg(); 4698 B.setInsertPt(B.getMBB(), MI); 4699 } 4700 4701 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4702 // should be fixed to have a memory operand. Since it's readnone, we're not 4703 // allowed to add one. 4704 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4705 MI.RemoveOperand(1); // Remove intrinsic ID 4706 4707 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4708 // TODO: Should this use datalayout alignment? 4709 const unsigned MemSize = (Size + 7) / 8; 4710 const Align MemAlign(4); 4711 MachineMemOperand *MMO = MF.getMachineMemOperand( 4712 MachinePointerInfo(), 4713 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4714 MachineMemOperand::MOInvariant, 4715 MemSize, MemAlign); 4716 MI.addMemOperand(MF, MMO); 4717 4718 // There are no 96-bit result scalar loads, but widening to 128-bit should 4719 // always be legal. We may need to restore this to a 96-bit result if it turns 4720 // out this needs to be converted to a vector load during RegBankSelect. 4721 if (!isPowerOf2_32(Size)) { 4722 if (Ty.isVector()) 4723 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4724 else 4725 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4726 } 4727 4728 Observer.changedInstr(MI); 4729 return true; 4730 } 4731 4732 // TODO: Move to selection 4733 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4734 MachineRegisterInfo &MRI, 4735 MachineIRBuilder &B) const { 4736 if (!ST.isTrapHandlerEnabled() || 4737 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4738 return legalizeTrapEndpgm(MI, MRI, B); 4739 4740 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4741 switch (*HsaAbiVer) { 4742 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4743 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4744 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4745 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4746 return ST.supportsGetDoorbellID() ? 4747 legalizeTrapHsa(MI, MRI, B) : 4748 legalizeTrapHsaQueuePtr(MI, MRI, B); 4749 } 4750 } 4751 4752 llvm_unreachable("Unknown trap handler"); 4753 } 4754 4755 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4756 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4757 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4758 MI.eraseFromParent(); 4759 return true; 4760 } 4761 4762 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4764 // Pass queue pointer to trap handler as input, and insert trap instruction 4765 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4766 Register LiveIn = 4767 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4768 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4769 return false; 4770 4771 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4772 B.buildCopy(SGPR01, LiveIn); 4773 B.buildInstr(AMDGPU::S_TRAP) 4774 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4775 .addReg(SGPR01, RegState::Implicit); 4776 4777 MI.eraseFromParent(); 4778 return true; 4779 } 4780 4781 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4782 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4783 B.buildInstr(AMDGPU::S_TRAP) 4784 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4785 MI.eraseFromParent(); 4786 return true; 4787 } 4788 4789 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4790 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4791 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4792 // accordingly 4793 if (!ST.isTrapHandlerEnabled() || 4794 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4795 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4796 "debugtrap handler not supported", 4797 MI.getDebugLoc(), DS_Warning); 4798 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4799 Ctx.diagnose(NoTrap); 4800 } else { 4801 // Insert debug-trap instruction 4802 B.buildInstr(AMDGPU::S_TRAP) 4803 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4804 } 4805 4806 MI.eraseFromParent(); 4807 return true; 4808 } 4809 4810 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4811 MachineIRBuilder &B) const { 4812 MachineRegisterInfo &MRI = *B.getMRI(); 4813 const LLT S16 = LLT::scalar(16); 4814 const LLT S32 = LLT::scalar(32); 4815 4816 Register DstReg = MI.getOperand(0).getReg(); 4817 Register NodePtr = MI.getOperand(2).getReg(); 4818 Register RayExtent = MI.getOperand(3).getReg(); 4819 Register RayOrigin = MI.getOperand(4).getReg(); 4820 Register RayDir = MI.getOperand(5).getReg(); 4821 Register RayInvDir = MI.getOperand(6).getReg(); 4822 Register TDescr = MI.getOperand(7).getReg(); 4823 4824 if (!ST.hasGFX10_AEncoding()) { 4825 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4826 "intrinsic not supported on subtarget", 4827 MI.getDebugLoc()); 4828 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4829 return false; 4830 } 4831 4832 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4833 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4834 const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4835 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 4836 const unsigned Opcodes[2][2][2] = { 4837 {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, 4838 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, 4839 {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, 4840 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, 4841 {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, 4842 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, 4843 {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, 4844 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; 4845 const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; 4846 4847 SmallVector<Register, 12> Ops; 4848 if (Is64) { 4849 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4850 Ops.push_back(Unmerge.getReg(0)); 4851 Ops.push_back(Unmerge.getReg(1)); 4852 } else { 4853 Ops.push_back(NodePtr); 4854 } 4855 Ops.push_back(RayExtent); 4856 4857 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4858 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4859 Ops.push_back(Unmerge.getReg(0)); 4860 Ops.push_back(Unmerge.getReg(1)); 4861 Ops.push_back(Unmerge.getReg(2)); 4862 }; 4863 4864 packLanes(RayOrigin); 4865 if (IsA16) { 4866 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4867 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4868 Register R1 = MRI.createGenericVirtualRegister(S32); 4869 Register R2 = MRI.createGenericVirtualRegister(S32); 4870 Register R3 = MRI.createGenericVirtualRegister(S32); 4871 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4872 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4873 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4874 Ops.push_back(R1); 4875 Ops.push_back(R2); 4876 Ops.push_back(R3); 4877 } else { 4878 packLanes(RayDir); 4879 packLanes(RayInvDir); 4880 } 4881 4882 if (!UseNSA) { 
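// Without the NSA encoding the address operands must occupy contiguous
// registers, which is modelled here by merging them into a single vector
// operand.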
4883 // Build a single vector containing all the operands so far prepared. 4884 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4885 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4886 Ops.clear(); 4887 Ops.push_back(MergedOps); 4888 } 4889 4890 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4891 .addDef(DstReg) 4892 .addImm(Opcode); 4893 4894 for (Register R : Ops) { 4895 MIB.addUse(R); 4896 } 4897 4898 MIB.addUse(TDescr) 4899 .addImm(IsA16 ? 1 : 0) 4900 .cloneMemRefs(MI); 4901 4902 MI.eraseFromParent(); 4903 return true; 4904 } 4905 4906 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4907 MachineInstr &MI) const { 4908 MachineIRBuilder &B = Helper.MIRBuilder; 4909 MachineRegisterInfo &MRI = *B.getMRI(); 4910 4911 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4912 auto IntrID = MI.getIntrinsicID(); 4913 switch (IntrID) { 4914 case Intrinsic::amdgcn_if: 4915 case Intrinsic::amdgcn_else: { 4916 MachineInstr *Br = nullptr; 4917 MachineBasicBlock *UncondBrTarget = nullptr; 4918 bool Negated = false; 4919 if (MachineInstr *BrCond = 4920 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4921 const SIRegisterInfo *TRI 4922 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4923 4924 Register Def = MI.getOperand(1).getReg(); 4925 Register Use = MI.getOperand(3).getReg(); 4926 4927 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4928 4929 if (Negated) 4930 std::swap(CondBrTarget, UncondBrTarget); 4931 4932 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4933 if (IntrID == Intrinsic::amdgcn_if) { 4934 B.buildInstr(AMDGPU::SI_IF) 4935 .addDef(Def) 4936 .addUse(Use) 4937 .addMBB(UncondBrTarget); 4938 } else { 4939 B.buildInstr(AMDGPU::SI_ELSE) 4940 .addDef(Def) 4941 .addUse(Use) 4942 .addMBB(UncondBrTarget); 4943 } 4944 4945 if (Br) { 4946 Br->getOperand(0).setMBB(CondBrTarget); 4947 } else { 4948 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4949 // since we're swapping branch targets it needs to be reinserted. 
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
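    // The wavefront size is a fixed subtarget property (32 or 64), so the
    // intrinsic folds to an immediate constant here.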
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
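  // The ray-tracing intrinsic needs custom operand packing; see
  // legalizeBVHIntrinsic above.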
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}