//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
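/// For example, <3 x s16> (48 bits) satisfies this predicate and is widened to
/// <4 x s16>, while <2 x s16>, <4 x s16>, and <3 x s32> do not.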
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 if (Size <= 32) { 138 // <2 x s8> -> s16 139 // <4 x s8> -> s32 140 return LLT::scalar(Size); 141 } 142 143 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 144 } 145 146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 147 return [=](const LegalityQuery &Query) { 148 const LLT Ty = Query.Types[TypeIdx]; 149 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 150 }; 151 } 152 153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 unsigned Size = Ty.getSizeInBits(); 157 assert(Size % 32 == 0); 158 return std::make_pair( 159 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 160 }; 161 } 162 163 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 164 return [=](const LegalityQuery &Query) { 165 const LLT QueryTy = Query.Types[TypeIdx]; 166 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 167 }; 168 } 169 170 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 171 return [=](const LegalityQuery &Query) { 172 const LLT QueryTy = 
Query.Types[TypeIdx]; 173 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 174 }; 175 } 176 177 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 178 return [=](const LegalityQuery &Query) { 179 const LLT QueryTy = Query.Types[TypeIdx]; 180 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 181 }; 182 } 183 184 static bool isRegisterSize(unsigned Size) { 185 return Size % 32 == 0 && Size <= MaxRegisterSize; 186 } 187 188 static bool isRegisterVectorElementType(LLT EltTy) { 189 const int EltSize = EltTy.getSizeInBits(); 190 return EltSize == 16 || EltSize % 32 == 0; 191 } 192 193 static bool isRegisterVectorType(LLT Ty) { 194 const int EltSize = Ty.getElementType().getSizeInBits(); 195 return EltSize == 32 || EltSize == 64 || 196 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 197 EltSize == 128 || EltSize == 256; 198 } 199 200 static bool isRegisterType(LLT Ty) { 201 if (!isRegisterSize(Ty.getSizeInBits())) 202 return false; 203 204 if (Ty.isVector()) 205 return isRegisterVectorType(Ty); 206 207 return true; 208 } 209 210 // Any combination of 32 or 64-bit elements up the maximum register size, and 211 // multiples of v2s16. 212 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 213 return [=](const LegalityQuery &Query) { 214 return isRegisterType(Query.Types[TypeIdx]); 215 }; 216 } 217 218 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 219 return [=](const LegalityQuery &Query) { 220 const LLT QueryTy = Query.Types[TypeIdx]; 221 if (!QueryTy.isVector()) 222 return false; 223 const LLT EltTy = QueryTy.getElementType(); 224 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 225 }; 226 } 227 228 // If we have a truncating store or an extending load with a data size larger 229 // than 32-bits, we need to reduce to a 32-bit type. 230 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 231 return [=](const LegalityQuery &Query) { 232 const LLT Ty = Query.Types[TypeIdx]; 233 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 234 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 235 }; 236 } 237 238 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 239 // handle some operations by just promoting the register during 240 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 241 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 242 bool IsLoad) { 243 switch (AS) { 244 case AMDGPUAS::PRIVATE_ADDRESS: 245 // FIXME: Private element size. 246 return ST.enableFlatScratch() ? 128 : 32; 247 case AMDGPUAS::LOCAL_ADDRESS: 248 return ST.useDS128() ? 128 : 64; 249 case AMDGPUAS::GLOBAL_ADDRESS: 250 case AMDGPUAS::CONSTANT_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 252 // Treat constant and global as identical. SMRD loads are sometimes usable for 253 // global loads (ideally constant address space should be eliminated) 254 // depending on the context. Legality cannot be context dependent, but 255 // RegBankSelect can split the load as necessary depending on the pointer 256 // register bank/uniformity and if the memory is invariant or not written in a 257 // kernel. 258 return IsLoad ? 512 : 128; 259 default: 260 // Flat addresses may contextually need to be split to 32-bit parts if they 261 // may alias scratch depending on the subtarget. 
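    // For example, a 256-bit flat access exceeds this limit and is broken into
    // 128-bit pieces by the narrowing/fewer-elements rules that consult it.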
262 return 128; 263 } 264 } 265 266 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 267 const LegalityQuery &Query) { 268 const LLT Ty = Query.Types[0]; 269 270 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 271 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 272 273 unsigned RegSize = Ty.getSizeInBits(); 274 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 275 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 276 unsigned AS = Query.Types[1].getAddressSpace(); 277 278 // All of these need to be custom lowered to cast the pointer operand. 279 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 280 return false; 281 282 // Do not handle extending vector loads. 283 if (Ty.isVector() && MemSize != RegSize) 284 return false; 285 286 // TODO: We should be able to widen loads if the alignment is high enough, but 287 // we also need to modify the memory access size. 288 #if 0 289 // Accept widening loads based on alignment. 290 if (IsLoad && MemSize < Size) 291 MemSize = std::max(MemSize, Align); 292 #endif 293 294 // Only 1-byte and 2-byte to 32-bit extloads are valid. 295 if (MemSize != RegSize && RegSize != 32) 296 return false; 297 298 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 299 return false; 300 301 switch (MemSize) { 302 case 8: 303 case 16: 304 case 32: 305 case 64: 306 case 128: 307 break; 308 case 96: 309 if (!ST.hasDwordx3LoadStores()) 310 return false; 311 break; 312 case 256: 313 case 512: 314 // These may contextually need to be broken down. 315 break; 316 default: 317 return false; 318 } 319 320 assert(RegSize >= MemSize); 321 322 if (AlignBits < MemSize) { 323 const SITargetLowering *TLI = ST.getTargetLowering(); 324 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 325 Align(AlignBits / 8))) 326 return false; 327 } 328 329 return true; 330 } 331 332 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 333 // workaround this. Eventually it should ignore the type for loads and only care 334 // about the size. Return true in cases where we will workaround this for now by 335 // bitcasting. 336 static bool loadStoreBitcastWorkaround(const LLT Ty) { 337 if (EnableNewLegality) 338 return false; 339 340 const unsigned Size = Ty.getSizeInBits(); 341 if (Size <= 64) 342 return false; 343 if (!Ty.isVector()) 344 return true; 345 346 LLT EltTy = Ty.getElementType(); 347 if (EltTy.isPointer()) 348 return true; 349 350 unsigned EltSize = EltTy.getSizeInBits(); 351 return EltSize != 32 && EltSize != 64; 352 } 353 354 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 355 const LLT Ty = Query.Types[0]; 356 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 357 !loadStoreBitcastWorkaround(Ty); 358 } 359 360 /// Return true if a load or store of the type should be lowered with a bitcast 361 /// to a different type. 362 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 363 const LLT MemTy) { 364 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 365 const unsigned Size = Ty.getSizeInBits(); 366 if (Size != MemSizeInBits) 367 return Size <= 32 && Ty.isVector(); 368 369 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 370 return true; 371 372 // Don't try to handle bitcasting vector ext loads for now. 
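  // For example, a <2 x s8> or <4 x s8> access is bitcast to s16/s32 by this
  // path, while extending vector loads fall through to the other rules.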
373 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 374 (Size <= 32 || isRegisterSize(Size)) && 375 !isRegisterVectorElementType(Ty.getElementType()); 376 } 377 378 /// Return true if we should legalize a load by widening an odd sized memory 379 /// access up to the alignment. Note this case when the memory access itself 380 /// changes, not the size of the result register. 381 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 382 uint64_t AlignInBits, unsigned AddrSpace, 383 unsigned Opcode) { 384 unsigned SizeInBits = MemoryTy.getSizeInBits(); 385 // We don't want to widen cases that are naturally legal. 386 if (isPowerOf2_32(SizeInBits)) 387 return false; 388 389 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 390 // end up widening these for a scalar load during RegBankSelect, since there 391 // aren't 96-bit scalar loads. 392 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 393 return false; 394 395 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 396 return false; 397 398 // A load is known dereferenceable up to the alignment, so it's legal to widen 399 // to it. 400 // 401 // TODO: Could check dereferenceable for less aligned cases. 402 unsigned RoundedSize = NextPowerOf2(SizeInBits); 403 if (AlignInBits < RoundedSize) 404 return false; 405 406 // Do not widen if it would introduce a slow unaligned load. 407 const SITargetLowering *TLI = ST.getTargetLowering(); 408 bool Fast = false; 409 return TLI->allowsMisalignedMemoryAccessesImpl( 410 RoundedSize, AddrSpace, Align(AlignInBits / 8), 411 MachineMemOperand::MOLoad, &Fast) && 412 Fast; 413 } 414 415 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 416 unsigned Opcode) { 417 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 418 return false; 419 420 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 421 Query.MMODescrs[0].AlignInBits, 422 Query.Types[1].getAddressSpace(), Opcode); 423 } 424 425 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 426 const GCNTargetMachine &TM) 427 : ST(ST_) { 428 using namespace TargetOpcode; 429 430 auto GetAddrSpacePtr = [&TM](unsigned AS) { 431 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 432 }; 433 434 const LLT S1 = LLT::scalar(1); 435 const LLT S8 = LLT::scalar(8); 436 const LLT S16 = LLT::scalar(16); 437 const LLT S32 = LLT::scalar(32); 438 const LLT S64 = LLT::scalar(64); 439 const LLT S128 = LLT::scalar(128); 440 const LLT S256 = LLT::scalar(256); 441 const LLT S512 = LLT::scalar(512); 442 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 443 444 const LLT V2S8 = LLT::fixed_vector(2, 8); 445 const LLT V2S16 = LLT::fixed_vector(2, 16); 446 const LLT V4S16 = LLT::fixed_vector(4, 16); 447 448 const LLT V2S32 = LLT::fixed_vector(2, 32); 449 const LLT V3S32 = LLT::fixed_vector(3, 32); 450 const LLT V4S32 = LLT::fixed_vector(4, 32); 451 const LLT V5S32 = LLT::fixed_vector(5, 32); 452 const LLT V6S32 = LLT::fixed_vector(6, 32); 453 const LLT V7S32 = LLT::fixed_vector(7, 32); 454 const LLT V8S32 = LLT::fixed_vector(8, 32); 455 const LLT V9S32 = LLT::fixed_vector(9, 32); 456 const LLT V10S32 = LLT::fixed_vector(10, 32); 457 const LLT V11S32 = LLT::fixed_vector(11, 32); 458 const LLT V12S32 = LLT::fixed_vector(12, 32); 459 const LLT V13S32 = LLT::fixed_vector(13, 32); 460 const LLT V14S32 = LLT::fixed_vector(14, 32); 461 const LLT V15S32 = LLT::fixed_vector(15, 32); 462 const LLT V16S32 = LLT::fixed_vector(16, 32); 463 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 464 465 const LLT V2S64 = LLT::fixed_vector(2, 64); 466 const LLT V3S64 = LLT::fixed_vector(3, 64); 467 const LLT V4S64 = LLT::fixed_vector(4, 64); 468 const LLT V5S64 = LLT::fixed_vector(5, 64); 469 const LLT V6S64 = LLT::fixed_vector(6, 64); 470 const LLT V7S64 = LLT::fixed_vector(7, 64); 471 const LLT V8S64 = LLT::fixed_vector(8, 64); 472 const LLT V16S64 = LLT::fixed_vector(16, 64); 473 474 std::initializer_list<LLT> AllS32Vectors = 475 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 476 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 477 std::initializer_list<LLT> AllS64Vectors = 478 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 479 480 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 481 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 482 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 483 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 484 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 485 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 486 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 487 488 const LLT CodePtr = FlatPtr; 489 490 const std::initializer_list<LLT> AddrSpaces64 = { 491 GlobalPtr, ConstantPtr, FlatPtr 492 }; 493 494 const std::initializer_list<LLT> AddrSpaces32 = { 495 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 496 }; 497 498 const std::initializer_list<LLT> FPTypesBase = { 499 S32, S64 500 }; 501 502 const std::initializer_list<LLT> FPTypes16 = { 503 S32, S64, S16 504 }; 505 506 const std::initializer_list<LLT> FPTypesPK16 = { 507 S32, S64, S16, V2S16 508 }; 509 510 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 511 512 // s1 for VCC branches, s32 for SCC branches. 513 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 514 515 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 516 // elements for v3s16 517 getActionDefinitionsBuilder(G_PHI) 518 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 519 .legalFor(AllS32Vectors) 520 .legalFor(AllS64Vectors) 521 .legalFor(AddrSpaces64) 522 .legalFor(AddrSpaces32) 523 .legalIf(isPointer(0)) 524 .clampScalar(0, S16, S256) 525 .widenScalarToNextPow2(0, 32) 526 .clampMaxNumElements(0, S32, 16) 527 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 528 .scalarize(0); 529 530 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 531 // Full set of gfx9 features. 532 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 533 .legalFor({S32, S16, V2S16}) 534 .minScalar(0, S16) 535 .clampMaxNumElementsStrict(0, S16, 2) 536 .widenScalarToNextMultipleOf(0, 32) 537 .maxScalar(0, S32) 538 .scalarize(0); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElementsStrict(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .minScalar(0, S16) 551 .widenScalarToNextMultipleOf(0, 32) 552 .maxScalar(0, S32) 553 .scalarize(0); 554 555 // Technically the saturating operations require clamp bit support, but this 556 // was introduced at the same time as 16-bit operations. 
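    // In other words, any subtarget with 16-bit instructions also has the
    // clamp bit, so s16/s32 G_UADDSAT/G_USUBSAT can rely on the clamp modifier
    // here; the signed variants are still lowered below.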
557 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 558 .legalFor({S32, S16}) // Clamp modifier 559 .minScalar(0, S16) 560 .scalarize(0) 561 .widenScalarToNextPow2(0, 16) 562 .lower(); 563 564 // We're just lowering this, but it helps get a better result to try to 565 // coerce to the desired type first. 566 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 567 .minScalar(0, S16) 568 .scalarize(0) 569 .lower(); 570 } else { 571 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 572 .legalFor({S32}) 573 .widenScalarToNextMultipleOf(0, 32) 574 .clampScalar(0, S32, S32) 575 .scalarize(0); 576 577 if (ST.hasIntClamp()) { 578 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 579 .legalFor({S32}) // Clamp modifier. 580 .scalarize(0) 581 .minScalarOrElt(0, S32) 582 .lower(); 583 } else { 584 // Clamp bit support was added in VI, along with 16-bit operations. 585 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 586 .minScalar(0, S32) 587 .scalarize(0) 588 .lower(); 589 } 590 591 // FIXME: DAG expansion gets better results. The widening uses the smaller 592 // range values and goes for the min/max lowering directly. 593 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 594 .minScalar(0, S32) 595 .scalarize(0) 596 .lower(); 597 } 598 599 getActionDefinitionsBuilder( 600 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 601 .customFor({S32, S64}) 602 .clampScalar(0, S32, S64) 603 .widenScalarToNextPow2(0, 32) 604 .scalarize(0); 605 606 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 607 .legalFor({S32}) 608 .maxScalar(0, S32); 609 610 if (ST.hasVOP3PInsts()) { 611 Mulh 612 .clampMaxNumElements(0, S8, 2) 613 .lowerFor({V2S8}); 614 } 615 616 Mulh 617 .scalarize(0) 618 .lower(); 619 620 // Report legal for any types we can handle anywhere. For the cases only legal 621 // on the SALU, RegBankSelect will be able to re-legalize. 622 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 623 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 624 .clampScalar(0, S32, S64) 625 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 626 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 627 .widenScalarToNextPow2(0) 628 .scalarize(0); 629 630 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 631 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 632 .legalFor({{S32, S1}, {S32, S32}}) 633 .minScalar(0, S32) 634 // TODO: .scalarize(0) 635 .lower(); 636 637 getActionDefinitionsBuilder(G_BITCAST) 638 // Don't worry about the size constraint. 639 .legalIf(all(isRegisterType(0), isRegisterType(1))) 640 .lower(); 641 642 643 getActionDefinitionsBuilder(G_CONSTANT) 644 .legalFor({S1, S32, S64, S16, GlobalPtr, 645 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 646 .legalIf(isPointer(0)) 647 .clampScalar(0, S32, S64) 648 .widenScalarToNextPow2(0); 649 650 getActionDefinitionsBuilder(G_FCONSTANT) 651 .legalFor({S32, S64, S16}) 652 .clampScalar(0, S16, S64); 653 654 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 655 .legalIf(isRegisterType(0)) 656 // s1 and s16 are special cases because they have legal operations on 657 // them, but don't really occupy registers in the normal way. 
658 .legalFor({S1, S16}) 659 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 660 .clampScalarOrElt(0, S32, MaxScalar) 661 .widenScalarToNextPow2(0, 32) 662 .clampMaxNumElements(0, S32, 16); 663 664 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 665 666 // If the amount is divergent, we have to do a wave reduction to get the 667 // maximum value, so this is expanded during RegBankSelect. 668 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 669 .legalFor({{PrivatePtr, S32}}); 670 671 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 672 .customIf(typeIsNot(0, PrivatePtr)); 673 674 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 675 676 auto &FPOpActions = getActionDefinitionsBuilder( 677 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 678 .legalFor({S32, S64}); 679 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 680 .customFor({S32, S64}); 681 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 682 .customFor({S32, S64}); 683 684 if (ST.has16BitInsts()) { 685 if (ST.hasVOP3PInsts()) 686 FPOpActions.legalFor({S16, V2S16}); 687 else 688 FPOpActions.legalFor({S16}); 689 690 TrigActions.customFor({S16}); 691 FDIVActions.customFor({S16}); 692 } 693 694 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 695 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 696 697 if (ST.hasVOP3PInsts()) { 698 MinNumMaxNum.customFor(FPTypesPK16) 699 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 700 .clampMaxNumElements(0, S16, 2) 701 .clampScalar(0, S16, S64) 702 .scalarize(0); 703 } else if (ST.has16BitInsts()) { 704 MinNumMaxNum.customFor(FPTypes16) 705 .clampScalar(0, S16, S64) 706 .scalarize(0); 707 } else { 708 MinNumMaxNum.customFor(FPTypesBase) 709 .clampScalar(0, S32, S64) 710 .scalarize(0); 711 } 712 713 if (ST.hasVOP3PInsts()) 714 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 715 716 FPOpActions 717 .scalarize(0) 718 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 719 720 TrigActions 721 .scalarize(0) 722 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 723 724 FDIVActions 725 .scalarize(0) 726 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 727 728 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 729 .legalFor(FPTypesPK16) 730 .clampMaxNumElementsStrict(0, S16, 2) 731 .scalarize(0) 732 .clampScalar(0, S16, S64); 733 734 if (ST.has16BitInsts()) { 735 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 736 .legalFor({S32, S64, S16}) 737 .scalarize(0) 738 .clampScalar(0, S16, S64); 739 } else { 740 getActionDefinitionsBuilder(G_FSQRT) 741 .legalFor({S32, S64}) 742 .scalarize(0) 743 .clampScalar(0, S32, S64); 744 745 if (ST.hasFractBug()) { 746 getActionDefinitionsBuilder(G_FFLOOR) 747 .customFor({S64}) 748 .legalFor({S32, S64}) 749 .scalarize(0) 750 .clampScalar(0, S32, S64); 751 } else { 752 getActionDefinitionsBuilder(G_FFLOOR) 753 .legalFor({S32, S64}) 754 .scalarize(0) 755 .clampScalar(0, S32, S64); 756 } 757 } 758 759 getActionDefinitionsBuilder(G_FPTRUNC) 760 .legalFor({{S32, S64}, {S16, S32}}) 761 .scalarize(0) 762 .lower(); 763 764 getActionDefinitionsBuilder(G_FPEXT) 765 .legalFor({{S64, S32}, {S32, S16}}) 766 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 767 .scalarize(0); 768 769 getActionDefinitionsBuilder(G_FSUB) 770 // Use actual fsub instruction 771 .legalFor({S32}) 772 // Must use fadd + fneg 773 .lowerFor({S64, S16, V2S16}) 774 .scalarize(0) 775 .clampScalar(0, S32, S64); 776 777 // Whether this is legal depends on the floating point mode for the function. 
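  // (The custom action inspects the per-function denormal mode and keeps
  // G_FMAD only when the corresponding mad/mac instruction is usable;
  // otherwise it is expanded.)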
778 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 779 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 780 FMad.customFor({S32, S16}); 781 else if (ST.hasMadMacF32Insts()) 782 FMad.customFor({S32}); 783 else if (ST.hasMadF16()) 784 FMad.customFor({S16}); 785 FMad.scalarize(0) 786 .lower(); 787 788 auto &FRem = getActionDefinitionsBuilder(G_FREM); 789 if (ST.has16BitInsts()) { 790 FRem.customFor({S16, S32, S64}); 791 } else { 792 FRem.minScalar(0, S32) 793 .customFor({S32, S64}); 794 } 795 FRem.scalarize(0); 796 797 // TODO: Do we need to clamp maximum bitwidth? 798 getActionDefinitionsBuilder(G_TRUNC) 799 .legalIf(isScalar(0)) 800 .legalFor({{V2S16, V2S32}}) 801 .clampMaxNumElements(0, S16, 2) 802 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 803 // situations (like an invalid implicit use), we don't want to infinite loop 804 // in the legalizer. 805 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 806 .alwaysLegal(); 807 808 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 809 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 810 {S32, S1}, {S64, S1}, {S16, S1}}) 811 .scalarize(0) 812 .clampScalar(0, S32, S64) 813 .widenScalarToNextPow2(1, 32); 814 815 // TODO: Split s1->s64 during regbankselect for VALU. 816 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 817 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 818 .lowerIf(typeIs(1, S1)) 819 .customFor({{S32, S64}, {S64, S64}}); 820 if (ST.has16BitInsts()) 821 IToFP.legalFor({{S16, S16}}); 822 IToFP.clampScalar(1, S32, S64) 823 .minScalar(0, S32) 824 .scalarize(0) 825 .widenScalarToNextPow2(1); 826 827 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 828 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 829 .customFor({{S64, S32}, {S64, S64}}) 830 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 831 if (ST.has16BitInsts()) 832 FPToI.legalFor({{S16, S16}}); 833 else 834 FPToI.minScalar(1, S32); 835 836 FPToI.minScalar(0, S32) 837 .widenScalarToNextPow2(0, 32) 838 .scalarize(0) 839 .lower(); 840 841 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 842 .customFor({S16, S32}) 843 .scalarize(0) 844 .lower(); 845 846 // Lower roundeven into G_FRINT 847 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 848 .scalarize(0) 849 .lower(); 850 851 if (ST.has16BitInsts()) { 852 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 853 .legalFor({S16, S32, S64}) 854 .clampScalar(0, S16, S64) 855 .scalarize(0); 856 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 857 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 858 .legalFor({S32, S64}) 859 .clampScalar(0, S32, S64) 860 .scalarize(0); 861 } else { 862 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 863 .legalFor({S32}) 864 .customFor({S64}) 865 .clampScalar(0, S32, S64) 866 .scalarize(0); 867 } 868 869 getActionDefinitionsBuilder(G_PTR_ADD) 870 .legalIf(all(isPointer(0), sameSize(0, 1))) 871 .scalarize(0) 872 .scalarSameSizeAs(1, 0); 873 874 getActionDefinitionsBuilder(G_PTRMASK) 875 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 876 .scalarSameSizeAs(1, 0) 877 .scalarize(0); 878 879 auto &CmpBuilder = 880 getActionDefinitionsBuilder(G_ICMP) 881 // The compare output type differs based on the register bank of the output, 882 // so make both s1 and s32 legal. 
883 // 884 // Scalar compares producing output in scc will be promoted to s32, as that 885 // is the allocatable register type that will be needed for the copy from 886 // scc. This will be promoted during RegBankSelect, and we assume something 887 // before that won't try to use s32 result types. 888 // 889 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 890 // bank. 891 .legalForCartesianProduct( 892 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 893 .legalForCartesianProduct( 894 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 895 if (ST.has16BitInsts()) { 896 CmpBuilder.legalFor({{S1, S16}}); 897 } 898 899 CmpBuilder 900 .widenScalarToNextPow2(1) 901 .clampScalar(1, S32, S64) 902 .scalarize(0) 903 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 904 905 getActionDefinitionsBuilder(G_FCMP) 906 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 907 .widenScalarToNextPow2(1) 908 .clampScalar(1, S32, S64) 909 .scalarize(0); 910 911 // FIXME: fpow has a selection pattern that should move to custom lowering. 912 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 913 if (ST.has16BitInsts()) 914 Exp2Ops.legalFor({S32, S16}); 915 else 916 Exp2Ops.legalFor({S32}); 917 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 918 Exp2Ops.scalarize(0); 919 920 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 921 if (ST.has16BitInsts()) 922 ExpOps.customFor({{S32}, {S16}}); 923 else 924 ExpOps.customFor({S32}); 925 ExpOps.clampScalar(0, MinScalarFPTy, S32) 926 .scalarize(0); 927 928 getActionDefinitionsBuilder(G_FPOWI) 929 .clampScalar(0, MinScalarFPTy, S32) 930 .lower(); 931 932 // The 64-bit versions produce 32-bit results, but only on the SALU. 933 getActionDefinitionsBuilder(G_CTPOP) 934 .legalFor({{S32, S32}, {S32, S64}}) 935 .clampScalar(0, S32, S32) 936 .widenScalarToNextPow2(1, 32) 937 .clampScalar(1, S32, S64) 938 .scalarize(0) 939 .widenScalarToNextPow2(0, 32); 940 941 942 // The hardware instructions return a different result on 0 than the generic 943 // instructions expect. The hardware produces -1, but these produce the 944 // bitwidth. 945 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 946 .scalarize(0) 947 .clampScalar(0, S32, S32) 948 .clampScalar(1, S32, S64) 949 .widenScalarToNextPow2(0, 32) 950 .widenScalarToNextPow2(1, 32) 951 .custom(); 952 953 // The 64-bit versions produce 32-bit results, but only on the SALU. 954 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 955 .legalFor({{S32, S32}, {S32, S64}}) 956 .clampScalar(0, S32, S32) 957 .clampScalar(1, S32, S64) 958 .scalarize(0) 959 .widenScalarToNextPow2(0, 32) 960 .widenScalarToNextPow2(1, 32); 961 962 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 963 // RegBankSelect. 964 getActionDefinitionsBuilder(G_BITREVERSE) 965 .legalFor({S32, S64}) 966 .clampScalar(0, S32, S64) 967 .scalarize(0) 968 .widenScalarToNextPow2(0); 969 970 if (ST.has16BitInsts()) { 971 getActionDefinitionsBuilder(G_BSWAP) 972 .legalFor({S16, S32, V2S16}) 973 .clampMaxNumElementsStrict(0, S16, 2) 974 // FIXME: Fixing non-power-of-2 before clamp is workaround for 975 // narrowScalar limitation. 
976 .widenScalarToNextPow2(0) 977 .clampScalar(0, S16, S32) 978 .scalarize(0); 979 980 if (ST.hasVOP3PInsts()) { 981 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 982 .legalFor({S32, S16, V2S16}) 983 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 984 .clampMaxNumElements(0, S16, 2) 985 .minScalar(0, S16) 986 .widenScalarToNextPow2(0) 987 .scalarize(0) 988 .lower(); 989 } else { 990 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 991 .legalFor({S32, S16}) 992 .widenScalarToNextPow2(0) 993 .minScalar(0, S16) 994 .scalarize(0) 995 .lower(); 996 } 997 } else { 998 // TODO: Should have same legality without v_perm_b32 999 getActionDefinitionsBuilder(G_BSWAP) 1000 .legalFor({S32}) 1001 .lowerIf(scalarNarrowerThan(0, 32)) 1002 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1003 // narrowScalar limitation. 1004 .widenScalarToNextPow2(0) 1005 .maxScalar(0, S32) 1006 .scalarize(0) 1007 .lower(); 1008 1009 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1010 .legalFor({S32}) 1011 .minScalar(0, S32) 1012 .widenScalarToNextPow2(0) 1013 .scalarize(0) 1014 .lower(); 1015 } 1016 1017 getActionDefinitionsBuilder(G_INTTOPTR) 1018 // List the common cases 1019 .legalForCartesianProduct(AddrSpaces64, {S64}) 1020 .legalForCartesianProduct(AddrSpaces32, {S32}) 1021 .scalarize(0) 1022 // Accept any address space as long as the size matches 1023 .legalIf(sameSize(0, 1)) 1024 .widenScalarIf(smallerThan(1, 0), 1025 [](const LegalityQuery &Query) { 1026 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1027 }) 1028 .narrowScalarIf(largerThan(1, 0), 1029 [](const LegalityQuery &Query) { 1030 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1031 }); 1032 1033 getActionDefinitionsBuilder(G_PTRTOINT) 1034 // List the common cases 1035 .legalForCartesianProduct(AddrSpaces64, {S64}) 1036 .legalForCartesianProduct(AddrSpaces32, {S32}) 1037 .scalarize(0) 1038 // Accept any address space as long as the size matches 1039 .legalIf(sameSize(0, 1)) 1040 .widenScalarIf(smallerThan(0, 1), 1041 [](const LegalityQuery &Query) { 1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1043 }) 1044 .narrowScalarIf( 1045 largerThan(0, 1), 1046 [](const LegalityQuery &Query) { 1047 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1048 }); 1049 1050 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1051 .scalarize(0) 1052 .custom(); 1053 1054 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1055 bool IsLoad) -> bool { 1056 const LLT DstTy = Query.Types[0]; 1057 1058 // Split vector extloads. 1059 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1060 1061 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1062 return true; 1063 1064 const LLT PtrTy = Query.Types[1]; 1065 unsigned AS = PtrTy.getAddressSpace(); 1066 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1067 return true; 1068 1069 // Catch weird sized loads that don't evenly divide into the access sizes 1070 // TODO: May be able to widen depending on alignment etc. 1071 unsigned NumRegs = (MemSize + 31) / 32; 1072 if (NumRegs == 3) { 1073 if (!ST.hasDwordx3LoadStores()) 1074 return true; 1075 } else { 1076 // If the alignment allows, these should have been widened. 1077 if (!isPowerOf2_32(NumRegs)) 1078 return true; 1079 } 1080 1081 return false; 1082 }; 1083 1084 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1085 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1086 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1087 1088 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1089 // LDS 1090 // TODO: Unsupported flat for SI. 1091 1092 for (unsigned Op : {G_LOAD, G_STORE}) { 1093 const bool IsStore = Op == G_STORE; 1094 1095 auto &Actions = getActionDefinitionsBuilder(Op); 1096 // Explicitly list some common cases. 1097 // TODO: Does this help compile time at all? 1098 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1099 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1100 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1101 {S64, GlobalPtr, S64, GlobalAlign32}, 1102 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1103 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1104 {S32, GlobalPtr, S8, GlobalAlign8}, 1105 {S32, GlobalPtr, S16, GlobalAlign16}, 1106 1107 {S32, LocalPtr, S32, 32}, 1108 {S64, LocalPtr, S64, 32}, 1109 {V2S32, LocalPtr, V2S32, 32}, 1110 {S32, LocalPtr, S8, 8}, 1111 {S32, LocalPtr, S16, 16}, 1112 {V2S16, LocalPtr, S32, 32}, 1113 1114 {S32, PrivatePtr, S32, 32}, 1115 {S32, PrivatePtr, S8, 8}, 1116 {S32, PrivatePtr, S16, 16}, 1117 {V2S16, PrivatePtr, S32, 32}, 1118 1119 {S32, ConstantPtr, S32, GlobalAlign32}, 1120 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1121 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1122 {S64, ConstantPtr, S64, GlobalAlign32}, 1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1124 Actions.legalIf( 1125 [=](const LegalityQuery &Query) -> bool { 1126 return isLoadStoreLegal(ST, Query); 1127 }); 1128 1129 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1130 // 64-bits. 1131 // 1132 // TODO: Should generalize bitcast action into coerce, which will also cover 1133 // inserting addrspacecasts. 1134 Actions.customIf(typeIs(1, Constant32Ptr)); 1135 1136 // Turn any illegal element vectors into something easier to deal 1137 // with. These will ultimately produce 32-bit scalar shifts to extract the 1138 // parts anyway. 1139 // 1140 // For odd 16-bit element vectors, prefer to split those into pieces with 1141 // 16-bit vector parts. 1142 Actions.bitcastIf( 1143 [=](const LegalityQuery &Query) -> bool { 1144 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1145 Query.MMODescrs[0].MemoryTy); 1146 }, bitcastToRegisterType(0)); 1147 1148 if (!IsStore) { 1149 // Widen suitably aligned loads by loading extra bytes. The standard 1150 // legalization actions can't properly express widening memory operands. 1151 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1152 return shouldWidenLoad(ST, Query, G_LOAD); 1153 }); 1154 } 1155 1156 // FIXME: load/store narrowing should be moved to lower action 1157 Actions 1158 .narrowScalarIf( 1159 [=](const LegalityQuery &Query) -> bool { 1160 return !Query.Types[0].isVector() && 1161 needToSplitMemOp(Query, Op == G_LOAD); 1162 }, 1163 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1164 const LLT DstTy = Query.Types[0]; 1165 const LLT PtrTy = Query.Types[1]; 1166 1167 const unsigned DstSize = DstTy.getSizeInBits(); 1168 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1169 1170 // Split extloads. 
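              // An extending access is first narrowed to its memory size so
              // the splitting logic below only has to reason about
              // non-extending accesses.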
1171 if (DstSize > MemSize) 1172 return std::make_pair(0, LLT::scalar(MemSize)); 1173 1174 unsigned MaxSize = maxSizeForAddrSpace(ST, 1175 PtrTy.getAddressSpace(), 1176 Op == G_LOAD); 1177 if (MemSize > MaxSize) 1178 return std::make_pair(0, LLT::scalar(MaxSize)); 1179 1180 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1181 return std::make_pair(0, LLT::scalar(Align)); 1182 }) 1183 .fewerElementsIf( 1184 [=](const LegalityQuery &Query) -> bool { 1185 return Query.Types[0].isVector() && 1186 needToSplitMemOp(Query, Op == G_LOAD); 1187 }, 1188 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1189 const LLT DstTy = Query.Types[0]; 1190 const LLT PtrTy = Query.Types[1]; 1191 1192 LLT EltTy = DstTy.getElementType(); 1193 unsigned MaxSize = maxSizeForAddrSpace(ST, 1194 PtrTy.getAddressSpace(), 1195 Op == G_LOAD); 1196 1197 // FIXME: Handle widened to power of 2 results better. This ends 1198 // up scalarizing. 1199 // FIXME: 3 element stores scalarized on SI 1200 1201 // Split if it's too large for the address space. 1202 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1203 if (MemSize > MaxSize) { 1204 unsigned NumElts = DstTy.getNumElements(); 1205 unsigned EltSize = EltTy.getSizeInBits(); 1206 1207 if (MaxSize % EltSize == 0) { 1208 return std::make_pair( 1209 0, LLT::scalarOrVector( 1210 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1211 } 1212 1213 unsigned NumPieces = MemSize / MaxSize; 1214 1215 // FIXME: Refine when odd breakdowns handled 1216 // The scalars will need to be re-legalized. 1217 if (NumPieces == 1 || NumPieces >= NumElts || 1218 NumElts % NumPieces != 0) 1219 return std::make_pair(0, EltTy); 1220 1221 return std::make_pair( 1222 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1223 } 1224 1225 // FIXME: We could probably handle weird extending loads better. 1226 if (DstTy.getSizeInBits() > MemSize) 1227 return std::make_pair(0, EltTy); 1228 1229 unsigned EltSize = EltTy.getSizeInBits(); 1230 unsigned DstSize = DstTy.getSizeInBits(); 1231 if (!isPowerOf2_32(DstSize)) { 1232 // We're probably decomposing an odd sized store. Try to split 1233 // to the widest type. TODO: Account for alignment. As-is it 1234 // should be OK, since the new parts will be further legalized. 1235 unsigned FloorSize = PowerOf2Floor(DstSize); 1236 return std::make_pair( 1237 0, LLT::scalarOrVector( 1238 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1239 } 1240 1241 // May need relegalization for the scalars. 1242 return std::make_pair(0, EltTy); 1243 }) 1244 .minScalar(0, S32) 1245 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1246 .widenScalarToNextPow2(0) 1247 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1248 .lower(); 1249 } 1250 1251 // FIXME: Unaligned accesses not lowered. 1252 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1253 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1254 {S32, GlobalPtr, S16, 2 * 8}, 1255 {S32, LocalPtr, S8, 8}, 1256 {S32, LocalPtr, S16, 16}, 1257 {S32, PrivatePtr, S8, 8}, 1258 {S32, PrivatePtr, S16, 16}, 1259 {S32, ConstantPtr, S8, 8}, 1260 {S32, ConstantPtr, S16, 2 * 8}}) 1261 .legalIf( 1262 [=](const LegalityQuery &Query) -> bool { 1263 return isLoadStoreLegal(ST, Query); 1264 }); 1265 1266 if (ST.hasFlatAddressSpace()) { 1267 ExtLoads.legalForTypesWithMemDesc( 1268 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1269 } 1270 1271 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1272 // 64-bits. 
1273 // 1274 // TODO: Should generalize bitcast action into coerce, which will also cover 1275 // inserting addrspacecasts. 1276 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1277 1278 ExtLoads.clampScalar(0, S32, S32) 1279 .widenScalarToNextPow2(0) 1280 .lower(); 1281 1282 auto &Atomics = getActionDefinitionsBuilder( 1283 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1284 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1285 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1286 G_ATOMICRMW_UMIN}) 1287 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1288 {S64, GlobalPtr}, {S64, LocalPtr}, 1289 {S32, RegionPtr}, {S64, RegionPtr}}); 1290 if (ST.hasFlatAddressSpace()) { 1291 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1292 } 1293 1294 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1295 if (ST.hasLDSFPAtomicAdd()) { 1296 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1297 if (ST.hasGFX90AInsts()) 1298 Atomic.legalFor({{S64, LocalPtr}}); 1299 if (ST.hasGFX940Insts()) 1300 Atomic.legalFor({{V2S16, LocalPtr}}); 1301 } 1302 if (ST.hasAtomicFaddInsts()) 1303 Atomic.legalFor({{S32, GlobalPtr}}); 1304 1305 if (ST.hasGFX90AInsts()) { 1306 // These are legal with some caveats, and should have undergone expansion in 1307 // the IR in most situations 1308 // TODO: Move atomic expansion into legalizer 1309 // TODO: Also supports <2 x f16> 1310 Atomic.legalFor({ 1311 {S32, GlobalPtr}, 1312 {S64, GlobalPtr}, 1313 {S64, FlatPtr} 1314 }); 1315 } 1316 1317 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1318 // demarshalling 1319 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1320 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1321 {S32, FlatPtr}, {S64, FlatPtr}}) 1322 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1323 {S32, RegionPtr}, {S64, RegionPtr}}); 1324 // TODO: Pointer types, any 32-bit or 64-bit vector 1325 1326 // Condition should be s32 for scalar, s1 for vector. 1327 getActionDefinitionsBuilder(G_SELECT) 1328 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1329 LocalPtr, FlatPtr, PrivatePtr, 1330 LLT::fixed_vector(2, LocalPtr), 1331 LLT::fixed_vector(2, PrivatePtr)}, 1332 {S1, S32}) 1333 .clampScalar(0, S16, S64) 1334 .scalarize(1) 1335 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1336 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1337 .clampMaxNumElements(0, S32, 2) 1338 .clampMaxNumElements(0, LocalPtr, 2) 1339 .clampMaxNumElements(0, PrivatePtr, 2) 1340 .scalarize(0) 1341 .widenScalarToNextPow2(0) 1342 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1343 1344 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1345 // be more flexible with the shift amount type. 1346 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1347 .legalFor({{S32, S32}, {S64, S32}}); 1348 if (ST.has16BitInsts()) { 1349 if (ST.hasVOP3PInsts()) { 1350 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1351 .clampMaxNumElements(0, S16, 2); 1352 } else 1353 Shifts.legalFor({{S16, S16}}); 1354 1355 // TODO: Support 16-bit shift amounts for all types 1356 Shifts.widenScalarIf( 1357 [=](const LegalityQuery &Query) { 1358 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1359 // 32-bit amount. 
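          // For example, shifting an s16 value by an s8 amount widens the
          // amount to s16 rather than s32.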
1360 const LLT ValTy = Query.Types[0]; 1361 const LLT AmountTy = Query.Types[1]; 1362 return ValTy.getSizeInBits() <= 16 && 1363 AmountTy.getSizeInBits() < 16; 1364 }, changeTo(1, S16)); 1365 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1366 Shifts.clampScalar(1, S32, S32); 1367 Shifts.widenScalarToNextPow2(0, 16); 1368 Shifts.clampScalar(0, S16, S64); 1369 1370 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1371 .minScalar(0, S16) 1372 .scalarize(0) 1373 .lower(); 1374 } else { 1375 // Make sure we legalize the shift amount type first, as the general 1376 // expansion for the shifted type will produce much worse code if it hasn't 1377 // been truncated already. 1378 Shifts.clampScalar(1, S32, S32); 1379 Shifts.widenScalarToNextPow2(0, 32); 1380 Shifts.clampScalar(0, S32, S64); 1381 1382 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1383 .minScalar(0, S32) 1384 .scalarize(0) 1385 .lower(); 1386 } 1387 Shifts.scalarize(0); 1388 1389 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1390 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1391 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1392 unsigned IdxTypeIdx = 2; 1393 1394 getActionDefinitionsBuilder(Op) 1395 .customIf([=](const LegalityQuery &Query) { 1396 const LLT EltTy = Query.Types[EltTypeIdx]; 1397 const LLT VecTy = Query.Types[VecTypeIdx]; 1398 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1399 const unsigned EltSize = EltTy.getSizeInBits(); 1400 return (EltSize == 32 || EltSize == 64) && 1401 VecTy.getSizeInBits() % 32 == 0 && 1402 VecTy.getSizeInBits() <= MaxRegisterSize && 1403 IdxTy.getSizeInBits() == 32; 1404 }) 1405 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1406 bitcastToVectorElement32(VecTypeIdx)) 1407 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1408 .bitcastIf( 1409 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1410 [=](const LegalityQuery &Query) { 1411 // For > 64-bit element types, try to turn this into a 64-bit 1412 // element vector since we may be able to do better indexing 1413 // if this is scalar. If not, fall back to 32. 1414 const LLT EltTy = Query.Types[EltTypeIdx]; 1415 const LLT VecTy = Query.Types[VecTypeIdx]; 1416 const unsigned DstEltSize = EltTy.getSizeInBits(); 1417 const unsigned VecSize = VecTy.getSizeInBits(); 1418 1419 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1420 return std::make_pair( 1421 VecTypeIdx, 1422 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1423 }) 1424 .clampScalar(EltTypeIdx, S32, S64) 1425 .clampScalar(VecTypeIdx, S32, S64) 1426 .clampScalar(IdxTypeIdx, S32, S32) 1427 .clampMaxNumElements(VecTypeIdx, S32, 32) 1428 // TODO: Clamp elements for 64-bit vectors? 1429 // It should only be necessary with variable indexes. 1430 // As a last resort, lower to the stack 1431 .lower(); 1432 } 1433 1434 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1435 .unsupportedIf([=](const LegalityQuery &Query) { 1436 const LLT &EltTy = Query.Types[1].getElementType(); 1437 return Query.Types[0] != EltTy; 1438 }); 1439 1440 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1441 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1442 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1443 1444 // FIXME: Doesn't handle extract of illegal sizes. 1445 getActionDefinitionsBuilder(Op) 1446 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1447 .lowerIf([=](const LegalityQuery &Query) { 1448 // Sub-vector(or single element) insert and extract. 
1449 // TODO: verify immediate offset here since lower only works with 1450 // whole elements. 1451 const LLT BigTy = Query.Types[BigTyIdx]; 1452 return BigTy.isVector(); 1453 }) 1454 // FIXME: Multiples of 16 should not be legal. 1455 .legalIf([=](const LegalityQuery &Query) { 1456 const LLT BigTy = Query.Types[BigTyIdx]; 1457 const LLT LitTy = Query.Types[LitTyIdx]; 1458 return (BigTy.getSizeInBits() % 32 == 0) && 1459 (LitTy.getSizeInBits() % 16 == 0); 1460 }) 1461 .widenScalarIf( 1462 [=](const LegalityQuery &Query) { 1463 const LLT BigTy = Query.Types[BigTyIdx]; 1464 return (BigTy.getScalarSizeInBits() < 16); 1465 }, 1466 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1467 .widenScalarIf( 1468 [=](const LegalityQuery &Query) { 1469 const LLT LitTy = Query.Types[LitTyIdx]; 1470 return (LitTy.getScalarSizeInBits() < 16); 1471 }, 1472 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1473 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1474 .widenScalarToNextPow2(BigTyIdx, 32); 1475 1476 } 1477 1478 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1479 .legalForCartesianProduct(AllS32Vectors, {S32}) 1480 .legalForCartesianProduct(AllS64Vectors, {S64}) 1481 .clampNumElements(0, V16S32, V32S32) 1482 .clampNumElements(0, V2S64, V16S64) 1483 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1484 1485 if (ST.hasScalarPackInsts()) { 1486 BuildVector 1487 // FIXME: Should probably widen s1 vectors straight to s32 1488 .minScalarOrElt(0, S16) 1489 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1490 .minScalar(1, S32); 1491 1492 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1493 .legalFor({V2S16, S32}) 1494 .lower(); 1495 BuildVector.minScalarOrElt(0, S32); 1496 } else { 1497 BuildVector.customFor({V2S16, S16}); 1498 BuildVector.minScalarOrElt(0, S32); 1499 1500 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1501 .customFor({V2S16, S32}) 1502 .lower(); 1503 } 1504 1505 BuildVector.legalIf(isRegisterType(0)); 1506 1507 // FIXME: Clamp maximum size 1508 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1509 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1510 .clampMaxNumElements(0, S32, 32) 1511 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1512 .clampMaxNumElements(0, S16, 64); 1513 1514 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1515 // pre-legalize. 1516 if (ST.hasVOP3PInsts()) { 1517 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1518 .customFor({V2S16, V2S16}) 1519 .lower(); 1520 } else 1521 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1522 1523 // Merge/Unmerge 1524 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1525 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1526 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1527 1528 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1529 const LLT Ty = Query.Types[TypeIdx]; 1530 if (Ty.isVector()) { 1531 const LLT &EltTy = Ty.getElementType(); 1532 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1533 return true; 1534 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1535 return true; 1536 } 1537 return false; 1538 }; 1539 1540 auto &Builder = getActionDefinitionsBuilder(Op) 1541 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1542 .lowerFor({{S16, V2S16}}) 1543 .lowerIf([=](const LegalityQuery &Query) { 1544 const LLT BigTy = Query.Types[BigTyIdx]; 1545 return BigTy.getSizeInBits() == 32; 1546 }) 1547 // Try to widen to s16 first for small types. 
1548 // TODO: Only do this on targets with legal s16 shifts 1549 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1550 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1551 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1552 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1553 elementTypeIs(1, S16)), 1554 changeTo(1, V2S16)) 1555 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1556 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1557 // valid. 1558 .clampScalar(LitTyIdx, S32, S512) 1559 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1560 // Break up vectors with weird elements into scalars 1561 .fewerElementsIf( 1562 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1563 scalarize(0)) 1564 .fewerElementsIf( 1565 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1566 scalarize(1)) 1567 .clampScalar(BigTyIdx, S32, MaxScalar); 1568 1569 if (Op == G_MERGE_VALUES) { 1570 Builder.widenScalarIf( 1571 // TODO: Use 16-bit shifts if legal for 8-bit values? 1572 [=](const LegalityQuery &Query) { 1573 const LLT Ty = Query.Types[LitTyIdx]; 1574 return Ty.getSizeInBits() < 32; 1575 }, 1576 changeTo(LitTyIdx, S32)); 1577 } 1578 1579 Builder.widenScalarIf( 1580 [=](const LegalityQuery &Query) { 1581 const LLT Ty = Query.Types[BigTyIdx]; 1582 return !isPowerOf2_32(Ty.getSizeInBits()) && 1583 Ty.getSizeInBits() % 16 != 0; 1584 }, 1585 [=](const LegalityQuery &Query) { 1586 // Pick the next power of 2, or a multiple of 64 over 128. 1587 // Whichever is smaller. 1588 const LLT &Ty = Query.Types[BigTyIdx]; 1589 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1590 if (NewSizeInBits >= 256) { 1591 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1592 if (RoundedTo < NewSizeInBits) 1593 NewSizeInBits = RoundedTo; 1594 } 1595 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1596 }) 1597 // Any vectors left are the wrong size. Scalarize them. 1598 .scalarize(0) 1599 .scalarize(1); 1600 } 1601 1602 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1603 // RegBankSelect. 1604 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1605 .legalFor({{S32}, {S64}}); 1606 1607 if (ST.hasVOP3PInsts()) { 1608 SextInReg.lowerFor({{V2S16}}) 1609 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1610 // get more vector shift opportunities, since we'll get those when 1611 // expanded. 1612 .clampMaxNumElementsStrict(0, S16, 2); 1613 } else if (ST.has16BitInsts()) { 1614 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1615 } else { 1616 // Prefer to promote to s32 before lowering if we don't have 16-bit 1617 // shifts. This avoid a lot of intermediate truncate and extend operations. 1618 SextInReg.lowerFor({{S32}, {S64}}); 1619 } 1620 1621 SextInReg 1622 .scalarize(0) 1623 .clampScalar(0, S32, S64) 1624 .lower(); 1625 1626 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1627 .scalarize(0) 1628 .lower(); 1629 1630 // TODO: Only Try to form v2s16 with legal packed instructions. 
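  // For now only s32 G_FSHR is legal; G_FSHL and the packed v2s16 forms are
  // always lowered.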
1631 getActionDefinitionsBuilder(G_FSHR) 1632 .legalFor({{S32, S32}}) 1633 .lowerFor({{V2S16, V2S16}}) 1634 .clampMaxNumElementsStrict(0, S16, 2) 1635 .scalarize(0) 1636 .lower(); 1637 1638 if (ST.hasVOP3PInsts()) { 1639 getActionDefinitionsBuilder(G_FSHL) 1640 .lowerFor({{V2S16, V2S16}}) 1641 .clampMaxNumElementsStrict(0, S16, 2) 1642 .scalarize(0) 1643 .lower(); 1644 } else { 1645 getActionDefinitionsBuilder(G_FSHL) 1646 .scalarize(0) 1647 .lower(); 1648 } 1649 1650 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1651 .legalFor({S64}); 1652 1653 getActionDefinitionsBuilder(G_FENCE) 1654 .alwaysLegal(); 1655 1656 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1657 .scalarize(0) 1658 .minScalar(0, S32) 1659 .lower(); 1660 1661 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1662 .legalFor({{S32, S32}, {S64, S32}}) 1663 .clampScalar(1, S32, S32) 1664 .clampScalar(0, S32, S64) 1665 .widenScalarToNextPow2(0) 1666 .scalarize(0); 1667 1668 getActionDefinitionsBuilder({ 1669 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1670 G_FCOPYSIGN, 1671 1672 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1673 G_ATOMICRMW_NAND, 1674 G_ATOMICRMW_FSUB, 1675 G_READ_REGISTER, 1676 G_WRITE_REGISTER, 1677 1678 G_SADDO, G_SSUBO, 1679 1680 // TODO: Implement 1681 G_FMINIMUM, G_FMAXIMUM}).lower(); 1682 1683 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1684 .lower(); 1685 1686 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1687 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1688 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1689 .unsupported(); 1690 1691 getLegacyLegalizerInfo().computeTables(); 1692 verify(*ST.getInstrInfo()); 1693 } 1694 1695 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1696 MachineInstr &MI) const { 1697 MachineIRBuilder &B = Helper.MIRBuilder; 1698 MachineRegisterInfo &MRI = *B.getMRI(); 1699 1700 switch (MI.getOpcode()) { 1701 case TargetOpcode::G_ADDRSPACE_CAST: 1702 return legalizeAddrSpaceCast(MI, MRI, B); 1703 case TargetOpcode::G_FRINT: 1704 return legalizeFrint(MI, MRI, B); 1705 case TargetOpcode::G_FCEIL: 1706 return legalizeFceil(MI, MRI, B); 1707 case TargetOpcode::G_FREM: 1708 return legalizeFrem(MI, MRI, B); 1709 case TargetOpcode::G_INTRINSIC_TRUNC: 1710 return legalizeIntrinsicTrunc(MI, MRI, B); 1711 case TargetOpcode::G_SITOFP: 1712 return legalizeITOFP(MI, MRI, B, true); 1713 case TargetOpcode::G_UITOFP: 1714 return legalizeITOFP(MI, MRI, B, false); 1715 case TargetOpcode::G_FPTOSI: 1716 return legalizeFPTOI(MI, MRI, B, true); 1717 case TargetOpcode::G_FPTOUI: 1718 return legalizeFPTOI(MI, MRI, B, false); 1719 case TargetOpcode::G_FMINNUM: 1720 case TargetOpcode::G_FMAXNUM: 1721 case TargetOpcode::G_FMINNUM_IEEE: 1722 case TargetOpcode::G_FMAXNUM_IEEE: 1723 return legalizeMinNumMaxNum(Helper, MI); 1724 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1725 return legalizeExtractVectorElt(MI, MRI, B); 1726 case TargetOpcode::G_INSERT_VECTOR_ELT: 1727 return legalizeInsertVectorElt(MI, MRI, B); 1728 case TargetOpcode::G_SHUFFLE_VECTOR: 1729 return legalizeShuffleVector(MI, MRI, B); 1730 case TargetOpcode::G_FSIN: 1731 case TargetOpcode::G_FCOS: 1732 return legalizeSinCos(MI, MRI, B); 1733 case TargetOpcode::G_GLOBAL_VALUE: 1734 return legalizeGlobalValue(MI, MRI, B); 1735 case TargetOpcode::G_LOAD: 1736 case TargetOpcode::G_SEXTLOAD: 1737 case TargetOpcode::G_ZEXTLOAD: 1738 return legalizeLoad(Helper, MI); 1739 case TargetOpcode::G_FMAD: 1740 return legalizeFMad(MI, MRI, B); 1741 case TargetOpcode::G_FDIV: 1742 return legalizeFDIV(MI, 
MRI, B); 1743 case TargetOpcode::G_UDIV: 1744 case TargetOpcode::G_UREM: 1745 case TargetOpcode::G_UDIVREM: 1746 return legalizeUnsignedDIV_REM(MI, MRI, B); 1747 case TargetOpcode::G_SDIV: 1748 case TargetOpcode::G_SREM: 1749 case TargetOpcode::G_SDIVREM: 1750 return legalizeSignedDIV_REM(MI, MRI, B); 1751 case TargetOpcode::G_ATOMIC_CMPXCHG: 1752 return legalizeAtomicCmpXChg(MI, MRI, B); 1753 case TargetOpcode::G_FLOG: 1754 return legalizeFlog(MI, B, numbers::ln2f); 1755 case TargetOpcode::G_FLOG10: 1756 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1757 case TargetOpcode::G_FEXP: 1758 return legalizeFExp(MI, B); 1759 case TargetOpcode::G_FPOW: 1760 return legalizeFPow(MI, B); 1761 case TargetOpcode::G_FFLOOR: 1762 return legalizeFFloor(MI, MRI, B); 1763 case TargetOpcode::G_BUILD_VECTOR: 1764 return legalizeBuildVector(MI, MRI, B); 1765 case TargetOpcode::G_CTLZ: 1766 case TargetOpcode::G_CTTZ: 1767 return legalizeCTLZ_CTTZ(MI, MRI, B); 1768 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1769 return legalizeFPTruncRound(MI, B); 1770 default: 1771 return false; 1772 } 1773 1774 llvm_unreachable("expected switch to return"); 1775 } 1776 1777 Register AMDGPULegalizerInfo::getSegmentAperture( 1778 unsigned AS, 1779 MachineRegisterInfo &MRI, 1780 MachineIRBuilder &B) const { 1781 MachineFunction &MF = B.getMF(); 1782 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1783 const LLT S32 = LLT::scalar(32); 1784 1785 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1786 1787 if (ST.hasApertureRegs()) { 1788 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1789 // getreg. 1790 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1791 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1792 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1793 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1794 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1795 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1796 unsigned Encoding = 1797 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1798 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1799 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1800 1801 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1802 1803 B.buildInstr(AMDGPU::S_GETREG_B32) 1804 .addDef(GetReg) 1805 .addImm(Encoding); 1806 MRI.setType(GetReg, S32); 1807 1808 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1809 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1810 } 1811 1812 // TODO: can we be smarter about machine pointer info? 1813 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1814 Register LoadAddr = MRI.createGenericVirtualRegister( 1815 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1816 // For code object version 5, private_base and shared_base are passed through 1817 // implicit kernargs. 1818 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1819 AMDGPUTargetLowering::ImplicitParameter Param = 1820 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1821 : AMDGPUTargetLowering::PRIVATE_BASE; 1822 uint64_t Offset = 1823 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1824 1825 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1826 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1827 1828 if (!loadInputValue(KernargPtrReg, B, 1829 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1830 return Register(); 1831 1832 MachineMemOperand *MMO = MF.getMachineMemOperand( 1833 PtrInfo, 1834 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1835 MachineMemOperand::MOInvariant, 1836 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1837 1838 // Pointer address 1839 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1840 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1841 // Load address 1842 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1843 } 1844 1845 Register QueuePtr = MRI.createGenericVirtualRegister( 1846 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1847 1848 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1849 return Register(); 1850 1851 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1852 // private_segment_aperture_base_hi. 1853 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1854 1855 MachineMemOperand *MMO = MF.getMachineMemOperand( 1856 PtrInfo, 1857 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1858 MachineMemOperand::MOInvariant, 1859 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1860 1861 B.buildPtrAdd(LoadAddr, QueuePtr, 1862 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1863 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1864 } 1865 1866 /// Return true if the value is a known valid address, such that a null check is 1867 /// not necessary. 1868 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1869 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1870 MachineInstr *Def = MRI.getVRegDef(Val); 1871 switch (Def->getOpcode()) { 1872 case AMDGPU::G_FRAME_INDEX: 1873 case AMDGPU::G_GLOBAL_VALUE: 1874 case AMDGPU::G_BLOCK_ADDR: 1875 return true; 1876 case AMDGPU::G_CONSTANT: { 1877 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1878 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1879 } 1880 default: 1881 return false; 1882 } 1883 1884 return false; 1885 } 1886 1887 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1888 MachineInstr &MI, MachineRegisterInfo &MRI, 1889 MachineIRBuilder &B) const { 1890 MachineFunction &MF = B.getMF(); 1891 1892 const LLT S32 = LLT::scalar(32); 1893 Register Dst = MI.getOperand(0).getReg(); 1894 Register Src = MI.getOperand(1).getReg(); 1895 1896 LLT DstTy = MRI.getType(Dst); 1897 LLT SrcTy = MRI.getType(Src); 1898 unsigned DestAS = DstTy.getAddressSpace(); 1899 unsigned SrcAS = SrcTy.getAddressSpace(); 1900 1901 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1902 // vector element. 1903 assert(!DstTy.isVector()); 1904 1905 const AMDGPUTargetMachine &TM 1906 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1907 1908 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1909 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1910 return true; 1911 } 1912 1913 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1914 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1915 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1916 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1917 // Extract low 32-bits of the pointer. 
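    // A flat pointer that is known non-null converts to a segment address by
    // simply taking its low 32 bits; the general path below additionally
    // selects the segment null value when the source equals the flat null
    // value, roughly: Dst = (Src != FlatNull) ? lo32(Src) : SegmentNull.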
1918 B.buildExtract(Dst, Src, 0); 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 unsigned NullVal = TM.getNullPointerValue(DestAS); 1924 1925 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1926 auto FlatNull = B.buildConstant(SrcTy, 0); 1927 1928 // Extract low 32-bits of the pointer. 1929 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1930 1931 auto CmpRes = 1932 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1933 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1934 1935 MI.eraseFromParent(); 1936 return true; 1937 } 1938 1939 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1940 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1941 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1942 if (!ST.hasFlatAddressSpace()) 1943 return false; 1944 1945 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1946 if (!ApertureReg.isValid()) 1947 return false; 1948 1949 // Coerce the type of the low half of the result so we can use merge_values. 1950 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1951 1952 // TODO: Should we allow mismatched types but matching sizes in merges to 1953 // avoid the ptrtoint? 1954 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1955 1956 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1957 B.buildCopy(Dst, BuildPtr); 1958 MI.eraseFromParent(); 1959 return true; 1960 } 1961 1962 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1963 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1964 1965 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1966 SegmentNull.getReg(0)); 1967 1968 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1969 1970 MI.eraseFromParent(); 1971 return true; 1972 } 1973 1974 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1975 SrcTy.getSizeInBits() == 64) { 1976 // Truncate. 1977 B.buildExtract(Dst, Src, 0); 1978 MI.eraseFromParent(); 1979 return true; 1980 } 1981 1982 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1983 DstTy.getSizeInBits() == 64) { 1984 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1985 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1986 1987 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1988 // another. Merge operands are required to be the same type, but creating an 1989 // extra ptrtoint would be kind of pointless. 1990 auto HighAddr = B.buildConstant( 1991 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1992 B.buildMerge(Dst, {Src, HighAddr}); 1993 MI.eraseFromParent(); 1994 return true; 1995 } 1996 1997 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1998 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 1999 2000 LLVMContext &Ctx = MF.getFunction().getContext(); 2001 Ctx.diagnose(InvalidAddrSpaceCast); 2002 B.buildUndef(Dst); 2003 MI.eraseFromParent(); 2004 return true; 2005 } 2006 2007 bool AMDGPULegalizerInfo::legalizeFrint( 2008 MachineInstr &MI, MachineRegisterInfo &MRI, 2009 MachineIRBuilder &B) const { 2010 Register Src = MI.getOperand(1).getReg(); 2011 LLT Ty = MRI.getType(Src); 2012 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2013 2014 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2015 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2016 2017 auto C1 = B.buildFConstant(Ty, C1Val); 2018 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2019 2020 // TODO: Should this propagate fast-math-flags? 
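  // The add/sub of copysign(2^52, src) forces the fractional bits to be
  // rounded away under the current rounding mode; e.g. 3.7 + 2^52 - 2^52
  // == 4.0 with round-to-nearest-even. Inputs with
  // |src| > 0x1.fffffffffffffp+51 have no fractional bits to begin with, so
  // the final select returns them unchanged.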
2021 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2022 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2023 2024 auto C2 = B.buildFConstant(Ty, C2Val); 2025 auto Fabs = B.buildFAbs(Ty, Src); 2026 2027 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2028 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2029 MI.eraseFromParent(); 2030 return true; 2031 } 2032 2033 bool AMDGPULegalizerInfo::legalizeFceil( 2034 MachineInstr &MI, MachineRegisterInfo &MRI, 2035 MachineIRBuilder &B) const { 2036 2037 const LLT S1 = LLT::scalar(1); 2038 const LLT S64 = LLT::scalar(64); 2039 2040 Register Src = MI.getOperand(1).getReg(); 2041 assert(MRI.getType(Src) == S64); 2042 2043 // result = trunc(src) 2044 // if (src > 0.0 && src != result) 2045 // result += 1.0 2046 2047 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2048 2049 const auto Zero = B.buildFConstant(S64, 0.0); 2050 const auto One = B.buildFConstant(S64, 1.0); 2051 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2052 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2053 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2054 auto Add = B.buildSelect(S64, And, One, Zero); 2055 2056 // TODO: Should this propagate fast-math-flags? 2057 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2058 MI.eraseFromParent(); 2059 return true; 2060 } 2061 2062 bool AMDGPULegalizerInfo::legalizeFrem( 2063 MachineInstr &MI, MachineRegisterInfo &MRI, 2064 MachineIRBuilder &B) const { 2065 Register DstReg = MI.getOperand(0).getReg(); 2066 Register Src0Reg = MI.getOperand(1).getReg(); 2067 Register Src1Reg = MI.getOperand(2).getReg(); 2068 auto Flags = MI.getFlags(); 2069 LLT Ty = MRI.getType(DstReg); 2070 2071 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2072 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2073 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2074 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2075 MI.eraseFromParent(); 2076 return true; 2077 } 2078 2079 static MachineInstrBuilder extractF64Exponent(Register Hi, 2080 MachineIRBuilder &B) { 2081 const unsigned FractBits = 52; 2082 const unsigned ExpBits = 11; 2083 LLT S32 = LLT::scalar(32); 2084 2085 auto Const0 = B.buildConstant(S32, FractBits - 32); 2086 auto Const1 = B.buildConstant(S32, ExpBits); 2087 2088 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2089 .addUse(Hi) 2090 .addUse(Const0.getReg(0)) 2091 .addUse(Const1.getReg(0)); 2092 2093 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2094 } 2095 2096 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2097 MachineInstr &MI, MachineRegisterInfo &MRI, 2098 MachineIRBuilder &B) const { 2099 const LLT S1 = LLT::scalar(1); 2100 const LLT S32 = LLT::scalar(32); 2101 const LLT S64 = LLT::scalar(64); 2102 2103 Register Src = MI.getOperand(1).getReg(); 2104 assert(MRI.getType(Src) == S64); 2105 2106 // TODO: Should this use extract since the low half is unused? 2107 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2108 Register Hi = Unmerge.getReg(1); 2109 2110 // Extract the upper half, since this is where we will find the sign and 2111 // exponent. 2112 auto Exp = extractF64Exponent(Hi, B); 2113 2114 const unsigned FractBits = 52; 2115 2116 // Extract the sign bit. 2117 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2118 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2119 2120 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2121 2122 const auto Zero32 = B.buildConstant(S32, 0); 2123 2124 // Extend back to 64-bits. 
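  // The mask computed below clears the (52 - exp) fractional mantissa bits
  // for 0 <= exp <= 51. The final selects handle the out-of-range cases:
  // exp < 0 means |src| < 1, so the result is just the sign bit (a signed
  // zero), and exp > 51 means src is already an integer.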
2125 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2126 2127 auto Shr = B.buildAShr(S64, FractMask, Exp); 2128 auto Not = B.buildNot(S64, Shr); 2129 auto Tmp0 = B.buildAnd(S64, Src, Not); 2130 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2131 2132 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2133 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2134 2135 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2136 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 bool AMDGPULegalizerInfo::legalizeITOFP( 2142 MachineInstr &MI, MachineRegisterInfo &MRI, 2143 MachineIRBuilder &B, bool Signed) const { 2144 2145 Register Dst = MI.getOperand(0).getReg(); 2146 Register Src = MI.getOperand(1).getReg(); 2147 2148 const LLT S64 = LLT::scalar(64); 2149 const LLT S32 = LLT::scalar(32); 2150 2151 assert(MRI.getType(Src) == S64); 2152 2153 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2154 auto ThirtyTwo = B.buildConstant(S32, 32); 2155 2156 if (MRI.getType(Dst) == S64) { 2157 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2158 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2159 2160 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2161 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2162 .addUse(CvtHi.getReg(0)) 2163 .addUse(ThirtyTwo.getReg(0)); 2164 2165 // TODO: Should this propagate fast-math-flags? 2166 B.buildFAdd(Dst, LdExp, CvtLo); 2167 MI.eraseFromParent(); 2168 return true; 2169 } 2170 2171 assert(MRI.getType(Dst) == S32); 2172 2173 auto One = B.buildConstant(S32, 1); 2174 2175 MachineInstrBuilder ShAmt; 2176 if (Signed) { 2177 auto ThirtyOne = B.buildConstant(S32, 31); 2178 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2179 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2180 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2181 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2182 /*HasSideEffects=*/false) 2183 .addUse(Unmerge.getReg(1)); 2184 auto LS2 = B.buildSub(S32, LS, One); 2185 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2186 } else 2187 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2188 auto Norm = B.buildShl(S64, Src, ShAmt); 2189 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2190 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2191 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2192 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2193 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2194 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2195 /*HasSideEffects=*/false) 2196 .addUse(FVal.getReg(0)) 2197 .addUse(Scale.getReg(0)); 2198 MI.eraseFromParent(); 2199 return true; 2200 } 2201 2202 // TODO: Copied from DAG implementation. Verify logic and document how this 2203 // actually works. 
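// Rough worked example of the split performed below (illustrative values,
// not generated code): for an f64 input val = 2^40 + 3.0,
//   tf  = trunc(val)        = 2^40 + 3
//   hif = floor(tf * 2^-32) = 256
//   lof = tf - hif * 2^32   = 3
// so the result is assembled from {lo = 3, hi = 256}.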
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` when val
    // is negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.
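  // Only constant indices are handled here: the extract becomes an unmerge of
  // the whole vector plus a copy of the selected element, roughly
  //   %e0, %e1, ..., %eN = G_UNMERGE_VALUES %vec
  //   %dst = COPY %e<idx>
  // Out-of-range constant indices fold to undef, and dynamic indices are left
  // for selection to register indexing.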
2294 2295 // TODO: Promote dynamic indexing of s16 to s32 2296 2297 // FIXME: Artifact combiner probably should have replaced the truncated 2298 // constant before this, so we shouldn't need 2299 // getIConstantVRegValWithLookThrough. 2300 Optional<ValueAndVReg> MaybeIdxVal = 2301 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2302 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2303 return true; 2304 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2305 2306 Register Dst = MI.getOperand(0).getReg(); 2307 Register Vec = MI.getOperand(1).getReg(); 2308 2309 LLT VecTy = MRI.getType(Vec); 2310 LLT EltTy = VecTy.getElementType(); 2311 assert(EltTy == MRI.getType(Dst)); 2312 2313 if (IdxVal < VecTy.getNumElements()) { 2314 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2315 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2316 } else { 2317 B.buildUndef(Dst); 2318 } 2319 2320 MI.eraseFromParent(); 2321 return true; 2322 } 2323 2324 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2325 MachineInstr &MI, MachineRegisterInfo &MRI, 2326 MachineIRBuilder &B) const { 2327 // TODO: Should move some of this into LegalizerHelper. 2328 2329 // TODO: Promote dynamic indexing of s16 to s32 2330 2331 // FIXME: Artifact combiner probably should have replaced the truncated 2332 // constant before this, so we shouldn't need 2333 // getIConstantVRegValWithLookThrough. 2334 Optional<ValueAndVReg> MaybeIdxVal = 2335 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2336 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2337 return true; 2338 2339 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2340 Register Dst = MI.getOperand(0).getReg(); 2341 Register Vec = MI.getOperand(1).getReg(); 2342 Register Ins = MI.getOperand(2).getReg(); 2343 2344 LLT VecTy = MRI.getType(Vec); 2345 LLT EltTy = VecTy.getElementType(); 2346 assert(EltTy == MRI.getType(Ins)); 2347 (void)Ins; 2348 2349 unsigned NumElts = VecTy.getNumElements(); 2350 if (IdxVal < NumElts) { 2351 SmallVector<Register, 8> SrcRegs; 2352 for (unsigned i = 0; i < NumElts; ++i) 2353 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2354 B.buildUnmerge(SrcRegs, Vec); 2355 2356 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2357 B.buildMerge(Dst, SrcRegs); 2358 } else { 2359 B.buildUndef(Dst); 2360 } 2361 2362 MI.eraseFromParent(); 2363 return true; 2364 } 2365 2366 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2367 MachineInstr &MI, MachineRegisterInfo &MRI, 2368 MachineIRBuilder &B) const { 2369 const LLT V2S16 = LLT::fixed_vector(2, 16); 2370 2371 Register Dst = MI.getOperand(0).getReg(); 2372 Register Src0 = MI.getOperand(1).getReg(); 2373 LLT DstTy = MRI.getType(Dst); 2374 LLT SrcTy = MRI.getType(Src0); 2375 2376 if (SrcTy == V2S16 && DstTy == V2S16 && 2377 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2378 return true; 2379 2380 MachineIRBuilder HelperBuilder(MI); 2381 GISelObserverWrapper DummyObserver; 2382 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2383 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2384 } 2385 2386 bool AMDGPULegalizerInfo::legalizeSinCos( 2387 MachineInstr &MI, MachineRegisterInfo &MRI, 2388 MachineIRBuilder &B) const { 2389 2390 Register DstReg = MI.getOperand(0).getReg(); 2391 Register SrcReg = MI.getOperand(1).getReg(); 2392 LLT Ty = MRI.getType(DstReg); 2393 unsigned Flags = MI.getFlags(); 2394 2395 Register TrigVal; 2396 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
                                       numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ?
    DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be diagnosed as an error during assembly emission
    // anyway.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
      // zero-sized type in other languages, to declare dynamic shared memory
      // whose size is not known at compile time. Such globals are allocated by
      // the runtime and placed directly after the statically allocated ones.
      // They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
2523 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2524 LLT S32 = LLT::scalar(32); 2525 auto Sz = 2526 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2527 B.buildIntToPtr(DstReg, Sz); 2528 MI.eraseFromParent(); 2529 return true; 2530 } 2531 } 2532 2533 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2534 *cast<GlobalVariable>(GV))); 2535 MI.eraseFromParent(); 2536 return true; 2537 } 2538 2539 const SITargetLowering *TLI = ST.getTargetLowering(); 2540 2541 if (TLI->shouldEmitFixup(GV)) { 2542 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 2547 if (TLI->shouldEmitPCReloc(GV)) { 2548 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2549 MI.eraseFromParent(); 2550 return true; 2551 } 2552 2553 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2554 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2555 2556 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2557 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2558 MachinePointerInfo::getGOT(MF), 2559 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2560 MachineMemOperand::MOInvariant, 2561 LoadTy, Align(8)); 2562 2563 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2564 2565 if (Ty.getSizeInBits() == 32) { 2566 // Truncate if this is a 32-bit constant address. 2567 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2568 B.buildExtract(DstReg, Load, 0); 2569 } else 2570 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2571 2572 MI.eraseFromParent(); 2573 return true; 2574 } 2575 2576 static LLT widenToNextPowerOf2(LLT Ty) { 2577 if (Ty.isVector()) 2578 return Ty.changeElementCount( 2579 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2580 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2581 } 2582 2583 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2584 MachineInstr &MI) const { 2585 MachineIRBuilder &B = Helper.MIRBuilder; 2586 MachineRegisterInfo &MRI = *B.getMRI(); 2587 GISelChangeObserver &Observer = Helper.Observer; 2588 2589 Register PtrReg = MI.getOperand(1).getReg(); 2590 LLT PtrTy = MRI.getType(PtrReg); 2591 unsigned AddrSpace = PtrTy.getAddressSpace(); 2592 2593 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2594 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2595 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2596 Observer.changingInstr(MI); 2597 MI.getOperand(1).setReg(Cast.getReg(0)); 2598 Observer.changedInstr(MI); 2599 return true; 2600 } 2601 2602 if (MI.getOpcode() != AMDGPU::G_LOAD) 2603 return false; 2604 2605 Register ValReg = MI.getOperand(0).getReg(); 2606 LLT ValTy = MRI.getType(ValReg); 2607 2608 MachineMemOperand *MMO = *MI.memoperands_begin(); 2609 const unsigned ValSize = ValTy.getSizeInBits(); 2610 const LLT MemTy = MMO->getMemoryType(); 2611 const Align MemAlign = MMO->getAlign(); 2612 const unsigned MemSize = MemTy.getSizeInBits(); 2613 const uint64_t AlignInBits = 8 * MemAlign.value(); 2614 2615 // Widen non-power-of-2 loads to the alignment if needed 2616 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2617 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2618 2619 // This was already the correct extending load result type, so just adjust 2620 // the memory type. 
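    // (This is the first of three cases below: a same-size result where only
    // the MMO is widened, a scalar result that is loaded wide and truncated,
    // and a vector result that is loaded wide and has its extra elements
    // dropped.)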
2621 if (WideMemSize == ValSize) { 2622 MachineFunction &MF = B.getMF(); 2623 2624 MachineMemOperand *WideMMO = 2625 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2626 Observer.changingInstr(MI); 2627 MI.setMemRefs(MF, {WideMMO}); 2628 Observer.changedInstr(MI); 2629 return true; 2630 } 2631 2632 // Don't bother handling edge case that should probably never be produced. 2633 if (ValSize > WideMemSize) 2634 return false; 2635 2636 LLT WideTy = widenToNextPowerOf2(ValTy); 2637 2638 Register WideLoad; 2639 if (!WideTy.isVector()) { 2640 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2641 B.buildTrunc(ValReg, WideLoad).getReg(0); 2642 } else { 2643 // Extract the subvector. 2644 2645 if (isRegisterType(ValTy)) { 2646 // If this a case where G_EXTRACT is legal, use it. 2647 // (e.g. <3 x s32> -> <4 x s32>) 2648 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2649 B.buildExtract(ValReg, WideLoad, 0); 2650 } else { 2651 // For cases where the widened type isn't a nice register value, unmerge 2652 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2653 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2654 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2655 } 2656 } 2657 2658 MI.eraseFromParent(); 2659 return true; 2660 } 2661 2662 return false; 2663 } 2664 2665 bool AMDGPULegalizerInfo::legalizeFMad( 2666 MachineInstr &MI, MachineRegisterInfo &MRI, 2667 MachineIRBuilder &B) const { 2668 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2669 assert(Ty.isScalar()); 2670 2671 MachineFunction &MF = B.getMF(); 2672 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2673 2674 // TODO: Always legal with future ftz flag. 2675 // FIXME: Do we need just output? 2676 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2677 return true; 2678 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2679 return true; 2680 2681 MachineIRBuilder HelperBuilder(MI); 2682 GISelObserverWrapper DummyObserver; 2683 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2684 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2685 } 2686 2687 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2688 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2689 Register DstReg = MI.getOperand(0).getReg(); 2690 Register PtrReg = MI.getOperand(1).getReg(); 2691 Register CmpVal = MI.getOperand(2).getReg(); 2692 Register NewVal = MI.getOperand(3).getReg(); 2693 2694 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2695 "this should not have been custom lowered"); 2696 2697 LLT ValTy = MRI.getType(CmpVal); 2698 LLT VecTy = LLT::fixed_vector(2, ValTy); 2699 2700 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2701 2702 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2703 .addDef(DstReg) 2704 .addUse(PtrReg) 2705 .addUse(PackedVal) 2706 .setMemRefs(MI.memoperands()); 2707 2708 MI.eraseFromParent(); 2709 return true; 2710 } 2711 2712 bool AMDGPULegalizerInfo::legalizeFlog( 2713 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2714 Register Dst = MI.getOperand(0).getReg(); 2715 Register Src = MI.getOperand(1).getReg(); 2716 LLT Ty = B.getMRI()->getType(Dst); 2717 unsigned Flags = MI.getFlags(); 2718 2719 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2720 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2721 2722 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2723 
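  // The multiply above computes log_b(x) as log2(x) * (1 / log2(b)); callers
  // pass ln(2) for G_FLOG and ln(2)/ln(10) (i.e. log10(2)) for G_FLOG10 as
  // Log2BaseInverted.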
MI.eraseFromParent(); 2724 return true; 2725 } 2726 2727 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2728 MachineIRBuilder &B) const { 2729 Register Dst = MI.getOperand(0).getReg(); 2730 Register Src = MI.getOperand(1).getReg(); 2731 unsigned Flags = MI.getFlags(); 2732 LLT Ty = B.getMRI()->getType(Dst); 2733 2734 auto K = B.buildFConstant(Ty, numbers::log2e); 2735 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2736 B.buildFExp2(Dst, Mul, Flags); 2737 MI.eraseFromParent(); 2738 return true; 2739 } 2740 2741 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2742 MachineIRBuilder &B) const { 2743 Register Dst = MI.getOperand(0).getReg(); 2744 Register Src0 = MI.getOperand(1).getReg(); 2745 Register Src1 = MI.getOperand(2).getReg(); 2746 unsigned Flags = MI.getFlags(); 2747 LLT Ty = B.getMRI()->getType(Dst); 2748 const LLT S16 = LLT::scalar(16); 2749 const LLT S32 = LLT::scalar(32); 2750 2751 if (Ty == S32) { 2752 auto Log = B.buildFLog2(S32, Src0, Flags); 2753 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2754 .addUse(Log.getReg(0)) 2755 .addUse(Src1) 2756 .setMIFlags(Flags); 2757 B.buildFExp2(Dst, Mul, Flags); 2758 } else if (Ty == S16) { 2759 // There's no f16 fmul_legacy, so we need to convert for it. 2760 auto Log = B.buildFLog2(S16, Src0, Flags); 2761 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2762 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2763 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2764 .addUse(Ext0.getReg(0)) 2765 .addUse(Ext1.getReg(0)) 2766 .setMIFlags(Flags); 2767 2768 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2769 } else 2770 return false; 2771 2772 MI.eraseFromParent(); 2773 return true; 2774 } 2775 2776 // Find a source register, ignoring any possible source modifiers. 2777 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2778 Register ModSrc = OrigSrc; 2779 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2780 ModSrc = SrcFNeg->getOperand(1).getReg(); 2781 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2782 ModSrc = SrcFAbs->getOperand(1).getReg(); 2783 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2784 ModSrc = SrcFAbs->getOperand(1).getReg(); 2785 return ModSrc; 2786 } 2787 2788 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2789 MachineRegisterInfo &MRI, 2790 MachineIRBuilder &B) const { 2791 2792 const LLT S1 = LLT::scalar(1); 2793 const LLT S64 = LLT::scalar(64); 2794 Register Dst = MI.getOperand(0).getReg(); 2795 Register OrigSrc = MI.getOperand(1).getReg(); 2796 unsigned Flags = MI.getFlags(); 2797 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2798 "this should not have been custom lowered"); 2799 2800 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2801 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2802 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2803 // V_FRACT bug is: 2804 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2805 // 2806 // Convert floor(x) to (x - fract(x)) 2807 2808 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2809 .addUse(OrigSrc) 2810 .setMIFlags(Flags); 2811 2812 // Give source modifier matching some assistance before obscuring a foldable 2813 // pattern. 2814 2815 // TODO: We can avoid the neg on the fract? The input sign to fract 2816 // shouldn't matter? 
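  // The clamp constant built below, 0x3fefffffffffffff, is the largest f64
  // strictly less than 1.0 (1.0 - 2^-53), i.e. the 0.99999999999999999 from
  // the workaround formula above.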
2817 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2818 2819 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2820 2821 Register Min = MRI.createGenericVirtualRegister(S64); 2822 2823 // We don't need to concern ourselves with the snan handling difference, so 2824 // use the one which will directly select. 2825 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2826 if (MFI->getMode().IEEE) 2827 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2828 else 2829 B.buildFMinNum(Min, Fract, Const, Flags); 2830 2831 Register CorrectedFract = Min; 2832 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2833 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2834 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2835 } 2836 2837 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2838 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2839 2840 MI.eraseFromParent(); 2841 return true; 2842 } 2843 2844 // Turn an illegal packed v2s16 build vector into bit operations. 2845 // TODO: This should probably be a bitcast action in LegalizerHelper. 2846 bool AMDGPULegalizerInfo::legalizeBuildVector( 2847 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2848 Register Dst = MI.getOperand(0).getReg(); 2849 const LLT S32 = LLT::scalar(32); 2850 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2851 2852 Register Src0 = MI.getOperand(1).getReg(); 2853 Register Src1 = MI.getOperand(2).getReg(); 2854 assert(MRI.getType(Src0) == LLT::scalar(16)); 2855 2856 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2857 B.buildBitcast(Dst, Merge); 2858 2859 MI.eraseFromParent(); 2860 return true; 2861 } 2862 2863 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2864 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2865 // case with a single min instruction instead of a compare+select. 2866 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2867 MachineRegisterInfo &MRI, 2868 MachineIRBuilder &B) const { 2869 Register Dst = MI.getOperand(0).getReg(); 2870 Register Src = MI.getOperand(1).getReg(); 2871 LLT DstTy = MRI.getType(Dst); 2872 LLT SrcTy = MRI.getType(Src); 2873 2874 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2875 ? AMDGPU::G_AMDGPU_FFBH_U32 2876 : AMDGPU::G_AMDGPU_FFBL_B32; 2877 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2878 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2879 2880 MI.eraseFromParent(); 2881 return true; 2882 } 2883 2884 // Check that this is a G_XOR x, -1 2885 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2886 if (MI.getOpcode() != TargetOpcode::G_XOR) 2887 return false; 2888 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2889 return ConstVal && *ConstVal == -1; 2890 } 2891 2892 // Return the use branch instruction, otherwise null if the usage is invalid. 
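// The expected pattern is a single non-debug use of the intrinsic's condition
// that is a G_BRCOND in the same block (possibly through a `G_XOR x, -1`,
// reported via Negated), followed either by a G_BR giving UncondBrTarget or by
// a fall-through to the next block.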
2893 static MachineInstr * 2894 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2895 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2896 Register CondDef = MI.getOperand(0).getReg(); 2897 if (!MRI.hasOneNonDBGUse(CondDef)) 2898 return nullptr; 2899 2900 MachineBasicBlock *Parent = MI.getParent(); 2901 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2902 2903 if (isNot(MRI, *UseMI)) { 2904 Register NegatedCond = UseMI->getOperand(0).getReg(); 2905 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2906 return nullptr; 2907 2908 // We're deleting the def of this value, so we need to remove it. 2909 eraseInstr(*UseMI, MRI); 2910 2911 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2912 Negated = true; 2913 } 2914 2915 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2916 return nullptr; 2917 2918 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2919 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2920 if (Next == Parent->end()) { 2921 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2922 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2923 return nullptr; 2924 UncondBrTarget = &*NextMBB; 2925 } else { 2926 if (Next->getOpcode() != AMDGPU::G_BR) 2927 return nullptr; 2928 Br = &*Next; 2929 UncondBrTarget = Br->getOperand(0).getMBB(); 2930 } 2931 2932 return UseMI; 2933 } 2934 2935 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2936 const ArgDescriptor *Arg, 2937 const TargetRegisterClass *ArgRC, 2938 LLT ArgTy) const { 2939 MCRegister SrcReg = Arg->getRegister(); 2940 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2941 assert(DstReg.isVirtual() && "Virtual register expected"); 2942 2943 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2944 *ArgRC, B.getDebugLoc(), ArgTy); 2945 if (Arg->isMasked()) { 2946 // TODO: Should we try to emit this once in the entry block? 2947 const LLT S32 = LLT::scalar(32); 2948 const unsigned Mask = Arg->getMask(); 2949 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2950 2951 Register AndMaskSrc = LiveIn; 2952 2953 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2954 // 0. 2955 if (Shift != 0) { 2956 auto ShiftAmt = B.buildConstant(S32, Shift); 2957 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2958 } 2959 2960 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2961 } else { 2962 B.buildCopy(DstReg, LiveIn); 2963 } 2964 2965 return true; 2966 } 2967 2968 bool AMDGPULegalizerInfo::loadInputValue( 2969 Register DstReg, MachineIRBuilder &B, 2970 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2971 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2972 const ArgDescriptor *Arg; 2973 const TargetRegisterClass *ArgRC; 2974 LLT ArgTy; 2975 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2976 2977 if (!Arg) { 2978 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2979 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2980 // case the pointer argument may be missing and we use null. 2981 B.buildConstant(DstReg, 0); 2982 return true; 2983 } 2984 2985 // It's undefined behavior if a function marked with the amdgpu-no-* 2986 // attributes uses the corresponding intrinsic. 
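    // Returning undef in that undefined-behavior case lets legalization
    // proceed rather than fail.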
2987 B.buildUndef(DstReg); 2988 return true; 2989 } 2990 2991 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2992 return false; // TODO: Handle these 2993 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2994 } 2995 2996 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2997 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2998 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2999 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3000 return false; 3001 3002 MI.eraseFromParent(); 3003 return true; 3004 } 3005 3006 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3007 MachineRegisterInfo &MRI, 3008 MachineIRBuilder &B) const { 3009 Register Dst = MI.getOperand(0).getReg(); 3010 LLT DstTy = MRI.getType(Dst); 3011 LLT S16 = LLT::scalar(16); 3012 LLT S32 = LLT::scalar(32); 3013 LLT S64 = LLT::scalar(64); 3014 3015 if (DstTy == S16) 3016 return legalizeFDIV16(MI, MRI, B); 3017 if (DstTy == S32) 3018 return legalizeFDIV32(MI, MRI, B); 3019 if (DstTy == S64) 3020 return legalizeFDIV64(MI, MRI, B); 3021 3022 return false; 3023 } 3024 3025 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3026 Register DstDivReg, 3027 Register DstRemReg, 3028 Register X, 3029 Register Y) const { 3030 const LLT S1 = LLT::scalar(1); 3031 const LLT S32 = LLT::scalar(32); 3032 3033 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3034 // algorithm used here. 3035 3036 // Initial estimate of inv(y). 3037 auto FloatY = B.buildUITOFP(S32, Y); 3038 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3039 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3040 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3041 auto Z = B.buildFPTOUI(S32, ScaledY); 3042 3043 // One round of UNR. 3044 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3045 auto NegYZ = B.buildMul(S32, NegY, Z); 3046 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3047 3048 // Quotient/remainder estimate. 3049 auto Q = B.buildUMulH(S32, X, Z); 3050 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3051 3052 // First quotient/remainder refinement. 3053 auto One = B.buildConstant(S32, 1); 3054 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3055 if (DstDivReg) 3056 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3057 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3058 3059 // Second quotient/remainder refinement. 
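  // The two conditional corrections are sufficient because the quotient
  // estimate Q is at most 2 below the true quotient (see the reference
  // algorithm in AMDGPUCodeGenPrepare::expandDivRem32 mentioned above).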
3060 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3061 if (DstDivReg) 3062 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3063 3064 if (DstRemReg) 3065 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3066 } 3067 3068 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3069 // 3070 // Return lo, hi of result 3071 // 3072 // %cvt.lo = G_UITOFP Val.lo 3073 // %cvt.hi = G_UITOFP Val.hi 3074 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3075 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3076 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3077 // %mul2 = G_FMUL %mul1, 2**(-32) 3078 // %trunc = G_INTRINSIC_TRUNC %mul2 3079 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3080 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3081 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3082 Register Val) { 3083 const LLT S32 = LLT::scalar(32); 3084 auto Unmerge = B.buildUnmerge(S32, Val); 3085 3086 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3087 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3088 3089 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3090 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3091 3092 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3093 auto Mul1 = 3094 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3095 3096 // 2**(-32) 3097 auto Mul2 = 3098 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3099 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3100 3101 // -(2**32) 3102 auto Mad2 = B.buildFMAD(S32, Trunc, 3103 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3104 3105 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3106 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3107 3108 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3109 } 3110 3111 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3112 Register DstDivReg, 3113 Register DstRemReg, 3114 Register Numer, 3115 Register Denom) const { 3116 const LLT S32 = LLT::scalar(32); 3117 const LLT S64 = LLT::scalar(64); 3118 const LLT S1 = LLT::scalar(1); 3119 Register RcpLo, RcpHi; 3120 3121 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3122 3123 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3124 3125 auto Zero64 = B.buildConstant(S64, 0); 3126 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3127 3128 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3129 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3130 3131 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3132 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3133 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3134 3135 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3136 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3137 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3138 3139 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3140 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3141 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3142 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3143 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3144 3145 auto Zero32 = B.buildConstant(S32, 0); 3146 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3147 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3148 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3149 3150 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3151 Register NumerLo = UnmergeNumer.getReg(0); 3152 Register NumerHi = UnmergeNumer.getReg(1); 3153 3154 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3155 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3156 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3157 Register Mul3_Lo = UnmergeMul3.getReg(0); 3158 Register Mul3_Hi = UnmergeMul3.getReg(1); 3159 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3160 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3161 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3162 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3163 3164 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3165 Register DenomLo = UnmergeDenom.getReg(0); 3166 Register DenomHi = UnmergeDenom.getReg(1); 3167 3168 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3169 auto C1 = B.buildSExt(S32, CmpHi); 3170 3171 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3172 auto C2 = B.buildSExt(S32, CmpLo); 3173 3174 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3175 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3176 3177 // TODO: Here and below portions of the code can be enclosed into if/endif. 3178 // Currently control flow is unconditional and we have 4 selects after 3179 // potential endif to substitute PHIs. 3180 3181 // if C3 != 0 ... 3182 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3183 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3184 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3185 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3186 3187 auto One64 = B.buildConstant(S64, 1); 3188 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3189 3190 auto C4 = 3191 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3192 auto C5 = 3193 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3194 auto C6 = B.buildSelect( 3195 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3196 3197 // if (C6 != 0) 3198 auto Add4 = B.buildAdd(S64, Add3, One64); 3199 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3200 3201 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3202 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3203 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3204 3205 // endif C6 3206 // endif C3 3207 3208 if (DstDivReg) { 3209 auto Sel1 = B.buildSelect( 3210 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3211 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3212 Sel1, MulHi3); 3213 } 3214 3215 if (DstRemReg) { 3216 auto Sel2 = B.buildSelect( 3217 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3218 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3219 Sel2, Sub1); 3220 } 3221 } 3222 3223 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3224 MachineRegisterInfo &MRI, 3225 MachineIRBuilder &B) const { 3226 Register DstDivReg, DstRemReg; 3227 switch (MI.getOpcode()) { 3228 default: 3229 llvm_unreachable("Unexpected opcode!"); 3230 case AMDGPU::G_UDIV: { 3231 DstDivReg = MI.getOperand(0).getReg(); 3232 break; 3233 } 3234 case AMDGPU::G_UREM: { 3235 DstRemReg = MI.getOperand(0).getReg(); 3236 break; 3237 } 3238 case AMDGPU::G_UDIVREM: { 3239 DstDivReg = MI.getOperand(0).getReg(); 3240 DstRemReg = MI.getOperand(1).getReg(); 3241 break; 3242 } 3243 } 3244 3245 const LLT S64 = LLT::scalar(64); 3246 const LLT S32 = LLT::scalar(32); 3247 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3248 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3249 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3250 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3251 3252 if (Ty == S32) 3253 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3254 else if (Ty == S64) 3255 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3256 else 3257 return false; 3258 3259 MI.eraseFromParent(); 3260 return true; 3261 } 3262 3263 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3264 MachineRegisterInfo &MRI, 3265 MachineIRBuilder &B) const { 3266 const LLT S64 = LLT::scalar(64); 3267 const LLT S32 = LLT::scalar(32); 3268 3269 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3270 if (Ty != S32 && Ty != S64) 3271 return false; 3272 3273 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3274 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3275 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3276 3277 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3278 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3279 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3280 3281 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3282 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3283 3284 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3285 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3286 3287 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3288 switch (MI.getOpcode()) { 3289 default: 3290 llvm_unreachable("Unexpected opcode!"); 3291 case AMDGPU::G_SDIV: { 3292 DstDivReg = MI.getOperand(0).getReg(); 3293 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3294 break; 3295 } 3296 case AMDGPU::G_SREM: { 3297 DstRemReg = MI.getOperand(0).getReg(); 3298 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3299 break; 3300 } 3301 case AMDGPU::G_SDIVREM: { 3302 DstDivReg = MI.getOperand(0).getReg(); 3303 DstRemReg = MI.getOperand(1).getReg(); 3304 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3305 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3306 break; 3307 } 3308 } 3309 3310 if (Ty == S32) 3311 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3312 else 3313 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3314 3315 if (DstDivReg) { 3316 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3317 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3318 B.buildSub(DstDivReg, SignXor, Sign); 3319 } 3320 3321 if (DstRemReg) { 3322 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3323 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3324 B.buildSub(DstRemReg, SignXor, Sign); 3325 } 3326 3327 MI.eraseFromParent(); 3328 return true; 3329 } 3330 3331 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3332 MachineRegisterInfo &MRI, 3333 MachineIRBuilder &B) const { 3334 Register Res = MI.getOperand(0).getReg(); 3335 Register LHS = MI.getOperand(1).getReg(); 3336 Register RHS = MI.getOperand(2).getReg(); 3337 uint16_t Flags = MI.getFlags(); 3338 LLT ResTy = MRI.getType(Res); 3339 3340 const MachineFunction &MF = B.getMF(); 3341 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3342 MI.getFlag(MachineInstr::FmAfn); 3343 3344 if (!AllowInaccurateRcp) 3345 return false; 3346 3347 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3348 // 1 / x -> RCP(x) 3349 if (CLHS->isExactlyValue(1.0)) { 3350 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3351 .addUse(RHS) 3352 .setMIFlags(Flags); 3353 3354 MI.eraseFromParent(); 3355 return true; 3356 } 3357 3358 // -1 / x -> RCP( FNEG(x) ) 3359 if (CLHS->isExactlyValue(-1.0)) { 3360 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3361 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3362 .addUse(FNeg.getReg(0)) 3363 .setMIFlags(Flags); 3364 3365 MI.eraseFromParent(); 3366 return true; 3367 } 3368 } 3369 3370 // x / y -> x * (1.0 / y) 3371 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3372 .addUse(RHS) 3373 .setMIFlags(Flags); 3374 B.buildFMul(Res, LHS, RCP, Flags); 3375 3376 MI.eraseFromParent(); 3377 return true; 3378 } 3379 3380 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3381 MachineRegisterInfo &MRI, 3382 MachineIRBuilder &B) const { 3383 Register Res = MI.getOperand(0).getReg(); 3384 Register X = MI.getOperand(1).getReg(); 3385 Register Y = MI.getOperand(2).getReg(); 3386 uint16_t Flags = MI.getFlags(); 3387 LLT ResTy = MRI.getType(Res); 3388 3389 const MachineFunction &MF = B.getMF(); 3390 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3391 MI.getFlag(MachineInstr::FmAfn); 3392 3393 if (!AllowInaccurateRcp) 3394 return false; 3395 3396 auto NegY = B.buildFNeg(ResTy, Y); 3397 auto One = B.buildFConstant(ResTy, 1.0); 3398 3399 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3400 .addUse(Y) 3401 .setMIFlags(Flags); 3402 3403 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3404 R = B.buildFMA(ResTy, Tmp0, R, R); 3405 3406 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3407 R = B.buildFMA(ResTy, Tmp1, R, R); 3408 3409 auto Ret = B.buildFMul(ResTy, X, R); 3410 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3411 3412 B.buildFMA(Res, Tmp2, R, Ret); 3413 MI.eraseFromParent(); 3414 return true; 3415 } 3416 3417 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3418 MachineRegisterInfo &MRI, 3419 MachineIRBuilder &B) const { 3420 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3421 return true; 3422 3423 Register Res = MI.getOperand(0).getReg(); 3424 Register LHS = MI.getOperand(1).getReg(); 3425 Register RHS = MI.getOperand(2).getReg(); 3426 3427 uint16_t Flags = MI.getFlags(); 3428 3429 LLT S16 = LLT::scalar(16); 3430 LLT S32 = LLT::scalar(32); 3431 3432 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3433 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3434 3435 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3436 .addUse(RHSExt.getReg(0)) 3437 .setMIFlags(Flags); 3438 3439 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3440 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3441 3442 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3443 .addUse(RDst.getReg(0)) 3444 .addUse(RHS) 3445 .addUse(LHS) 3446 .setMIFlags(Flags); 3447 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3453 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3454 static void toggleSPDenormMode(bool Enable, 3455 MachineIRBuilder &B, 3456 const GCNSubtarget &ST, 3457 AMDGPU::SIModeRegisterDefaults Mode) { 3458 // Set SP denorm mode to this value. 3459 unsigned SPDenormMode = 3460 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3461 3462 if (ST.hasDenormModeInst()) { 3463 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3464 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3465 3466 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3467 B.buildInstr(AMDGPU::S_DENORM_MODE) 3468 .addImm(NewDenormModeValue); 3469 3470 } else { 3471 // Select FP32 bit field in mode register. 
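// A hedged reading of the field descriptor built below, assuming the usual
// hwreg encoding of {id, offset, width-1}: ID_MODE selects the MODE register,
// the offset of 4 together with a width-minus-one of 1 selects a 2-bit field
// at bits [5:4], which is where the FP32 denorm controls are expected to sit,
// so only the single-precision denorm bits get rewritten. This mirrors the
// S_DENORM_MODE path above, whose 4-bit immediate keeps FP32 in the low two
// bits and FP64/FP16 in the high two.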
3472 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3473 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3474 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3475 3476 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3477 .addImm(SPDenormMode) 3478 .addImm(SPDenormModeBitField); 3479 } 3480 } 3481 3482 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3483 MachineRegisterInfo &MRI, 3484 MachineIRBuilder &B) const { 3485 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3486 return true; 3487 3488 Register Res = MI.getOperand(0).getReg(); 3489 Register LHS = MI.getOperand(1).getReg(); 3490 Register RHS = MI.getOperand(2).getReg(); 3491 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3492 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3493 3494 uint16_t Flags = MI.getFlags(); 3495 3496 LLT S32 = LLT::scalar(32); 3497 LLT S1 = LLT::scalar(1); 3498 3499 auto One = B.buildFConstant(S32, 1.0f); 3500 3501 auto DenominatorScaled = 3502 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3503 .addUse(LHS) 3504 .addUse(RHS) 3505 .addImm(0) 3506 .setMIFlags(Flags); 3507 auto NumeratorScaled = 3508 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3509 .addUse(LHS) 3510 .addUse(RHS) 3511 .addImm(1) 3512 .setMIFlags(Flags); 3513 3514 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3515 .addUse(DenominatorScaled.getReg(0)) 3516 .setMIFlags(Flags); 3517 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3518 3519 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3520 // aren't modeled as reading it. 3521 if (!Mode.allFP32Denormals()) 3522 toggleSPDenormMode(true, B, ST, Mode); 3523 3524 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3525 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3526 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3527 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3528 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3529 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3530 3531 if (!Mode.allFP32Denormals()) 3532 toggleSPDenormMode(false, B, ST, Mode); 3533 3534 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3535 .addUse(Fma4.getReg(0)) 3536 .addUse(Fma1.getReg(0)) 3537 .addUse(Fma3.getReg(0)) 3538 .addUse(NumeratorScaled.getReg(1)) 3539 .setMIFlags(Flags); 3540 3541 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3542 .addUse(Fmas.getReg(0)) 3543 .addUse(RHS) 3544 .addUse(LHS) 3545 .setMIFlags(Flags); 3546 3547 MI.eraseFromParent(); 3548 return true; 3549 } 3550 3551 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3552 MachineRegisterInfo &MRI, 3553 MachineIRBuilder &B) const { 3554 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3555 return true; 3556 3557 Register Res = MI.getOperand(0).getReg(); 3558 Register LHS = MI.getOperand(1).getReg(); 3559 Register RHS = MI.getOperand(2).getReg(); 3560 3561 uint16_t Flags = MI.getFlags(); 3562 3563 LLT S64 = LLT::scalar(64); 3564 LLT S1 = LLT::scalar(1); 3565 3566 auto One = B.buildFConstant(S64, 1.0); 3567 3568 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3569 .addUse(LHS) 3570 .addUse(RHS) 3571 .addImm(0) 3572 .setMIFlags(Flags); 3573 3574 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3575 3576 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3577 .addUse(DivScale0.getReg(0)) 3578 .setMIFlags(Flags); 3579 3580 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3581 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3582 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3583 3584 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3585 .addUse(LHS) 3586 .addUse(RHS) 3587 .addImm(1) 3588 .setMIFlags(Flags); 3589 3590 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3591 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3592 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3593 3594 Register Scale; 3595 if (!ST.hasUsableDivScaleConditionOutput()) { 3596 // Workaround a hardware bug on SI where the condition output from div_scale 3597 // is not usable. 3598 3599 LLT S32 = LLT::scalar(32); 3600 3601 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3602 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3603 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3604 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3605 3606 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3607 Scale1Unmerge.getReg(1)); 3608 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3609 Scale0Unmerge.getReg(1)); 3610 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3611 } else { 3612 Scale = DivScale1.getReg(1); 3613 } 3614 3615 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3616 .addUse(Fma4.getReg(0)) 3617 .addUse(Fma3.getReg(0)) 3618 .addUse(Mul.getReg(0)) 3619 .addUse(Scale) 3620 .setMIFlags(Flags); 3621 3622 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3623 .addUse(Fmas.getReg(0)) 3624 .addUse(RHS) 3625 .addUse(LHS) 3626 .setMIFlags(Flags); 3627 3628 MI.eraseFromParent(); 3629 return true; 3630 } 3631 3632 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3633 MachineRegisterInfo &MRI, 3634 MachineIRBuilder &B) const { 3635 Register Res = MI.getOperand(0).getReg(); 3636 Register LHS = MI.getOperand(2).getReg(); 3637 Register RHS = MI.getOperand(3).getReg(); 3638 uint16_t Flags = MI.getFlags(); 3639 3640 LLT S32 = LLT::scalar(32); 3641 LLT S1 = LLT::scalar(1); 3642 3643 auto Abs = B.buildFAbs(S32, RHS, Flags); 3644 const APFloat C0Val(1.0f); 3645 3646 auto C0 = B.buildConstant(S32, 0x6f800000); 3647 auto C1 = B.buildConstant(S32, 0x2f800000); 3648 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3649 3650 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3651 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3652 3653 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3654 3655 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3656 .addUse(Mul0.getReg(0)) 3657 .setMIFlags(Flags); 3658 3659 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3660 3661 B.buildFMul(Res, Sel, Mul1, Flags); 3662 3663 MI.eraseFromParent(); 3664 return true; 3665 } 3666 3667 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3668 // FIXME: Why do we handle this one but not other removed instructions? 3669 // 3670 // Reciprocal square root. The clamp prevents infinite results, clamping 3671 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3672 // +-max_float. 
3673 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3674 MachineRegisterInfo &MRI, 3675 MachineIRBuilder &B) const { 3676 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3677 return true; 3678 3679 Register Dst = MI.getOperand(0).getReg(); 3680 Register Src = MI.getOperand(2).getReg(); 3681 auto Flags = MI.getFlags(); 3682 3683 LLT Ty = MRI.getType(Dst); 3684 3685 const fltSemantics *FltSemantics; 3686 if (Ty == LLT::scalar(32)) 3687 FltSemantics = &APFloat::IEEEsingle(); 3688 else if (Ty == LLT::scalar(64)) 3689 FltSemantics = &APFloat::IEEEdouble(); 3690 else 3691 return false; 3692 3693 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3694 .addUse(Src) 3695 .setMIFlags(Flags); 3696 3697 // We don't need to concern ourselves with the snan handling difference, since 3698 // the rsq quieted (or not) so use the one which will directly select. 3699 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3700 const bool UseIEEE = MFI->getMode().IEEE; 3701 3702 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3703 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3704 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3705 3706 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3707 3708 if (UseIEEE) 3709 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3710 else 3711 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3712 MI.eraseFromParent(); 3713 return true; 3714 } 3715 3716 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3717 switch (IID) { 3718 case Intrinsic::amdgcn_ds_fadd: 3719 return AMDGPU::G_ATOMICRMW_FADD; 3720 case Intrinsic::amdgcn_ds_fmin: 3721 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3722 case Intrinsic::amdgcn_ds_fmax: 3723 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3724 default: 3725 llvm_unreachable("not a DS FP intrinsic"); 3726 } 3727 } 3728 3729 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3730 MachineInstr &MI, 3731 Intrinsic::ID IID) const { 3732 GISelChangeObserver &Observer = Helper.Observer; 3733 Observer.changingInstr(MI); 3734 3735 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3736 3737 // The remaining operands were used to set fields in the MemOperand on 3738 // construction. 3739 for (int I = 6; I > 3; --I) 3740 MI.removeOperand(I); 3741 3742 MI.removeOperand(1); // Remove the intrinsic ID. 
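// For reference, a sketch of the assumed incoming operand layout (it is not
// spelled out here): llvm.amdgcn.ds.fadd/fmin/fmax arrive as
//   [0] dst def, [1] intrinsic ID, [2] ptr, [3] value,
//   [4] ordering, [5] scope, [6] isVolatile
// so dropping operands 6..4 and then operand 1 leaves [dst, ptr, value],
// which is the layout the G_ATOMICRMW_FADD / G_AMDGPU_ATOMIC_FMIN/FMAX
// pseudos expect.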
3743 Observer.changedInstr(MI); 3744 return true; 3745 } 3746 3747 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3748 MachineRegisterInfo &MRI, 3749 MachineIRBuilder &B) const { 3750 uint64_t Offset = 3751 ST.getTargetLowering()->getImplicitParameterOffset( 3752 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3753 LLT DstTy = MRI.getType(DstReg); 3754 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3755 3756 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3757 if (!loadInputValue(KernargPtrReg, B, 3758 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3759 return false; 3760 3761 // FIXME: This should be nuw 3762 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3763 return true; 3764 } 3765 3766 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3767 MachineRegisterInfo &MRI, 3768 MachineIRBuilder &B) const { 3769 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3770 if (!MFI->isEntryFunction()) { 3771 return legalizePreloadedArgIntrin(MI, MRI, B, 3772 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3773 } 3774 3775 Register DstReg = MI.getOperand(0).getReg(); 3776 if (!getImplicitArgPtr(DstReg, MRI, B)) 3777 return false; 3778 3779 MI.eraseFromParent(); 3780 return true; 3781 } 3782 3783 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3784 MachineRegisterInfo &MRI, 3785 MachineIRBuilder &B, 3786 unsigned AddrSpace) const { 3787 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3788 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3789 Register Hi32 = Unmerge.getReg(1); 3790 3791 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3792 MI.eraseFromParent(); 3793 return true; 3794 } 3795 3796 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3797 // offset (the offset that is included in bounds checking and swizzling, to be 3798 // split between the instruction's voffset and immoffset fields) and soffset 3799 // (the offset that is excluded from bounds checking and swizzling, to go in 3800 // the instruction's soffset field). This function takes the first kind of 3801 // offset and figures out how to split it between voffset and immoffset. 3802 std::pair<Register, unsigned> 3803 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3804 Register OrigOffset) const { 3805 const unsigned MaxImm = 4095; 3806 Register BaseReg; 3807 unsigned ImmOffset; 3808 const LLT S32 = LLT::scalar(32); 3809 MachineRegisterInfo &MRI = *B.getMRI(); 3810 3811 std::tie(BaseReg, ImmOffset) = 3812 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3813 3814 // If BaseReg is a pointer, convert it to int. 3815 if (MRI.getType(BaseReg).isPointer()) 3816 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3817 3818 // If the immediate value is too big for the immoffset field, put the value 3819 // and -4096 into the immoffset field so that the value that is copied/added 3820 // for the voffset field is a multiple of 4096, and it stands more chance 3821 // of being CSEd with the copy/add for another similar load/store. 3822 // However, do not do that rounding down to a multiple of 4096 if that is a 3823 // negative number, as it appears to be illegal to have a negative offset 3824 // in the vgpr, even if adding the immediate offset makes it positive. 
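// Two worked examples of the split below (illustrative values only, with the
// 12-bit immediate limit MaxImm = 4095):
//   constant offset 5000  -> Overflow = 4096, ImmOffset = 904; the 4096 is
//                            added to the voffset and 904 lands in the
//                            immoffset field.
//   constant offset -2048 -> the masked-off part is negative, so the
//                            rounding down is undone: ImmOffset becomes 0 and
//                            the whole -2048 is left for the voffset add.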
3825 unsigned Overflow = ImmOffset & ~MaxImm; 3826 ImmOffset -= Overflow; 3827 if ((int32_t)Overflow < 0) { 3828 Overflow += ImmOffset; 3829 ImmOffset = 0; 3830 } 3831 3832 if (Overflow != 0) { 3833 if (!BaseReg) { 3834 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3835 } else { 3836 auto OverflowVal = B.buildConstant(S32, Overflow); 3837 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3838 } 3839 } 3840 3841 if (!BaseReg) 3842 BaseReg = B.buildConstant(S32, 0).getReg(0); 3843 3844 return std::make_pair(BaseReg, ImmOffset); 3845 } 3846 3847 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3848 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3849 Register VOffset, Register SOffset, 3850 unsigned ImmOffset, Register VIndex, 3851 MachineRegisterInfo &MRI) const { 3852 Optional<ValueAndVReg> MaybeVOffsetVal = 3853 getIConstantVRegValWithLookThrough(VOffset, MRI); 3854 Optional<ValueAndVReg> MaybeSOffsetVal = 3855 getIConstantVRegValWithLookThrough(SOffset, MRI); 3856 Optional<ValueAndVReg> MaybeVIndexVal = 3857 getIConstantVRegValWithLookThrough(VIndex, MRI); 3858 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3859 // update the MMO with that offset. The stride is unknown so we can only do 3860 // this if VIndex is constant 0. 3861 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3862 MaybeVIndexVal->Value == 0) { 3863 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3864 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3865 MMO->setOffset(TotalOffset); 3866 } else { 3867 // We don't have a constant combined offset to use in the MMO. Give up. 3868 MMO->setValue((Value *)nullptr); 3869 } 3870 } 3871 3872 /// Handle register layout difference for f16 images for some subtargets. 
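///
/// Roughly, and only as a summary of the cases handled below: subtargets with
/// unpacked D16 VMEM get each s16 element any-extended into its own s32 lane;
/// image stores on subtargets with the D16 image-store bug get the data padded
/// with undef and recast so it covers whole dwords; everything else keeps the
/// packed v2s16 layout, with <3 x s16> padded to <4 x s16> using undef.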
3873 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3874 MachineRegisterInfo &MRI, 3875 Register Reg, 3876 bool ImageStore) const { 3877 const LLT S16 = LLT::scalar(16); 3878 const LLT S32 = LLT::scalar(32); 3879 LLT StoreVT = MRI.getType(Reg); 3880 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3881 3882 if (ST.hasUnpackedD16VMem()) { 3883 auto Unmerge = B.buildUnmerge(S16, Reg); 3884 3885 SmallVector<Register, 4> WideRegs; 3886 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3887 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3888 3889 int NumElts = StoreVT.getNumElements(); 3890 3891 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3892 .getReg(0); 3893 } 3894 3895 if (ImageStore && ST.hasImageStoreD16Bug()) { 3896 if (StoreVT.getNumElements() == 2) { 3897 SmallVector<Register, 4> PackedRegs; 3898 Reg = B.buildBitcast(S32, Reg).getReg(0); 3899 PackedRegs.push_back(Reg); 3900 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3901 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3902 .getReg(0); 3903 } 3904 3905 if (StoreVT.getNumElements() == 3) { 3906 SmallVector<Register, 4> PackedRegs; 3907 auto Unmerge = B.buildUnmerge(S16, Reg); 3908 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3909 PackedRegs.push_back(Unmerge.getReg(I)); 3910 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3911 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3912 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3913 } 3914 3915 if (StoreVT.getNumElements() == 4) { 3916 SmallVector<Register, 4> PackedRegs; 3917 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3918 auto Unmerge = B.buildUnmerge(S32, Reg); 3919 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3920 PackedRegs.push_back(Unmerge.getReg(I)); 3921 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3922 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3923 .getReg(0); 3924 } 3925 3926 llvm_unreachable("invalid data type"); 3927 } 3928 3929 if (StoreVT == LLT::fixed_vector(3, S16)) { 3930 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3931 .getReg(0); 3932 } 3933 return Reg; 3934 } 3935 3936 Register AMDGPULegalizerInfo::fixStoreSourceType( 3937 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3938 MachineRegisterInfo *MRI = B.getMRI(); 3939 LLT Ty = MRI->getType(VData); 3940 3941 const LLT S16 = LLT::scalar(16); 3942 3943 // Fixup illegal register types for i8 stores. 
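// For example (illustrative, not exhaustive): a buffer store of an s8 or s16
// value is any-extended to s32 here; the byte/short store opcode picked later
// only writes the low bits, so the extended high bits are don't-care.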
3944 if (Ty == LLT::scalar(8) || Ty == S16) { 3945 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3946 return AnyExt; 3947 } 3948 3949 if (Ty.isVector()) { 3950 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3951 if (IsFormat) 3952 return handleD16VData(B, *MRI, VData); 3953 } 3954 } 3955 3956 return VData; 3957 } 3958 3959 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3960 MachineRegisterInfo &MRI, 3961 MachineIRBuilder &B, 3962 bool IsTyped, 3963 bool IsFormat) const { 3964 Register VData = MI.getOperand(1).getReg(); 3965 LLT Ty = MRI.getType(VData); 3966 LLT EltTy = Ty.getScalarType(); 3967 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3968 const LLT S32 = LLT::scalar(32); 3969 3970 VData = fixStoreSourceType(B, VData, IsFormat); 3971 Register RSrc = MI.getOperand(2).getReg(); 3972 3973 MachineMemOperand *MMO = *MI.memoperands_begin(); 3974 const int MemSize = MMO->getSize(); 3975 3976 unsigned ImmOffset; 3977 3978 // The typed intrinsics add an immediate after the registers. 3979 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3980 3981 // The struct intrinsic variants add one additional operand over raw. 3982 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3983 Register VIndex; 3984 int OpOffset = 0; 3985 if (HasVIndex) { 3986 VIndex = MI.getOperand(3).getReg(); 3987 OpOffset = 1; 3988 } else { 3989 VIndex = B.buildConstant(S32, 0).getReg(0); 3990 } 3991 3992 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3993 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3994 3995 unsigned Format = 0; 3996 if (IsTyped) { 3997 Format = MI.getOperand(5 + OpOffset).getImm(); 3998 ++OpOffset; 3999 } 4000 4001 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4002 4003 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4004 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4005 4006 unsigned Opc; 4007 if (IsTyped) { 4008 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4009 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4010 } else if (IsFormat) { 4011 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4012 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4013 } else { 4014 switch (MemSize) { 4015 case 1: 4016 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4017 break; 4018 case 2: 4019 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4020 break; 4021 default: 4022 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4023 break; 4024 } 4025 } 4026 4027 auto MIB = B.buildInstr(Opc) 4028 .addUse(VData) // vdata 4029 .addUse(RSrc) // rsrc 4030 .addUse(VIndex) // vindex 4031 .addUse(VOffset) // voffset 4032 .addUse(SOffset) // soffset 4033 .addImm(ImmOffset); // offset(imm) 4034 4035 if (IsTyped) 4036 MIB.addImm(Format); 4037 4038 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4039 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4040 .addMemOperand(MMO); 4041 4042 MI.eraseFromParent(); 4043 return true; 4044 } 4045 4046 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4047 MachineRegisterInfo &MRI, 4048 MachineIRBuilder &B, 4049 bool IsFormat, 4050 bool IsTyped) const { 4051 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4052 MachineMemOperand *MMO = *MI.memoperands_begin(); 4053 const LLT MemTy = MMO->getMemoryType(); 4054 const LLT S32 = LLT::scalar(32); 4055 4056 Register Dst = MI.getOperand(0).getReg(); 4057 Register RSrc = MI.getOperand(2).getReg(); 4058 4059 // The typed intrinsics add an immediate after the registers. 
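// A sketch of the operand counting that follows, assuming the usual
// G_INTRINSIC layout of dst def, intrinsic ID, then the IR arguments:
// raw.buffer.load carries 4 arguments (rsrc, voffset, soffset, aux) for 6
// operands total, the struct variant adds vindex for 7, and the typed
// (tbuffer) forms add a format immediate on top of that. Comparing the
// operand count against NumVIndexOps is how the struct variants are detected.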
4060 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4061 4062 // The struct intrinsic variants add one additional operand over raw. 4063 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4064 Register VIndex; 4065 int OpOffset = 0; 4066 if (HasVIndex) { 4067 VIndex = MI.getOperand(3).getReg(); 4068 OpOffset = 1; 4069 } else { 4070 VIndex = B.buildConstant(S32, 0).getReg(0); 4071 } 4072 4073 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4074 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4075 4076 unsigned Format = 0; 4077 if (IsTyped) { 4078 Format = MI.getOperand(5 + OpOffset).getImm(); 4079 ++OpOffset; 4080 } 4081 4082 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4083 unsigned ImmOffset; 4084 4085 LLT Ty = MRI.getType(Dst); 4086 LLT EltTy = Ty.getScalarType(); 4087 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4088 const bool Unpacked = ST.hasUnpackedD16VMem(); 4089 4090 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4091 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4092 4093 unsigned Opc; 4094 4095 if (IsTyped) { 4096 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4097 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4098 } else if (IsFormat) { 4099 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4100 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4101 } else { 4102 switch (MemTy.getSizeInBits()) { 4103 case 8: 4104 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4105 break; 4106 case 16: 4107 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4108 break; 4109 default: 4110 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4111 break; 4112 } 4113 } 4114 4115 Register LoadDstReg; 4116 4117 bool IsExtLoad = 4118 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4119 LLT UnpackedTy = Ty.changeElementSize(32); 4120 4121 if (IsExtLoad) 4122 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4123 else if (Unpacked && IsD16 && Ty.isVector()) 4124 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4125 else 4126 LoadDstReg = Dst; 4127 4128 auto MIB = B.buildInstr(Opc) 4129 .addDef(LoadDstReg) // vdata 4130 .addUse(RSrc) // rsrc 4131 .addUse(VIndex) // vindex 4132 .addUse(VOffset) // voffset 4133 .addUse(SOffset) // soffset 4134 .addImm(ImmOffset); // offset(imm) 4135 4136 if (IsTyped) 4137 MIB.addImm(Format); 4138 4139 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4140 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4141 .addMemOperand(MMO); 4142 4143 if (LoadDstReg != Dst) { 4144 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4145 4146 // The result of an extending load was widened to s32; truncate it back to the original type. 4147 if (IsExtLoad) 4148 B.buildTrunc(Dst, LoadDstReg); 4149 else { 4150 // Repack to original 16-bit vector result 4151 // FIXME: G_TRUNC should work, but legalization currently fails 4152 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4153 SmallVector<Register, 4> Repack; 4154 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4155 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4156 B.buildMerge(Dst, Repack); 4157 } 4158 } 4159 4160 MI.eraseFromParent(); 4161 return true; 4162 } 4163 4164 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4165 MachineIRBuilder &B, 4166 bool IsInc) const { 4167 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4168 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4169 B.buildInstr(Opc) 4170 .addDef(MI.getOperand(0).getReg()) 4171 .addUse(MI.getOperand(2).getReg()) 4172 .addUse(MI.getOperand(3).getReg()) 4173 .cloneMemRefs(MI); 4174 MI.eraseFromParent(); 4175 return true; 4176 } 4177 4178 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4179 switch (IntrID) { 4180 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4181 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4182 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4183 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4184 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4185 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4186 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4187 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4188 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4189 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4190 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4191 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4192 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4193 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4194 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4195 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4196 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4197 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4198 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4199 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4200 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4201 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4202 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4203 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4204 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4205 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4206 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4207 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4208 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4210 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4211 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4212 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4213 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4214 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4215 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4216 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4217 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4218 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4219 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4220 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4221 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4222 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4223 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4224 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4225 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4226 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4227 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4228 default: 4229 llvm_unreachable("unhandled atomic opcode"); 4230 } 4231 } 4232 4233 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4234 MachineIRBuilder &B, 4235 Intrinsic::ID IID) const { 4236 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4237 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4238 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4239 4240 Register Dst; 4241 4242 int OpOffset = 0; 4243 if (HasReturn) { 4244 // A few FP atomics do not support return values. 
4245 Dst = MI.getOperand(0).getReg(); 4246 } else { 4247 OpOffset = -1; 4248 } 4249 4250 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4251 Register CmpVal; 4252 4253 if (IsCmpSwap) { 4254 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4255 ++OpOffset; 4256 } 4257 4258 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4259 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4260 4261 // The struct intrinsic variants add one additional operand over raw. 4262 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4263 Register VIndex; 4264 if (HasVIndex) { 4265 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4266 ++OpOffset; 4267 } else { 4268 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4269 } 4270 4271 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4272 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4273 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4274 4275 MachineMemOperand *MMO = *MI.memoperands_begin(); 4276 4277 unsigned ImmOffset; 4278 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4279 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4280 4281 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4282 4283 if (HasReturn) 4284 MIB.addDef(Dst); 4285 4286 MIB.addUse(VData); // vdata 4287 4288 if (IsCmpSwap) 4289 MIB.addReg(CmpVal); 4290 4291 MIB.addUse(RSrc) // rsrc 4292 .addUse(VIndex) // vindex 4293 .addUse(VOffset) // voffset 4294 .addUse(SOffset) // soffset 4295 .addImm(ImmOffset) // offset(imm) 4296 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4297 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4298 .addMemOperand(MMO); 4299 4300 MI.eraseFromParent(); 4301 return true; 4302 } 4303 4304 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4305 /// vector with s16 typed elements. 4306 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4307 SmallVectorImpl<Register> &PackedAddrs, 4308 unsigned ArgOffset, 4309 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4310 bool IsA16, bool IsG16) { 4311 const LLT S16 = LLT::scalar(16); 4312 const LLT V2S16 = LLT::fixed_vector(2, 16); 4313 auto EndIdx = Intr->VAddrEnd; 4314 4315 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4316 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4317 if (!SrcOp.isReg()) 4318 continue; // _L to _LZ may have eliminated this. 4319 4320 Register AddrReg = SrcOp.getReg(); 4321 4322 if ((I < Intr->GradientStart) || 4323 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4324 (I >= Intr->CoordStart && !IsA16)) { 4325 if ((I < Intr->GradientStart) && IsA16 && 4326 (B.getMRI()->getType(AddrReg) == S16)) { 4327 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4328 // Special handling of bias when A16 is on. Bias is of type half but 4329 // occupies full 32-bit. 4330 PackedAddrs.push_back( 4331 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4332 .getReg(0)); 4333 } else { 4334 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4335 "Bias needs to be converted to 16 bit in A16 mode"); 4336 // Handle any gradient or coordinate operands that should not be packed 4337 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4338 PackedAddrs.push_back(AddrReg); 4339 } 4340 } else { 4341 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4342 // derivatives dx/dh and dx/dv are packed with undef. 
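// A concrete reading of the index checks below (illustrative): for 1D
// gradients {dx/dh, dx/dv}, NumGradients / 2 == 1 is odd, so both
// GradientStart + (NumGradients / 2) - 1 (the last horizontal derivative) and
// GradientStart + NumGradients - 1 (the last vertical derivative) are paired
// with undef instead of with the unrelated operand that follows them.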
4343 if (((I + 1) >= EndIdx) || 4344 ((Intr->NumGradients / 2) % 2 == 1 && 4345 (I == static_cast<unsigned>(Intr->GradientStart + 4346 (Intr->NumGradients / 2) - 1) || 4347 I == static_cast<unsigned>(Intr->GradientStart + 4348 Intr->NumGradients - 1))) || 4349 // Check for _L to _LZ optimization 4350 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4351 PackedAddrs.push_back( 4352 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4353 .getReg(0)); 4354 } else { 4355 PackedAddrs.push_back( 4356 B.buildBuildVector( 4357 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4358 .getReg(0)); 4359 ++I; 4360 } 4361 } 4362 } 4363 } 4364 4365 /// Convert from separate vaddr components to a single vector address register, 4366 /// and replace the remaining operands with $noreg. 4367 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4368 int DimIdx, int NumVAddrs) { 4369 const LLT S32 = LLT::scalar(32); 4370 4371 SmallVector<Register, 8> AddrRegs; 4372 for (int I = 0; I != NumVAddrs; ++I) { 4373 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4374 if (SrcOp.isReg()) { 4375 AddrRegs.push_back(SrcOp.getReg()); 4376 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4377 } 4378 } 4379 4380 int NumAddrRegs = AddrRegs.size(); 4381 if (NumAddrRegs != 1) { 4382 // Above 8 elements round up to next power of 2 (i.e. 16). 4383 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4384 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4385 auto Undef = B.buildUndef(S32); 4386 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4387 NumAddrRegs = RoundedNumRegs; 4388 } 4389 4390 auto VAddr = 4391 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4392 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4393 } 4394 4395 for (int I = 1; I != NumVAddrs; ++I) { 4396 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4397 if (SrcOp.isReg()) 4398 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4399 } 4400 } 4401 4402 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4403 /// 4404 /// Depending on the subtarget, load/store with 16-bit element data need to be 4405 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4406 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4407 /// registers. 4408 /// 4409 /// We don't want to directly select image instructions just yet, but also want 4410 /// to exposes all register repacking to the legalizer/combiners. We also don't 4411 /// want a selected instruction entering RegBankSelect. In order to avoid 4412 /// defining a multitude of intermediate image instructions, directly hack on 4413 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4414 /// padding now unnecessary arguments with $noreg. 4415 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4416 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4417 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4418 4419 const unsigned NumDefs = MI.getNumExplicitDefs(); 4420 const unsigned ArgOffset = NumDefs + 1; 4421 bool IsTFE = NumDefs == 2; 4422 // We are only processing the operands of d16 image operations on subtargets 4423 // that use the unpacked register layout, or need to repack the TFE result. 4424 4425 // TODO: Do we need to guard against already legalized intrinsics? 
4426 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4427 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4428 4429 MachineRegisterInfo *MRI = B.getMRI(); 4430 const LLT S32 = LLT::scalar(32); 4431 const LLT S16 = LLT::scalar(16); 4432 const LLT V2S16 = LLT::fixed_vector(2, 16); 4433 4434 unsigned DMask = 0; 4435 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4436 LLT Ty = MRI->getType(VData); 4437 4438 // Check for 16 bit addresses and pack if true. 4439 LLT GradTy = 4440 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4441 LLT AddrTy = 4442 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4443 const bool IsG16 = GradTy == S16; 4444 const bool IsA16 = AddrTy == S16; 4445 const bool IsD16 = Ty.getScalarType() == S16; 4446 4447 int DMaskLanes = 0; 4448 if (!BaseOpcode->Atomic) { 4449 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4450 if (BaseOpcode->Gather4) { 4451 DMaskLanes = 4; 4452 } else if (DMask != 0) { 4453 DMaskLanes = countPopulation(DMask); 4454 } else if (!IsTFE && !BaseOpcode->Store) { 4455 // If dmask is 0, this is a no-op load. This can be eliminated. 4456 B.buildUndef(MI.getOperand(0)); 4457 MI.eraseFromParent(); 4458 return true; 4459 } 4460 } 4461 4462 Observer.changingInstr(MI); 4463 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4464 4465 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4466 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4467 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4468 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4469 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4470 4471 // Track that we legalized this 4472 MI.setDesc(B.getTII().get(NewOpcode)); 4473 4474 // With TFE on we still expect an error flag result even though dmask is 0. Force 4475 // dmask to be at least 1, otherwise the instruction will fail. 4476 if (IsTFE && DMask == 0) { 4477 DMask = 0x1; 4478 DMaskLanes = 1; 4479 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4480 } 4481 4482 if (BaseOpcode->Atomic) { 4483 Register VData0 = MI.getOperand(2).getReg(); 4484 LLT Ty = MRI->getType(VData0); 4485 4486 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4487 if (Ty.isVector()) 4488 return false; 4489 4490 if (BaseOpcode->AtomicX2) { 4491 Register VData1 = MI.getOperand(3).getReg(); 4492 // The two values are packed in one register. 4493 LLT PackedTy = LLT::fixed_vector(2, Ty); 4494 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4495 MI.getOperand(2).setReg(Concat.getReg(0)); 4496 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4497 } 4498 } 4499 4500 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4501 4502 // Rewrite the addressing register layout before doing anything else.
4503 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4504 // 16 bit gradients are supported, but are tied to the A16 control 4505 // so both gradients and addresses must be 16 bit 4506 return false; 4507 } 4508 4509 if (IsA16 && !ST.hasA16()) { 4510 // A16 not supported 4511 return false; 4512 } 4513 4514 if (IsA16 || IsG16) { 4515 if (Intr->NumVAddrs > 1) { 4516 SmallVector<Register, 4> PackedRegs; 4517 4518 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4519 IsG16); 4520 4521 // See also below in the non-a16 branch 4522 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4523 PackedRegs.size() <= ST.getNSAMaxSize(); 4524 4525 if (!UseNSA && PackedRegs.size() > 1) { 4526 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4527 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4528 PackedRegs[0] = Concat.getReg(0); 4529 PackedRegs.resize(1); 4530 } 4531 4532 const unsigned NumPacked = PackedRegs.size(); 4533 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4534 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4535 if (!SrcOp.isReg()) { 4536 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4537 continue; 4538 } 4539 4540 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4541 4542 if (I - Intr->VAddrStart < NumPacked) 4543 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4544 else 4545 SrcOp.setReg(AMDGPU::NoRegister); 4546 } 4547 } 4548 } else { 4549 // If the register allocator cannot place the address registers contiguously 4550 // without introducing moves, then using the non-sequential address encoding 4551 // is always preferable, since it saves VALU instructions and is usually a 4552 // wash in terms of code size or even better. 4553 // 4554 // However, we currently have no way of hinting to the register allocator 4555 // that MIMG addresses should be placed contiguously when it is possible to 4556 // do so, so force non-NSA for the common 2-address case as a heuristic. 4557 // 4558 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4559 // allocation when possible. 4560 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4561 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4562 4563 if (!UseNSA && Intr->NumVAddrs > 1) 4564 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4565 Intr->NumVAddrs); 4566 } 4567 4568 int Flags = 0; 4569 if (IsA16) 4570 Flags |= 1; 4571 if (IsG16) 4572 Flags |= 2; 4573 MI.addOperand(MachineOperand::CreateImm(Flags)); 4574 4575 if (BaseOpcode->Store) { // No TFE for stores? 4576 // TODO: Handle dmask trim 4577 if (!Ty.isVector() || !IsD16) 4578 return true; 4579 4580 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4581 if (RepackedReg != VData) { 4582 MI.getOperand(1).setReg(RepackedReg); 4583 } 4584 4585 return true; 4586 } 4587 4588 Register DstReg = MI.getOperand(0).getReg(); 4589 const LLT EltTy = Ty.getScalarType(); 4590 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4591 4592 // Confirm that the return type is large enough for the dmask specified 4593 if (NumElts < DMaskLanes) 4594 return false; 4595 4596 if (NumElts > 4 || DMaskLanes > 4) 4597 return false; 4598 4599 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4600 const LLT AdjustedTy = 4601 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4602 4603 // The raw dword aligned data component of the load. 
The only legal cases 4604 // where this matters should be when using the packed D16 format, for 4605 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4606 LLT RoundedTy; 4607 4608 // S32 vector to to cover all data, plus TFE result element. 4609 LLT TFETy; 4610 4611 // Register type to use for each loaded component. Will be S32 or V2S16. 4612 LLT RegTy; 4613 4614 if (IsD16 && ST.hasUnpackedD16VMem()) { 4615 RoundedTy = 4616 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4617 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4618 RegTy = S32; 4619 } else { 4620 unsigned EltSize = EltTy.getSizeInBits(); 4621 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4622 unsigned RoundedSize = 32 * RoundedElts; 4623 RoundedTy = LLT::scalarOrVector( 4624 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4625 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4626 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4627 } 4628 4629 // The return type does not need adjustment. 4630 // TODO: Should we change s16 case to s32 or <2 x s16>? 4631 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4632 return true; 4633 4634 Register Dst1Reg; 4635 4636 // Insert after the instruction. 4637 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4638 4639 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4640 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4641 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4642 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4643 4644 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4645 4646 MI.getOperand(0).setReg(NewResultReg); 4647 4648 // In the IR, TFE is supposed to be used with a 2 element struct return 4649 // type. The instruction really returns these two values in one contiguous 4650 // register, with one additional dword beyond the loaded data. Rewrite the 4651 // return type to use a single register result. 4652 4653 if (IsTFE) { 4654 Dst1Reg = MI.getOperand(1).getReg(); 4655 if (MRI->getType(Dst1Reg) != S32) 4656 return false; 4657 4658 // TODO: Make sure the TFE operand bit is set. 4659 MI.removeOperand(1); 4660 4661 // Handle the easy case that requires no repack instructions. 4662 if (Ty == S32) { 4663 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4664 return true; 4665 } 4666 } 4667 4668 // Now figure out how to copy the new result register back into the old 4669 // result. 4670 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4671 4672 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4673 4674 if (ResultNumRegs == 1) { 4675 assert(!IsTFE); 4676 ResultRegs[0] = NewResultReg; 4677 } else { 4678 // We have to repack into a new vector of some kind. 4679 for (int I = 0; I != NumDataRegs; ++I) 4680 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4681 B.buildUnmerge(ResultRegs, NewResultReg); 4682 4683 // Drop the final TFE element to get the data part. The TFE result is 4684 // directly written to the right place already. 4685 if (IsTFE) 4686 ResultRegs.resize(NumDataRegs); 4687 } 4688 4689 // For an s16 scalar result, we form an s32 result with a truncate regardless 4690 // of packed vs. unpacked. 4691 if (IsD16 && !Ty.isVector()) { 4692 B.buildTrunc(DstReg, ResultRegs[0]); 4693 return true; 4694 } 4695 4696 // Avoid a build/concat_vector of 1 entry. 
4697 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4698 B.buildBitcast(DstReg, ResultRegs[0]); 4699 return true; 4700 } 4701 4702 assert(Ty.isVector()); 4703 4704 if (IsD16) { 4705 // For packed D16 results with TFE enabled, all the data components are 4706 // S32. Cast back to the expected type. 4707 // 4708 // TODO: We don't really need to use load s32 elements. We would only need one 4709 // cast for the TFE result if a multiple of v2s16 was used. 4710 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4711 for (Register &Reg : ResultRegs) 4712 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4713 } else if (ST.hasUnpackedD16VMem()) { 4714 for (Register &Reg : ResultRegs) 4715 Reg = B.buildTrunc(S16, Reg).getReg(0); 4716 } 4717 } 4718 4719 auto padWithUndef = [&](LLT Ty, int NumElts) { 4720 if (NumElts == 0) 4721 return; 4722 Register Undef = B.buildUndef(Ty).getReg(0); 4723 for (int I = 0; I != NumElts; ++I) 4724 ResultRegs.push_back(Undef); 4725 }; 4726 4727 // Pad out any elements eliminated due to the dmask. 4728 LLT ResTy = MRI->getType(ResultRegs[0]); 4729 if (!ResTy.isVector()) { 4730 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4731 B.buildBuildVector(DstReg, ResultRegs); 4732 return true; 4733 } 4734 4735 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4736 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4737 4738 // Deal with the one annoying legal case. 4739 const LLT V3S16 = LLT::fixed_vector(3, 16); 4740 if (Ty == V3S16) { 4741 if (IsTFE) { 4742 if (ResultRegs.size() == 1) { 4743 NewResultReg = ResultRegs[0]; 4744 } else if (ResultRegs.size() == 2) { 4745 LLT V4S16 = LLT::fixed_vector(4, 16); 4746 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4747 } else { 4748 return false; 4749 } 4750 } 4751 4752 if (MRI->getType(DstReg).getNumElements() < 4753 MRI->getType(NewResultReg).getNumElements()) { 4754 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4755 } else { 4756 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4757 } 4758 return true; 4759 } 4760 4761 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4762 B.buildConcatVectors(DstReg, ResultRegs); 4763 return true; 4764 } 4765 4766 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4767 LegalizerHelper &Helper, MachineInstr &MI) const { 4768 MachineIRBuilder &B = Helper.MIRBuilder; 4769 GISelChangeObserver &Observer = Helper.Observer; 4770 4771 Register Dst = MI.getOperand(0).getReg(); 4772 LLT Ty = B.getMRI()->getType(Dst); 4773 unsigned Size = Ty.getSizeInBits(); 4774 MachineFunction &MF = B.getMF(); 4775 4776 Observer.changingInstr(MI); 4777 4778 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4779 Ty = getBitcastRegisterType(Ty); 4780 Helper.bitcastDst(MI, Ty, 0); 4781 Dst = MI.getOperand(0).getReg(); 4782 B.setInsertPt(B.getMBB(), MI); 4783 } 4784 4785 // FIXME: We don't really need this intermediate instruction. The intrinsic 4786 // should be fixed to have a memory operand. Since it's readnone, we're not 4787 // allowed to add one. 4788 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4789 MI.removeOperand(1); // Remove intrinsic ID 4790 4791 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4792 // TODO: Should this use datalayout alignment? 
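// Worked example (illustrative): an s.buffer.load producing 96 bits gets a
// 12-byte MMO here, and since 96 is not a power of two the result is widened
// below to 128 bits (<4 x s32> for vectors, s128 for scalars); RegBankSelect
// may later have to narrow it back if this becomes a vector load.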
4793 const unsigned MemSize = (Size + 7) / 8; 4794 const Align MemAlign(4); 4795 MachineMemOperand *MMO = MF.getMachineMemOperand( 4796 MachinePointerInfo(), 4797 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4798 MachineMemOperand::MOInvariant, 4799 MemSize, MemAlign); 4800 MI.addMemOperand(MF, MMO); 4801 4802 // There are no 96-bit result scalar loads, but widening to 128-bit should 4803 // always be legal. We may need to restore this to a 96-bit result if it turns 4804 // out this needs to be converted to a vector load during RegBankSelect. 4805 if (!isPowerOf2_32(Size)) { 4806 if (Ty.isVector()) 4807 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4808 else 4809 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4810 } 4811 4812 Observer.changedInstr(MI); 4813 return true; 4814 } 4815 4816 // TODO: Move to selection 4817 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4818 MachineRegisterInfo &MRI, 4819 MachineIRBuilder &B) const { 4820 if (!ST.isTrapHandlerEnabled() || 4821 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4822 return legalizeTrapEndpgm(MI, MRI, B); 4823 4824 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4825 switch (*HsaAbiVer) { 4826 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4827 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4828 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4829 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4830 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4831 return ST.supportsGetDoorbellID() ? 4832 legalizeTrapHsa(MI, MRI, B) : 4833 legalizeTrapHsaQueuePtr(MI, MRI, B); 4834 } 4835 } 4836 4837 llvm_unreachable("Unknown trap handler"); 4838 } 4839 4840 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4841 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4842 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4843 MI.eraseFromParent(); 4844 return true; 4845 } 4846 4847 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4849 MachineFunction &MF = B.getMF(); 4850 const LLT S64 = LLT::scalar(64); 4851 4852 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4853 // For code object version 5, queue_ptr is passed through implicit kernarg. 4854 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 4855 AMDGPUTargetLowering::ImplicitParameter Param = 4856 AMDGPUTargetLowering::QUEUE_PTR; 4857 uint64_t Offset = 4858 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 4859 4860 Register KernargPtrReg = MRI.createGenericVirtualRegister( 4861 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4862 4863 if (!loadInputValue(KernargPtrReg, B, 4864 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4865 return false; 4866 4867 // TODO: can we be smarter about machine pointer info? 
4868 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4869 MachineMemOperand *MMO = MF.getMachineMemOperand( 4870 PtrInfo, 4871 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4872 MachineMemOperand::MOInvariant, 4873 LLT::scalar(64), commonAlignment(Align(64), Offset)); 4874 4875 // Pointer address 4876 Register LoadAddr = MRI.createGenericVirtualRegister( 4877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4878 B.buildPtrAdd(LoadAddr, KernargPtrReg, 4879 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 4880 // Load address 4881 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 4882 B.buildCopy(SGPR01, Temp); 4883 B.buildInstr(AMDGPU::S_TRAP) 4884 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4885 .addReg(SGPR01, RegState::Implicit); 4886 MI.eraseFromParent(); 4887 return true; 4888 } 4889 4890 // Pass queue pointer to trap handler as input, and insert trap instruction 4891 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4892 Register LiveIn = 4893 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4894 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4895 return false; 4896 4897 B.buildCopy(SGPR01, LiveIn); 4898 B.buildInstr(AMDGPU::S_TRAP) 4899 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4900 .addReg(SGPR01, RegState::Implicit); 4901 4902 MI.eraseFromParent(); 4903 return true; 4904 } 4905 4906 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4907 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4908 B.buildInstr(AMDGPU::S_TRAP) 4909 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4910 MI.eraseFromParent(); 4911 return true; 4912 } 4913 4914 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4915 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4916 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4917 // accordingly 4918 if (!ST.isTrapHandlerEnabled() || 4919 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4920 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4921 "debugtrap handler not supported", 4922 MI.getDebugLoc(), DS_Warning); 4923 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4924 Ctx.diagnose(NoTrap); 4925 } else { 4926 // Insert debug-trap instruction 4927 B.buildInstr(AMDGPU::S_TRAP) 4928 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4929 } 4930 4931 MI.eraseFromParent(); 4932 return true; 4933 } 4934 4935 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4936 MachineIRBuilder &B) const { 4937 MachineRegisterInfo &MRI = *B.getMRI(); 4938 const LLT S16 = LLT::scalar(16); 4939 const LLT S32 = LLT::scalar(32); 4940 4941 Register DstReg = MI.getOperand(0).getReg(); 4942 Register NodePtr = MI.getOperand(2).getReg(); 4943 Register RayExtent = MI.getOperand(3).getReg(); 4944 Register RayOrigin = MI.getOperand(4).getReg(); 4945 Register RayDir = MI.getOperand(5).getReg(); 4946 Register RayInvDir = MI.getOperand(6).getReg(); 4947 Register TDescr = MI.getOperand(7).getReg(); 4948 4949 if (!ST.hasGFX10_AEncoding()) { 4950 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4951 "intrinsic not supported on subtarget", 4952 MI.getDebugLoc()); 4953 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4954 return false; 4955 } 4956 4957 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4958 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4959 const unsigned NumVDataDwords = 4; 4960 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4961 const bool UseNSA = 4962 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4963 const unsigned BaseOpcodes[2][2] = { 4964 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4965 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4966 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4967 int Opcode; 4968 if (UseNSA) { 4969 Opcode = 4970 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4971 NumVDataDwords, NumVAddrDwords); 4972 } else { 4973 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4974 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4975 PowerOf2Ceil(NumVAddrDwords)); 4976 } 4977 assert(Opcode != -1); 4978 4979 SmallVector<Register, 12> Ops; 4980 if (Is64) { 4981 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4982 Ops.push_back(Unmerge.getReg(0)); 4983 Ops.push_back(Unmerge.getReg(1)); 4984 } else { 4985 Ops.push_back(NodePtr); 4986 } 4987 Ops.push_back(RayExtent); 4988 4989 auto packLanes = [&Ops, &S32, &B](Register Src) { 4990 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4991 Ops.push_back(Unmerge.getReg(0)); 4992 Ops.push_back(Unmerge.getReg(1)); 4993 Ops.push_back(Unmerge.getReg(2)); 4994 }; 4995 4996 packLanes(RayOrigin); 4997 if (IsA16) { 4998 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4999 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5000 Register R1 = MRI.createGenericVirtualRegister(S32); 5001 Register R2 = MRI.createGenericVirtualRegister(S32); 5002 Register R3 = MRI.createGenericVirtualRegister(S32); 5003 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5004 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5005 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5006 Ops.push_back(R1); 5007 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

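// Lower the rounding-mode fptrunc: only TowardPositive and TowardNegative map
// to the dedicated round-upward / round-downward pseudos; any other rounding
// mode is rejected and legalization fails.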
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
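  // The buffer load and store intrinsics funnel into two shared helpers; the
  // boolean arguments distinguish the plain, format, and typed (tbuffer)
  // flavors of the underlying instruction.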
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
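  // A buffer FP atomic fadd whose result is used is only supported with the
  // gfx90a instructions; otherwise diagnose it and fold the result to undef.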
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}