//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
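// Worked examples for the two helpers above (illustrative comments, not part
// of the original source): getPow2VectorType(<3 x s16>) yields <4 x s16>, and
// getPow2ScalarType(LLT::scalar(96)) yields s128, since Log2_32_Ceil(96) == 7.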
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
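// Illustrative examples for isRegisterType (not from the original source):
// <6 x s16> (96 bits, an even number of 16-bit elements) satisfies
// isRegisterType, while <5 x s16> (80 bits, not a multiple of 32) does not.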
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
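// Illustrative example (not from the original source): an extending load of an
// s8 memory type into an s64 register fails the "only 1-byte and 2-byte to
// 32-bit extloads" check above, so it is not reported as legal here and is
// instead handled by the narrowing/lowering rules installed below.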
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);
  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                 .legalFor({S32})
                 .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
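  // Illustrative note (not from the original source): with the lambda above, a
  // 96-bit (3-dword) access is only kept whole when the subtarget has dwordx3
  // load/stores, and a 160-bit access (5 dwords, not a power of two) is always
  // reported as needing a split.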
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }
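  // Illustrative note (not from the original source): the extending-load table
  // above only covers 8-bit and 16-bit memory types extending to 32-bit
  // results; other result widths are clamped to s32 and lowered by the rules
  // that follow.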
  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                   .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1744 case TargetOpcode::G_UDIV: 1745 case TargetOpcode::G_UREM: 1746 case TargetOpcode::G_UDIVREM: 1747 return legalizeUnsignedDIV_REM(MI, MRI, B); 1748 case TargetOpcode::G_SDIV: 1749 case TargetOpcode::G_SREM: 1750 case TargetOpcode::G_SDIVREM: 1751 return legalizeSignedDIV_REM(MI, MRI, B); 1752 case TargetOpcode::G_ATOMIC_CMPXCHG: 1753 return legalizeAtomicCmpXChg(MI, MRI, B); 1754 case TargetOpcode::G_FLOG: 1755 return legalizeFlog(MI, B, numbers::ln2f); 1756 case TargetOpcode::G_FLOG10: 1757 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1758 case TargetOpcode::G_FEXP: 1759 return legalizeFExp(MI, B); 1760 case TargetOpcode::G_FPOW: 1761 return legalizeFPow(MI, B); 1762 case TargetOpcode::G_FFLOOR: 1763 return legalizeFFloor(MI, MRI, B); 1764 case TargetOpcode::G_BUILD_VECTOR: 1765 return legalizeBuildVector(MI, MRI, B); 1766 case TargetOpcode::G_CTLZ: 1767 case TargetOpcode::G_CTTZ: 1768 return legalizeCTLZ_CTTZ(MI, MRI, B); 1769 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1770 return legalizeFPTruncRound(MI, B); 1771 default: 1772 return false; 1773 } 1774 1775 llvm_unreachable("expected switch to return"); 1776 } 1777 1778 Register AMDGPULegalizerInfo::getSegmentAperture( 1779 unsigned AS, 1780 MachineRegisterInfo &MRI, 1781 MachineIRBuilder &B) const { 1782 MachineFunction &MF = B.getMF(); 1783 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1784 const LLT S32 = LLT::scalar(32); 1785 1786 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1787 1788 if (ST.hasApertureRegs()) { 1789 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1790 // getreg. 1791 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1792 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1793 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1794 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1795 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1796 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1797 unsigned Encoding = 1798 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1799 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1800 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1801 1802 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1803 1804 B.buildInstr(AMDGPU::S_GETREG_B32) 1805 .addDef(GetReg) 1806 .addImm(Encoding); 1807 MRI.setType(GetReg, S32); 1808 1809 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1810 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1811 } 1812 1813 // TODO: can we be smarter about machine pointer info? 1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1815 Register LoadAddr = MRI.createGenericVirtualRegister( 1816 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1817 // For code object version 5, private_base and shared_base are passed through 1818 // implicit kernargs. 1819 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1820 AMDGPUTargetLowering::ImplicitParameter Param = 1821 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1822 : AMDGPUTargetLowering::PRIVATE_BASE; 1823 uint64_t Offset = 1824 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1825 1826 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1827 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1828 1829 if (!loadInputValue(KernargPtrReg, B, 1830 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1831 return Register(); 1832 1833 MachineMemOperand *MMO = MF.getMachineMemOperand( 1834 PtrInfo, 1835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1836 MachineMemOperand::MOInvariant, 1837 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1838 1839 // Pointer address 1840 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1841 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1842 // Load address 1843 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1844 } 1845 1846 Register QueuePtr = MRI.createGenericVirtualRegister( 1847 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1848 1849 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1850 return Register(); 1851 1852 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1853 // private_segment_aperture_base_hi. 1854 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1855 1856 MachineMemOperand *MMO = MF.getMachineMemOperand( 1857 PtrInfo, 1858 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1859 MachineMemOperand::MOInvariant, 1860 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1861 1862 B.buildPtrAdd(LoadAddr, QueuePtr, 1863 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1864 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1865 } 1866 1867 /// Return true if the value is a known valid address, such that a null check is 1868 /// not necessary. 1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1870 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1871 MachineInstr *Def = MRI.getVRegDef(Val); 1872 switch (Def->getOpcode()) { 1873 case AMDGPU::G_FRAME_INDEX: 1874 case AMDGPU::G_GLOBAL_VALUE: 1875 case AMDGPU::G_BLOCK_ADDR: 1876 return true; 1877 case AMDGPU::G_CONSTANT: { 1878 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1879 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1880 } 1881 default: 1882 return false; 1883 } 1884 1885 return false; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 MachineFunction &MF = B.getMF(); 1892 1893 const LLT S32 = LLT::scalar(32); 1894 Register Dst = MI.getOperand(0).getReg(); 1895 Register Src = MI.getOperand(1).getReg(); 1896 1897 LLT DstTy = MRI.getType(Dst); 1898 LLT SrcTy = MRI.getType(Src); 1899 unsigned DestAS = DstTy.getAddressSpace(); 1900 unsigned SrcAS = SrcTy.getAddressSpace(); 1901 1902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1903 // vector element. 1904 assert(!DstTy.isVector()); 1905 1906 const AMDGPUTargetMachine &TM 1907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1908 1909 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1910 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1911 return true; 1912 } 1913 1914 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1915 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1916 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1917 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1918 // Extract low 32-bits of the pointer. 
1919 B.buildExtract(Dst, Src, 0); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 unsigned NullVal = TM.getNullPointerValue(DestAS); 1925 1926 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1927 auto FlatNull = B.buildConstant(SrcTy, 0); 1928 1929 // Extract low 32-bits of the pointer. 1930 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1931 1932 auto CmpRes = 1933 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1934 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1935 1936 MI.eraseFromParent(); 1937 return true; 1938 } 1939 1940 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1941 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1942 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1943 if (!ST.hasFlatAddressSpace()) 1944 return false; 1945 1946 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1947 if (!ApertureReg.isValid()) 1948 return false; 1949 1950 // Coerce the type of the low half of the result so we can use merge_values. 1951 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1952 1953 // TODO: Should we allow mismatched types but matching sizes in merges to 1954 // avoid the ptrtoint? 1955 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1956 1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1958 B.buildCopy(Dst, BuildPtr); 1959 MI.eraseFromParent(); 1960 return true; 1961 } 1962 1963 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1964 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1965 1966 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1967 SegmentNull.getReg(0)); 1968 1969 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1970 1971 MI.eraseFromParent(); 1972 return true; 1973 } 1974 1975 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1976 SrcTy.getSizeInBits() == 64) { 1977 // Truncate. 1978 B.buildExtract(Dst, Src, 0); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1984 DstTy.getSizeInBits() == 64) { 1985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1986 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1987 1988 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1989 // another. Merge operands are required to be the same type, but creating an 1990 // extra ptrtoint would be kind of pointless. 1991 auto HighAddr = B.buildConstant( 1992 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1993 B.buildMerge(Dst, {Src, HighAddr}); 1994 MI.eraseFromParent(); 1995 return true; 1996 } 1997 1998 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1999 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2000 2001 LLVMContext &Ctx = MF.getFunction().getContext(); 2002 Ctx.diagnose(InvalidAddrSpaceCast); 2003 B.buildUndef(Dst); 2004 MI.eraseFromParent(); 2005 return true; 2006 } 2007 2008 bool AMDGPULegalizerInfo::legalizeFrint( 2009 MachineInstr &MI, MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B) const { 2011 Register Src = MI.getOperand(1).getReg(); 2012 LLT Ty = MRI.getType(Src); 2013 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2014 2015 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2016 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2017 2018 auto C1 = B.buildFConstant(Ty, C1Val); 2019 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2020 2021 // TODO: Should this propagate fast-math-flags? 
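// Note on the constants above: C1 is 2^52 (with Src's sign applied via the
// copysign), and C2 is the largest f64 strictly below 2^52. Adding and then
// subtracting C1 pushes any fraction bits out of the f64 significand, so Tmp2
// below is Src rounded to an integer in the current rounding mode; inputs with
// |Src| > C2 already have no fractional part and are returned unchanged by the
// final select.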
2022 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2023 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2024 2025 auto C2 = B.buildFConstant(Ty, C2Val); 2026 auto Fabs = B.buildFAbs(Ty, Src); 2027 2028 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2029 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2030 MI.eraseFromParent(); 2031 return true; 2032 } 2033 2034 bool AMDGPULegalizerInfo::legalizeFceil( 2035 MachineInstr &MI, MachineRegisterInfo &MRI, 2036 MachineIRBuilder &B) const { 2037 2038 const LLT S1 = LLT::scalar(1); 2039 const LLT S64 = LLT::scalar(64); 2040 2041 Register Src = MI.getOperand(1).getReg(); 2042 assert(MRI.getType(Src) == S64); 2043 2044 // result = trunc(src) 2045 // if (src > 0.0 && src != result) 2046 // result += 1.0 2047 2048 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2049 2050 const auto Zero = B.buildFConstant(S64, 0.0); 2051 const auto One = B.buildFConstant(S64, 1.0); 2052 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2053 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2054 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2055 auto Add = B.buildSelect(S64, And, One, Zero); 2056 2057 // TODO: Should this propagate fast-math-flags? 2058 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeFrem( 2064 MachineInstr &MI, MachineRegisterInfo &MRI, 2065 MachineIRBuilder &B) const { 2066 Register DstReg = MI.getOperand(0).getReg(); 2067 Register Src0Reg = MI.getOperand(1).getReg(); 2068 Register Src1Reg = MI.getOperand(2).getReg(); 2069 auto Flags = MI.getFlags(); 2070 LLT Ty = MRI.getType(DstReg); 2071 2072 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2073 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2074 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2075 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 static MachineInstrBuilder extractF64Exponent(Register Hi, 2081 MachineIRBuilder &B) { 2082 const unsigned FractBits = 52; 2083 const unsigned ExpBits = 11; 2084 LLT S32 = LLT::scalar(32); 2085 2086 auto Const0 = B.buildConstant(S32, FractBits - 32); 2087 auto Const1 = B.buildConstant(S32, ExpBits); 2088 2089 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2090 .addUse(Hi) 2091 .addUse(Const0.getReg(0)) 2092 .addUse(Const1.getReg(0)); 2093 2094 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2095 } 2096 2097 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2098 MachineInstr &MI, MachineRegisterInfo &MRI, 2099 MachineIRBuilder &B) const { 2100 const LLT S1 = LLT::scalar(1); 2101 const LLT S32 = LLT::scalar(32); 2102 const LLT S64 = LLT::scalar(64); 2103 2104 Register Src = MI.getOperand(1).getReg(); 2105 assert(MRI.getType(Src) == S64); 2106 2107 // TODO: Should this use extract since the low half is unused? 2108 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2109 Register Hi = Unmerge.getReg(1); 2110 2111 // Extract the upper half, since this is where we will find the sign and 2112 // exponent. 2113 auto Exp = extractF64Exponent(Hi, B); 2114 2115 const unsigned FractBits = 52; 2116 2117 // Extract the sign bit. 2118 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2119 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2120 2121 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2122 2123 const auto Zero32 = B.buildConstant(S32, 0); 2124 2125 // Extend back to 64-bits. 
2126 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2127 2128 auto Shr = B.buildAShr(S64, FractMask, Exp); 2129 auto Not = B.buildNot(S64, Shr); 2130 auto Tmp0 = B.buildAnd(S64, Src, Not); 2131 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2132 2133 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2134 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2135 2136 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2137 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2138 MI.eraseFromParent(); 2139 return true; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeITOFP( 2143 MachineInstr &MI, MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B, bool Signed) const { 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src = MI.getOperand(1).getReg(); 2148 2149 const LLT S64 = LLT::scalar(64); 2150 const LLT S32 = LLT::scalar(32); 2151 2152 assert(MRI.getType(Src) == S64); 2153 2154 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2155 auto ThirtyTwo = B.buildConstant(S32, 32); 2156 2157 if (MRI.getType(Dst) == S64) { 2158 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2159 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2160 2161 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2162 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2163 .addUse(CvtHi.getReg(0)) 2164 .addUse(ThirtyTwo.getReg(0)); 2165 2166 // TODO: Should this propagate fast-math-flags? 2167 B.buildFAdd(Dst, LdExp, CvtLo); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 assert(MRI.getType(Dst) == S32); 2173 2174 auto One = B.buildConstant(S32, 1); 2175 2176 MachineInstrBuilder ShAmt; 2177 if (Signed) { 2178 auto ThirtyOne = B.buildConstant(S32, 31); 2179 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2180 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2181 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2182 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2183 /*HasSideEffects=*/false) 2184 .addUse(Unmerge.getReg(1)); 2185 auto LS2 = B.buildSub(S32, LS, One); 2186 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2187 } else 2188 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2189 auto Norm = B.buildShl(S64, Src, ShAmt); 2190 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2191 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2192 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2193 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2194 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2195 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2196 /*HasSideEffects=*/false) 2197 .addUse(FVal.getReg(0)) 2198 .addUse(Scale.getReg(0)); 2199 MI.eraseFromParent(); 2200 return true; 2201 } 2202 2203 // TODO: Copied from DAG implementation. Verify logic and document how this 2204 // actually works. 
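// In short (the block comment in the body gives the exact identities): the
// input is truncated toward zero and split as hif = floor(tf * 2^-32) and
// lof = tf - hif * 2^32; both halves are then converted to 32-bit integers
// and merged into the 64-bit result. For a signed f32 source the conversion
// is done on the absolute value and the sign is re-applied at the end with an
// xor/sub pair, since the f32 significand is too narrow to represent lof
// exactly for negative inputs.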
2205 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2206 MachineRegisterInfo &MRI, 2207 MachineIRBuilder &B, 2208 bool Signed) const { 2209 2210 Register Dst = MI.getOperand(0).getReg(); 2211 Register Src = MI.getOperand(1).getReg(); 2212 2213 const LLT S64 = LLT::scalar(64); 2214 const LLT S32 = LLT::scalar(32); 2215 2216 const LLT SrcLT = MRI.getType(Src); 2217 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2218 2219 unsigned Flags = MI.getFlags(); 2220 2221 // The basic idea of converting a floating point number into a pair of 32-bit 2222 // integers is illustrated as follows: 2223 // 2224 // tf := trunc(val); 2225 // hif := floor(tf * 2^-32); 2226 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2227 // hi := fptoi(hif); 2228 // lo := fptoi(lof); 2229 // 2230 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2231 MachineInstrBuilder Sign; 2232 if (Signed && SrcLT == S32) { 2233 // However, a 32-bit floating point number has only 23 bits mantissa and 2234 // it's not enough to hold all the significant bits of `lof` if val is 2235 // negative. To avoid the loss of precision, We need to take the absolute 2236 // value after truncating and flip the result back based on the original 2237 // signedness. 2238 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2239 Trunc = B.buildFAbs(S32, Trunc, Flags); 2240 } 2241 MachineInstrBuilder K0, K1; 2242 if (SrcLT == S64) { 2243 K0 = B.buildFConstant(S64, 2244 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2245 K1 = B.buildFConstant(S64, 2246 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2247 } else { 2248 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2249 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2250 } 2251 2252 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2253 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2254 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2255 2256 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2257 : B.buildFPTOUI(S32, FloorMul); 2258 auto Lo = B.buildFPTOUI(S32, Fma); 2259 2260 if (Signed && SrcLT == S32) { 2261 // Flip the result based on the signedness, which is either all 0s or 1s. 2262 Sign = B.buildMerge(S64, {Sign, Sign}); 2263 // r := xor({lo, hi}, sign) - sign; 2264 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2265 } else 2266 B.buildMerge(Dst, {Lo, Hi}); 2267 MI.eraseFromParent(); 2268 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2273 MachineInstr &MI) const { 2274 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2275 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2276 2277 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2278 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2279 2280 // With ieee_mode disabled, the instructions have the correct behavior 2281 // already for G_FMINNUM/G_FMAXNUM 2282 if (!MFI->getMode().IEEE) 2283 return !IsIEEEOp; 2284 2285 if (IsIEEEOp) 2286 return true; 2287 2288 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2289 } 2290 2291 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2292 MachineInstr &MI, MachineRegisterInfo &MRI, 2293 MachineIRBuilder &B) const { 2294 // TODO: Should move some of this into LegalizerHelper. 
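// Note: only extracts with a constant index are handled here (rewritten into
// an unmerge plus a copy of the selected element, or an undef for an
// out-of-range index); a dynamic index is left in place for the selector.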
2295 2296 // TODO: Promote dynamic indexing of s16 to s32 2297 2298 // FIXME: Artifact combiner probably should have replaced the truncated 2299 // constant before this, so we shouldn't need 2300 // getIConstantVRegValWithLookThrough. 2301 Optional<ValueAndVReg> MaybeIdxVal = 2302 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2303 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2304 return true; 2305 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2306 2307 Register Dst = MI.getOperand(0).getReg(); 2308 Register Vec = MI.getOperand(1).getReg(); 2309 2310 LLT VecTy = MRI.getType(Vec); 2311 LLT EltTy = VecTy.getElementType(); 2312 assert(EltTy == MRI.getType(Dst)); 2313 2314 if (IdxVal < VecTy.getNumElements()) { 2315 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2316 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2317 } else { 2318 B.buildUndef(Dst); 2319 } 2320 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 // TODO: Should move some of this into LegalizerHelper. 2329 2330 // TODO: Promote dynamic indexing of s16 to s32 2331 2332 // FIXME: Artifact combiner probably should have replaced the truncated 2333 // constant before this, so we shouldn't need 2334 // getIConstantVRegValWithLookThrough. 2335 Optional<ValueAndVReg> MaybeIdxVal = 2336 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2338 return true; 2339 2340 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Vec = MI.getOperand(1).getReg(); 2343 Register Ins = MI.getOperand(2).getReg(); 2344 2345 LLT VecTy = MRI.getType(Vec); 2346 LLT EltTy = VecTy.getElementType(); 2347 assert(EltTy == MRI.getType(Ins)); 2348 (void)Ins; 2349 2350 unsigned NumElts = VecTy.getNumElements(); 2351 if (IdxVal < NumElts) { 2352 SmallVector<Register, 8> SrcRegs; 2353 for (unsigned i = 0; i < NumElts; ++i) 2354 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2355 B.buildUnmerge(SrcRegs, Vec); 2356 2357 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2358 B.buildMerge(Dst, SrcRegs); 2359 } else { 2360 B.buildUndef(Dst); 2361 } 2362 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2368 MachineInstr &MI, MachineRegisterInfo &MRI, 2369 MachineIRBuilder &B) const { 2370 const LLT V2S16 = LLT::fixed_vector(2, 16); 2371 2372 Register Dst = MI.getOperand(0).getReg(); 2373 Register Src0 = MI.getOperand(1).getReg(); 2374 LLT DstTy = MRI.getType(Dst); 2375 LLT SrcTy = MRI.getType(Src0); 2376 2377 if (SrcTy == V2S16 && DstTy == V2S16 && 2378 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2379 return true; 2380 2381 MachineIRBuilder HelperBuilder(MI); 2382 GISelObserverWrapper DummyObserver; 2383 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2384 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizeSinCos( 2388 MachineInstr &MI, MachineRegisterInfo &MRI, 2389 MachineIRBuilder &B) const { 2390 2391 Register DstReg = MI.getOperand(0).getReg(); 2392 Register SrcReg = MI.getOperand(1).getReg(); 2393 LLT Ty = MRI.getType(DstReg); 2394 unsigned Flags = MI.getFlags(); 2395 2396 Register TrigVal; 2397 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2398 if (ST.hasTrigReducedRange()) { 2399 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2400 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2401 .addUse(MulVal.getReg(0)) 2402 .setMIFlags(Flags).getReg(0); 2403 } else 2404 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2405 2406 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2407 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2408 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2409 .addUse(TrigVal) 2410 .setMIFlags(Flags); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2416 MachineIRBuilder &B, 2417 const GlobalValue *GV, 2418 int64_t Offset, 2419 unsigned GAFlags) const { 2420 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2421 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2422 // to the following code sequence: 2423 // 2424 // For constant address space: 2425 // s_getpc_b64 s[0:1] 2426 // s_add_u32 s0, s0, $symbol 2427 // s_addc_u32 s1, s1, 0 2428 // 2429 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2430 // a fixup or relocation is emitted to replace $symbol with a literal 2431 // constant, which is a pc-relative offset from the encoding of the $symbol 2432 // operand to the global variable. 2433 // 2434 // For global address space: 2435 // s_getpc_b64 s[0:1] 2436 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2437 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2438 // 2439 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2440 // fixups or relocations are emitted to replace $symbol@*@lo and 2441 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2442 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2443 // operand to the global variable. 2444 // 2445 // What we want here is an offset from the value returned by s_getpc 2446 // (which is the address of the s_add_u32 instruction) to the global 2447 // variable, but since the encoding of $symbol starts 4 bytes after the start 2448 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2449 // small. This requires us to add 4 to the global variable offset in order to 2450 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2451 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2452 // instruction. 2453 2454 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2455 2456 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2457 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2458 2459 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2460 .addDef(PCReg); 2461 2462 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2463 if (GAFlags == SIInstrInfo::MO_NONE) 2464 MIB.addImm(0); 2465 else 2466 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2467 2468 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2469 2470 if (PtrTy.getSizeInBits() == 32) 2471 B.buildExtract(DstReg, PCReg, 0); 2472 return true; 2473 } 2474 2475 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2476 MachineInstr &MI, MachineRegisterInfo &MRI, 2477 MachineIRBuilder &B) const { 2478 Register DstReg = MI.getOperand(0).getReg(); 2479 LLT Ty = MRI.getType(DstReg); 2480 unsigned AS = Ty.getAddressSpace(); 2481 2482 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2483 MachineFunction &MF = B.getMF(); 2484 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2485 2486 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2487 if (!MFI->isModuleEntryFunction() && 2488 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2489 const Function &Fn = MF.getFunction(); 2490 DiagnosticInfoUnsupported BadLDSDecl( 2491 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2492 DS_Warning); 2493 Fn.getContext().diagnose(BadLDSDecl); 2494 2495 // We currently don't have a way to correctly allocate LDS objects that 2496 // aren't directly associated with a kernel. We do force inlining of 2497 // functions that use local objects. However, if these dead functions are 2498 // not eliminated, we don't want a compile time error. Just emit a warning 2499 // and a trap, since there should be no callable path here. 2500 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2501 B.buildUndef(DstReg); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 // TODO: We could emit code to handle the initialization somewhere. 2507 // We ignore the initializer for now and legalize it to allow selection. 2508 // The initializer will anyway get errored out during assembly emission. 2509 const SITargetLowering *TLI = ST.getTargetLowering(); 2510 if (!TLI->shouldUseLDSConstAddress(GV)) { 2511 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2512 return true; // Leave in place; 2513 } 2514 2515 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2516 Type *Ty = GV->getValueType(); 2517 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2518 // zero-sized type in other languages to declare the dynamic shared 2519 // memory which size is not known at the compile time. They will be 2520 // allocated by the runtime and placed directly after the static 2521 // allocated ones. They all share the same offset. 2522 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2523 // Adjust alignment for that dynamic shared memory array. 
2524 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2525 LLT S32 = LLT::scalar(32); 2526 auto Sz = 2527 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2528 B.buildIntToPtr(DstReg, Sz); 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 } 2533 2534 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2535 *cast<GlobalVariable>(GV))); 2536 MI.eraseFromParent(); 2537 return true; 2538 } 2539 2540 const SITargetLowering *TLI = ST.getTargetLowering(); 2541 2542 if (TLI->shouldEmitFixup(GV)) { 2543 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 if (TLI->shouldEmitPCReloc(GV)) { 2549 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2555 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2556 2557 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2558 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2559 MachinePointerInfo::getGOT(MF), 2560 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2561 MachineMemOperand::MOInvariant, 2562 LoadTy, Align(8)); 2563 2564 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2565 2566 if (Ty.getSizeInBits() == 32) { 2567 // Truncate if this is a 32-bit constant address. 2568 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2569 B.buildExtract(DstReg, Load, 0); 2570 } else 2571 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 2577 static LLT widenToNextPowerOf2(LLT Ty) { 2578 if (Ty.isVector()) 2579 return Ty.changeElementCount( 2580 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2581 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2582 } 2583 2584 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2585 MachineInstr &MI) const { 2586 MachineIRBuilder &B = Helper.MIRBuilder; 2587 MachineRegisterInfo &MRI = *B.getMRI(); 2588 GISelChangeObserver &Observer = Helper.Observer; 2589 2590 Register PtrReg = MI.getOperand(1).getReg(); 2591 LLT PtrTy = MRI.getType(PtrReg); 2592 unsigned AddrSpace = PtrTy.getAddressSpace(); 2593 2594 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2595 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2597 Observer.changingInstr(MI); 2598 MI.getOperand(1).setReg(Cast.getReg(0)); 2599 Observer.changedInstr(MI); 2600 return true; 2601 } 2602 2603 if (MI.getOpcode() != AMDGPU::G_LOAD) 2604 return false; 2605 2606 Register ValReg = MI.getOperand(0).getReg(); 2607 LLT ValTy = MRI.getType(ValReg); 2608 2609 MachineMemOperand *MMO = *MI.memoperands_begin(); 2610 const unsigned ValSize = ValTy.getSizeInBits(); 2611 const LLT MemTy = MMO->getMemoryType(); 2612 const Align MemAlign = MMO->getAlign(); 2613 const unsigned MemSize = MemTy.getSizeInBits(); 2614 const uint64_t AlignInBits = 8 * MemAlign.value(); 2615 2616 // Widen non-power-of-2 loads to the alignment if needed 2617 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2618 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2619 2620 // This was already the correct extending load result type, so just adjust 2621 // the memory type. 
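// For example, an extending s32 G_LOAD of a 24-bit memory type (given
// sufficient alignment) only needs its MMO widened to 32 bits here.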
2622 if (WideMemSize == ValSize) { 2623 MachineFunction &MF = B.getMF(); 2624 2625 MachineMemOperand *WideMMO = 2626 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2627 Observer.changingInstr(MI); 2628 MI.setMemRefs(MF, {WideMMO}); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 // Don't bother handling edge case that should probably never be produced. 2634 if (ValSize > WideMemSize) 2635 return false; 2636 2637 LLT WideTy = widenToNextPowerOf2(ValTy); 2638 2639 Register WideLoad; 2640 if (!WideTy.isVector()) { 2641 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2642 B.buildTrunc(ValReg, WideLoad).getReg(0); 2643 } else { 2644 // Extract the subvector. 2645 2646 if (isRegisterType(ValTy)) { 2647 // If this a case where G_EXTRACT is legal, use it. 2648 // (e.g. <3 x s32> -> <4 x s32>) 2649 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2650 B.buildExtract(ValReg, WideLoad, 0); 2651 } else { 2652 // For cases where the widened type isn't a nice register value, unmerge 2653 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2654 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2655 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2656 } 2657 } 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 return false; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFMad( 2667 MachineInstr &MI, MachineRegisterInfo &MRI, 2668 MachineIRBuilder &B) const { 2669 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2670 assert(Ty.isScalar()); 2671 2672 MachineFunction &MF = B.getMF(); 2673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2674 2675 // TODO: Always legal with future ftz flag. 2676 // FIXME: Do we need just output? 2677 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2678 return true; 2679 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2680 return true; 2681 2682 MachineIRBuilder HelperBuilder(MI); 2683 GISelObserverWrapper DummyObserver; 2684 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2685 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2686 } 2687 2688 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2690 Register DstReg = MI.getOperand(0).getReg(); 2691 Register PtrReg = MI.getOperand(1).getReg(); 2692 Register CmpVal = MI.getOperand(2).getReg(); 2693 Register NewVal = MI.getOperand(3).getReg(); 2694 2695 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2696 "this should not have been custom lowered"); 2697 2698 LLT ValTy = MRI.getType(CmpVal); 2699 LLT VecTy = LLT::fixed_vector(2, ValTy); 2700 2701 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2702 2703 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2704 .addDef(DstReg) 2705 .addUse(PtrReg) 2706 .addUse(PackedVal) 2707 .setMemRefs(MI.memoperands()); 2708 2709 MI.eraseFromParent(); 2710 return true; 2711 } 2712 2713 bool AMDGPULegalizerInfo::legalizeFlog( 2714 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2715 Register Dst = MI.getOperand(0).getReg(); 2716 Register Src = MI.getOperand(1).getReg(); 2717 LLT Ty = B.getMRI()->getType(Dst); 2718 unsigned Flags = MI.getFlags(); 2719 2720 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2721 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2722 2723 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2724 
MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2729 MachineIRBuilder &B) const { 2730 Register Dst = MI.getOperand(0).getReg(); 2731 Register Src = MI.getOperand(1).getReg(); 2732 unsigned Flags = MI.getFlags(); 2733 LLT Ty = B.getMRI()->getType(Dst); 2734 2735 auto K = B.buildFConstant(Ty, numbers::log2e); 2736 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2737 B.buildFExp2(Dst, Mul, Flags); 2738 MI.eraseFromParent(); 2739 return true; 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2743 MachineIRBuilder &B) const { 2744 Register Dst = MI.getOperand(0).getReg(); 2745 Register Src0 = MI.getOperand(1).getReg(); 2746 Register Src1 = MI.getOperand(2).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 LLT Ty = B.getMRI()->getType(Dst); 2749 const LLT S16 = LLT::scalar(16); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 if (Ty == S32) { 2753 auto Log = B.buildFLog2(S32, Src0, Flags); 2754 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2755 .addUse(Log.getReg(0)) 2756 .addUse(Src1) 2757 .setMIFlags(Flags); 2758 B.buildFExp2(Dst, Mul, Flags); 2759 } else if (Ty == S16) { 2760 // There's no f16 fmul_legacy, so we need to convert for it. 2761 auto Log = B.buildFLog2(S16, Src0, Flags); 2762 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2763 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2764 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2765 .addUse(Ext0.getReg(0)) 2766 .addUse(Ext1.getReg(0)) 2767 .setMIFlags(Flags); 2768 2769 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2770 } else 2771 return false; 2772 2773 MI.eraseFromParent(); 2774 return true; 2775 } 2776 2777 // Find a source register, ignoring any possible source modifiers. 2778 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2779 Register ModSrc = OrigSrc; 2780 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2781 ModSrc = SrcFNeg->getOperand(1).getReg(); 2782 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2783 ModSrc = SrcFAbs->getOperand(1).getReg(); 2784 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2785 ModSrc = SrcFAbs->getOperand(1).getReg(); 2786 return ModSrc; 2787 } 2788 2789 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2790 MachineRegisterInfo &MRI, 2791 MachineIRBuilder &B) const { 2792 2793 const LLT S1 = LLT::scalar(1); 2794 const LLT S64 = LLT::scalar(64); 2795 Register Dst = MI.getOperand(0).getReg(); 2796 Register OrigSrc = MI.getOperand(1).getReg(); 2797 unsigned Flags = MI.getFlags(); 2798 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2799 "this should not have been custom lowered"); 2800 2801 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2802 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2803 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2804 // V_FRACT bug is: 2805 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2806 // 2807 // Convert floor(x) to (x - fract(x)) 2808 2809 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2810 .addUse(OrigSrc) 2811 .setMIFlags(Flags); 2812 2813 // Give source modifier matching some assistance before obscuring a foldable 2814 // pattern. 2815 2816 // TODO: We can avoid the neg on the fract? The input sign to fract 2817 // shouldn't matter? 
2818 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2819 2820 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2821 2822 Register Min = MRI.createGenericVirtualRegister(S64); 2823 2824 // We don't need to concern ourselves with the snan handling difference, so 2825 // use the one which will directly select. 2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2827 if (MFI->getMode().IEEE) 2828 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2829 else 2830 B.buildFMinNum(Min, Fract, Const, Flags); 2831 2832 Register CorrectedFract = Min; 2833 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2834 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2835 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2836 } 2837 2838 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2839 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2840 2841 MI.eraseFromParent(); 2842 return true; 2843 } 2844 2845 // Turn an illegal packed v2s16 build vector into bit operations. 2846 // TODO: This should probably be a bitcast action in LegalizerHelper. 2847 bool AMDGPULegalizerInfo::legalizeBuildVector( 2848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2849 Register Dst = MI.getOperand(0).getReg(); 2850 const LLT S32 = LLT::scalar(32); 2851 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2852 2853 Register Src0 = MI.getOperand(1).getReg(); 2854 Register Src1 = MI.getOperand(2).getReg(); 2855 assert(MRI.getType(Src0) == LLT::scalar(16)); 2856 2857 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2858 B.buildBitcast(Dst, Merge); 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 } 2863 2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2866 // case with a single min instruction instead of a compare+select. 2867 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2868 MachineRegisterInfo &MRI, 2869 MachineIRBuilder &B) const { 2870 Register Dst = MI.getOperand(0).getReg(); 2871 Register Src = MI.getOperand(1).getReg(); 2872 LLT DstTy = MRI.getType(Dst); 2873 LLT SrcTy = MRI.getType(Src); 2874 2875 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2876 ? AMDGPU::G_AMDGPU_FFBH_U32 2877 : AMDGPU::G_AMDGPU_FFBL_B32; 2878 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2879 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2880 2881 MI.eraseFromParent(); 2882 return true; 2883 } 2884 2885 // Check that this is a G_XOR x, -1 2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2887 if (MI.getOpcode() != TargetOpcode::G_XOR) 2888 return false; 2889 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2890 return ConstVal && *ConstVal == -1; 2891 } 2892 2893 // Return the use branch instruction, otherwise null if the usage is invalid. 
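// The accepted pattern is: the intrinsic's boolean result has a single
// non-debug use, which must be a G_BRCOND in the same block, optionally
// reached through a single G_XOR with -1 (in which case Negated is set and
// the xor is erased). The conditional branch must either be followed by a
// G_BR (returned in Br, with its target in UncondBrTarget) or fall through,
// in which case UncondBrTarget is the next block.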
2894 static MachineInstr * 2895 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2896 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2897 Register CondDef = MI.getOperand(0).getReg(); 2898 if (!MRI.hasOneNonDBGUse(CondDef)) 2899 return nullptr; 2900 2901 MachineBasicBlock *Parent = MI.getParent(); 2902 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2903 2904 if (isNot(MRI, *UseMI)) { 2905 Register NegatedCond = UseMI->getOperand(0).getReg(); 2906 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2907 return nullptr; 2908 2909 // We're deleting the def of this value, so we need to remove it. 2910 eraseInstr(*UseMI, MRI); 2911 2912 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2913 Negated = true; 2914 } 2915 2916 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2917 return nullptr; 2918 2919 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2920 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2921 if (Next == Parent->end()) { 2922 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2923 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2924 return nullptr; 2925 UncondBrTarget = &*NextMBB; 2926 } else { 2927 if (Next->getOpcode() != AMDGPU::G_BR) 2928 return nullptr; 2929 Br = &*Next; 2930 UncondBrTarget = Br->getOperand(0).getMBB(); 2931 } 2932 2933 return UseMI; 2934 } 2935 2936 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2937 const ArgDescriptor *Arg, 2938 const TargetRegisterClass *ArgRC, 2939 LLT ArgTy) const { 2940 MCRegister SrcReg = Arg->getRegister(); 2941 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2942 assert(DstReg.isVirtual() && "Virtual register expected"); 2943 2944 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2945 *ArgRC, B.getDebugLoc(), ArgTy); 2946 if (Arg->isMasked()) { 2947 // TODO: Should we try to emit this once in the entry block? 2948 const LLT S32 = LLT::scalar(32); 2949 const unsigned Mask = Arg->getMask(); 2950 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2951 2952 Register AndMaskSrc = LiveIn; 2953 2954 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2955 // 0. 2956 if (Shift != 0) { 2957 auto ShiftAmt = B.buildConstant(S32, Shift); 2958 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2959 } 2960 2961 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2962 } else { 2963 B.buildCopy(DstReg, LiveIn); 2964 } 2965 2966 return true; 2967 } 2968 2969 bool AMDGPULegalizerInfo::loadInputValue( 2970 Register DstReg, MachineIRBuilder &B, 2971 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2972 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2973 const ArgDescriptor *Arg; 2974 const TargetRegisterClass *ArgRC; 2975 LLT ArgTy; 2976 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2977 2978 if (!Arg) { 2979 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2980 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2981 // case the pointer argument may be missing and we use null. 2982 B.buildConstant(DstReg, 0); 2983 return true; 2984 } 2985 2986 // It's undefined behavior if a function marked with the amdgpu-no-* 2987 // attributes uses the corresponding intrinsic. 
2988 B.buildUndef(DstReg); 2989 return true; 2990 } 2991 2992 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2993 return false; // TODO: Handle these 2994 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2995 } 2996 2997 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2999 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3000 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3001 return false; 3002 3003 MI.eraseFromParent(); 3004 return true; 3005 } 3006 3007 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3008 int64_t C) { 3009 B.buildConstant(MI.getOperand(0).getReg(), C); 3010 MI.eraseFromParent(); 3011 return true; 3012 } 3013 3014 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3015 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3016 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3017 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3018 if (MaxID == 0) 3019 return replaceWithConstant(B, MI, 0); 3020 3021 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3022 const ArgDescriptor *Arg; 3023 const TargetRegisterClass *ArgRC; 3024 LLT ArgTy; 3025 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3026 3027 Register DstReg = MI.getOperand(0).getReg(); 3028 if (!Arg) { 3029 // It's undefined behavior if a function marked with the amdgpu-no-* 3030 // attributes uses the corresponding intrinsic. 3031 B.buildUndef(DstReg); 3032 MI.eraseFromParent(); 3033 return true; 3034 } 3035 3036 if (Arg->isMasked()) { 3037 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3038 // masking operations anyway. 3039 // 3040 // TODO: We could assert the top bit is 0 for the source copy. 3041 if (!loadInputValue(DstReg, B, ArgType)) 3042 return false; 3043 } else { 3044 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3045 if (!loadInputValue(TmpReg, B, ArgType)) 3046 return false; 3047 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); 3048 } 3049 3050 MI.eraseFromParent(); 3051 return true; 3052 } 3053 3054 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3055 int64_t Offset) const { 3056 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3057 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3058 3059 // TODO: If we passed in the base kernel offset we could have a better 3060 // alignment than 4, but we don't really need it. 3061 if (!loadInputValue(KernArgReg, B, 3062 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3063 llvm_unreachable("failed to find kernarg segment ptr"); 3064 3065 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3066 // TODO: Should get nuw 3067 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3068 } 3069 3070 /// Legalize a value that's loaded from kernel arguments. This is only used by 3071 /// legacy intrinsics. 
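/// The value is materialized as an aligned, dereferenceable and invariant s32
/// load from the kernarg segment pointer plus the given offset.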
3072 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3073 MachineIRBuilder &B, 3074 uint64_t Offset, 3075 Align Alignment) const { 3076 Register DstReg = MI.getOperand(0).getReg(); 3077 3078 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3079 "unexpected kernarg parameter type"); 3080 3081 Register Ptr = getKernargParameterPtr(B, Offset); 3082 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3083 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3084 MachineMemOperand::MODereferenceable | 3085 MachineMemOperand::MOInvariant); 3086 MI.eraseFromParent(); 3087 return true; 3088 } 3089 3090 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3091 MachineRegisterInfo &MRI, 3092 MachineIRBuilder &B) const { 3093 Register Dst = MI.getOperand(0).getReg(); 3094 LLT DstTy = MRI.getType(Dst); 3095 LLT S16 = LLT::scalar(16); 3096 LLT S32 = LLT::scalar(32); 3097 LLT S64 = LLT::scalar(64); 3098 3099 if (DstTy == S16) 3100 return legalizeFDIV16(MI, MRI, B); 3101 if (DstTy == S32) 3102 return legalizeFDIV32(MI, MRI, B); 3103 if (DstTy == S64) 3104 return legalizeFDIV64(MI, MRI, B); 3105 3106 return false; 3107 } 3108 3109 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3110 Register DstDivReg, 3111 Register DstRemReg, 3112 Register X, 3113 Register Y) const { 3114 const LLT S1 = LLT::scalar(1); 3115 const LLT S32 = LLT::scalar(32); 3116 3117 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3118 // algorithm used here. 3119 3120 // Initial estimate of inv(y). 3121 auto FloatY = B.buildUITOFP(S32, Y); 3122 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3123 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3124 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3125 auto Z = B.buildFPTOUI(S32, ScaledY); 3126 3127 // One round of UNR. 3128 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3129 auto NegYZ = B.buildMul(S32, NegY, Z); 3130 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3131 3132 // Quotient/remainder estimate. 3133 auto Q = B.buildUMulH(S32, X, Z); 3134 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3135 3136 // First quotient/remainder refinement. 3137 auto One = B.buildConstant(S32, 1); 3138 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3139 if (DstDivReg) 3140 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3141 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3142 3143 // Second quotient/remainder refinement. 
3144 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3145 if (DstDivReg) 3146 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3147 3148 if (DstRemReg) 3149 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3150 } 3151 3152 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3153 // 3154 // Return lo, hi of result 3155 // 3156 // %cvt.lo = G_UITOFP Val.lo 3157 // %cvt.hi = G_UITOFP Val.hi 3158 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3159 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3160 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3161 // %mul2 = G_FMUL %mul1, 2**(-32) 3162 // %trunc = G_INTRINSIC_TRUNC %mul2 3163 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3164 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3165 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3166 Register Val) { 3167 const LLT S32 = LLT::scalar(32); 3168 auto Unmerge = B.buildUnmerge(S32, Val); 3169 3170 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3171 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3172 3173 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3174 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3175 3176 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3177 auto Mul1 = 3178 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3179 3180 // 2**(-32) 3181 auto Mul2 = 3182 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3183 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3184 3185 // -(2**32) 3186 auto Mad2 = B.buildFMAD(S32, Trunc, 3187 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3188 3189 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3190 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3191 3192 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3193 } 3194 3195 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3196 Register DstDivReg, 3197 Register DstRemReg, 3198 Register Numer, 3199 Register Denom) const { 3200 const LLT S32 = LLT::scalar(32); 3201 const LLT S64 = LLT::scalar(64); 3202 const LLT S1 = LLT::scalar(1); 3203 Register RcpLo, RcpHi; 3204 3205 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3206 3207 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3208 3209 auto Zero64 = B.buildConstant(S64, 0); 3210 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3211 3212 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3213 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3214 3215 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3216 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3217 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3218 3219 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3220 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3221 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3222 3223 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3224 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3225 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3226 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3227 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3228 3229 auto Zero32 = B.buildConstant(S32, 0); 3230 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3231 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3232 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3233 3234 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3235 Register NumerLo = UnmergeNumer.getReg(0); 3236 Register NumerHi = UnmergeNumer.getReg(1); 3237 3238 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3239 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3240 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3241 Register Mul3_Lo = UnmergeMul3.getReg(0); 3242 Register Mul3_Hi = UnmergeMul3.getReg(1); 3243 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3244 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3245 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3246 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3247 3248 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3249 Register DenomLo = UnmergeDenom.getReg(0); 3250 Register DenomHi = UnmergeDenom.getReg(1); 3251 3252 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3253 auto C1 = B.buildSExt(S32, CmpHi); 3254 3255 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3256 auto C2 = B.buildSExt(S32, CmpLo); 3257 3258 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3259 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3260 3261 // TODO: Here and below portions of the code can be enclosed into if/endif. 3262 // Currently control flow is unconditional and we have 4 selects after 3263 // potential endif to substitute PHIs. 3264 3265 // if C3 != 0 ... 3266 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3267 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3268 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3269 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3270 3271 auto One64 = B.buildConstant(S64, 1); 3272 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3273 3274 auto C4 = 3275 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3276 auto C5 = 3277 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3278 auto C6 = B.buildSelect( 3279 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3280 3281 // if (C6 != 0) 3282 auto Add4 = B.buildAdd(S64, Add3, One64); 3283 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3284 3285 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3286 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3287 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3288 3289 // endif C6 3290 // endif C3 3291 3292 if (DstDivReg) { 3293 auto Sel1 = B.buildSelect( 3294 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3295 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3296 Sel1, MulHi3); 3297 } 3298 3299 if (DstRemReg) { 3300 auto Sel2 = B.buildSelect( 3301 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3302 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3303 Sel2, Sub1); 3304 } 3305 } 3306 3307 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3308 MachineRegisterInfo &MRI, 3309 MachineIRBuilder &B) const { 3310 Register DstDivReg, DstRemReg; 3311 switch (MI.getOpcode()) { 3312 default: 3313 llvm_unreachable("Unexpected opcode!"); 3314 case AMDGPU::G_UDIV: { 3315 DstDivReg = MI.getOperand(0).getReg(); 3316 break; 3317 } 3318 case AMDGPU::G_UREM: { 3319 DstRemReg = MI.getOperand(0).getReg(); 3320 break; 3321 } 3322 case AMDGPU::G_UDIVREM: { 3323 DstDivReg = MI.getOperand(0).getReg(); 3324 DstRemReg = MI.getOperand(1).getReg(); 3325 break; 3326 } 3327 } 3328 3329 const LLT S64 = LLT::scalar(64); 3330 const LLT S32 = LLT::scalar(32); 3331 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3332 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3333 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3334 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3335 3336 if (Ty == S32) 3337 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3338 else if (Ty == S64) 3339 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3340 else 3341 return false; 3342 3343 MI.eraseFromParent(); 3344 return true; 3345 } 3346 3347 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3348 MachineRegisterInfo &MRI, 3349 MachineIRBuilder &B) const { 3350 const LLT S64 = LLT::scalar(64); 3351 const LLT S32 = LLT::scalar(32); 3352 3353 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3354 if (Ty != S32 && Ty != S64) 3355 return false; 3356 3357 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3358 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3359 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3360 3361 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3362 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3363 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3364 3365 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3366 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3367 3368 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3369 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3370 3371 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3372 switch (MI.getOpcode()) { 3373 default: 3374 llvm_unreachable("Unexpected opcode!"); 3375 case AMDGPU::G_SDIV: { 3376 DstDivReg = MI.getOperand(0).getReg(); 3377 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3378 break; 3379 } 3380 case AMDGPU::G_SREM: { 3381 DstRemReg = MI.getOperand(0).getReg(); 3382 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3383 break; 3384 } 3385 case AMDGPU::G_SDIVREM: { 3386 DstDivReg = MI.getOperand(0).getReg(); 3387 DstRemReg = MI.getOperand(1).getReg(); 3388 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3389 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3390 break; 3391 } 3392 } 3393 3394 if (Ty == S32) 3395 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3396 else 3397 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3398 3399 if (DstDivReg) { 3400 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3401 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3402 B.buildSub(DstDivReg, SignXor, Sign); 3403 } 3404 3405 if (DstRemReg) { 3406 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3407 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3408 B.buildSub(DstRemReg, SignXor, Sign); 3409 } 3410 3411 MI.eraseFromParent(); 3412 return true; 3413 } 3414 3415 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3416 MachineRegisterInfo &MRI, 3417 MachineIRBuilder &B) const { 3418 Register Res = MI.getOperand(0).getReg(); 3419 Register LHS = MI.getOperand(1).getReg(); 3420 Register RHS = MI.getOperand(2).getReg(); 3421 uint16_t Flags = MI.getFlags(); 3422 LLT ResTy = MRI.getType(Res); 3423 3424 const MachineFunction &MF = B.getMF(); 3425 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3426 MI.getFlag(MachineInstr::FmAfn); 3427 3428 if (!AllowInaccurateRcp) 3429 return false; 3430 3431 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3432 // 1 / x -> RCP(x) 3433 if (CLHS->isExactlyValue(1.0)) { 3434 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3435 .addUse(RHS) 3436 .setMIFlags(Flags); 3437 3438 MI.eraseFromParent(); 3439 return true; 3440 } 3441 3442 // -1 / x -> RCP( FNEG(x) ) 3443 if (CLHS->isExactlyValue(-1.0)) { 3444 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3445 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3446 .addUse(FNeg.getReg(0)) 3447 .setMIFlags(Flags); 3448 3449 MI.eraseFromParent(); 3450 return true; 3451 } 3452 } 3453 3454 // x / y -> x * (1.0 / y) 3455 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3456 .addUse(RHS) 3457 .setMIFlags(Flags); 3458 B.buildFMul(Res, LHS, RCP, Flags); 3459 3460 MI.eraseFromParent(); 3461 return true; 3462 } 3463 3464 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3465 MachineRegisterInfo &MRI, 3466 MachineIRBuilder &B) const { 3467 Register Res = MI.getOperand(0).getReg(); 3468 Register X = MI.getOperand(1).getReg(); 3469 Register Y = MI.getOperand(2).getReg(); 3470 uint16_t Flags = MI.getFlags(); 3471 LLT ResTy = MRI.getType(Res); 3472 3473 const MachineFunction &MF = B.getMF(); 3474 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3475 MI.getFlag(MachineInstr::FmAfn); 3476 3477 if (!AllowInaccurateRcp) 3478 return false; 3479 3480 auto NegY = B.buildFNeg(ResTy, Y); 3481 auto One = B.buildFConstant(ResTy, 1.0); 3482 3483 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3484 .addUse(Y) 3485 .setMIFlags(Flags); 3486 3487 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3488 R = B.buildFMA(ResTy, Tmp0, R, R); 3489 3490 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3491 R = B.buildFMA(ResTy, Tmp1, R, R); 3492 3493 auto Ret = B.buildFMul(ResTy, X, R); 3494 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3495 3496 B.buildFMA(Res, Tmp2, R, Ret); 3497 MI.eraseFromParent(); 3498 return true; 3499 } 3500 3501 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3502 MachineRegisterInfo &MRI, 3503 MachineIRBuilder &B) const { 3504 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3505 return true; 3506 3507 Register Res = MI.getOperand(0).getReg(); 3508 Register LHS = MI.getOperand(1).getReg(); 3509 Register RHS = MI.getOperand(2).getReg(); 3510 3511 uint16_t Flags = MI.getFlags(); 3512 3513 LLT S16 = LLT::scalar(16); 3514 LLT S32 = LLT::scalar(32); 3515 3516 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3517 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3518 3519 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3520 .addUse(RHSExt.getReg(0)) 3521 .setMIFlags(Flags); 3522 3523 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3524 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3525 3526 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3527 .addUse(RDst.getReg(0)) 3528 .addUse(RHS) 3529 .addUse(LHS) 3530 .setMIFlags(Flags); 3531 3532 MI.eraseFromParent(); 3533 return true; 3534 } 3535 3536 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3537 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3538 static void toggleSPDenormMode(bool Enable, 3539 MachineIRBuilder &B, 3540 const GCNSubtarget &ST, 3541 AMDGPU::SIModeRegisterDefaults Mode) { 3542 // Set SP denorm mode to this value. 3543 unsigned SPDenormMode = 3544 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3545 3546 if (ST.hasDenormModeInst()) { 3547 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3548 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3549 3550 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3551 B.buildInstr(AMDGPU::S_DENORM_MODE) 3552 .addImm(NewDenormModeValue); 3553 3554 } else { 3555 // Select FP32 bit field in mode register. 
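    // Sketch of the assumed MODE register layout (not re-verified here for
    // every generation): bits [3:0] hold FP_ROUND and bits [7:4] hold
    // FP_DENORM, with the single-precision denorm controls in bits [5:4].
    // That is why the S_SETREG field below is hwreg(HW_REG_MODE, offset = 4,
    // width = 2) (encoded as WIDTH_M1 = 1), and why the S_DENORM_MODE path
    // above shifts the FP64/FP16 default left by 2 to sit above the FP32 bits.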
3556 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3557 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3558 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3559 3560 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3561 .addImm(SPDenormMode) 3562 .addImm(SPDenormModeBitField); 3563 } 3564 } 3565 3566 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3567 MachineRegisterInfo &MRI, 3568 MachineIRBuilder &B) const { 3569 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3570 return true; 3571 3572 Register Res = MI.getOperand(0).getReg(); 3573 Register LHS = MI.getOperand(1).getReg(); 3574 Register RHS = MI.getOperand(2).getReg(); 3575 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3576 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3577 3578 uint16_t Flags = MI.getFlags(); 3579 3580 LLT S32 = LLT::scalar(32); 3581 LLT S1 = LLT::scalar(1); 3582 3583 auto One = B.buildFConstant(S32, 1.0f); 3584 3585 auto DenominatorScaled = 3586 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3587 .addUse(LHS) 3588 .addUse(RHS) 3589 .addImm(0) 3590 .setMIFlags(Flags); 3591 auto NumeratorScaled = 3592 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3593 .addUse(LHS) 3594 .addUse(RHS) 3595 .addImm(1) 3596 .setMIFlags(Flags); 3597 3598 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3599 .addUse(DenominatorScaled.getReg(0)) 3600 .setMIFlags(Flags); 3601 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3602 3603 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3604 // aren't modeled as reading it. 3605 if (!Mode.allFP32Denormals()) 3606 toggleSPDenormMode(true, B, ST, Mode); 3607 3608 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3609 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3610 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3611 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3612 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3613 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3614 3615 if (!Mode.allFP32Denormals()) 3616 toggleSPDenormMode(false, B, ST, Mode); 3617 3618 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3619 .addUse(Fma4.getReg(0)) 3620 .addUse(Fma1.getReg(0)) 3621 .addUse(Fma3.getReg(0)) 3622 .addUse(NumeratorScaled.getReg(1)) 3623 .setMIFlags(Flags); 3624 3625 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3626 .addUse(Fmas.getReg(0)) 3627 .addUse(RHS) 3628 .addUse(LHS) 3629 .setMIFlags(Flags); 3630 3631 MI.eraseFromParent(); 3632 return true; 3633 } 3634 3635 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3636 MachineRegisterInfo &MRI, 3637 MachineIRBuilder &B) const { 3638 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3639 return true; 3640 3641 Register Res = MI.getOperand(0).getReg(); 3642 Register LHS = MI.getOperand(1).getReg(); 3643 Register RHS = MI.getOperand(2).getReg(); 3644 3645 uint16_t Flags = MI.getFlags(); 3646 3647 LLT S64 = LLT::scalar(64); 3648 LLT S1 = LLT::scalar(1); 3649 3650 auto One = B.buildFConstant(S64, 1.0); 3651 3652 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3653 .addUse(LHS) 3654 .addUse(RHS) 3655 .addImm(0) 3656 .setMIFlags(Flags); 3657 3658 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3659 3660 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3661 .addUse(DivScale0.getReg(0)) 3662 .setMIFlags(Flags); 3663 3664 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3665 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3666 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3667 3668 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3669 .addUse(LHS) 3670 .addUse(RHS) 3671 .addImm(1) 3672 .setMIFlags(Flags); 3673 3674 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3675 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3676 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3677 3678 Register Scale; 3679 if (!ST.hasUsableDivScaleConditionOutput()) { 3680 // Workaround a hardware bug on SI where the condition output from div_scale 3681 // is not usable. 3682 3683 LLT S32 = LLT::scalar(32); 3684 3685 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3686 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3687 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3688 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3689 3690 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3691 Scale1Unmerge.getReg(1)); 3692 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3693 Scale0Unmerge.getReg(1)); 3694 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3695 } else { 3696 Scale = DivScale1.getReg(1); 3697 } 3698 3699 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3700 .addUse(Fma4.getReg(0)) 3701 .addUse(Fma3.getReg(0)) 3702 .addUse(Mul.getReg(0)) 3703 .addUse(Scale) 3704 .setMIFlags(Flags); 3705 3706 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3707 .addUse(Fmas.getReg(0)) 3708 .addUse(RHS) 3709 .addUse(LHS) 3710 .setMIFlags(Flags); 3711 3712 MI.eraseFromParent(); 3713 return true; 3714 } 3715 3716 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3717 MachineRegisterInfo &MRI, 3718 MachineIRBuilder &B) const { 3719 Register Res = MI.getOperand(0).getReg(); 3720 Register LHS = MI.getOperand(2).getReg(); 3721 Register RHS = MI.getOperand(3).getReg(); 3722 uint16_t Flags = MI.getFlags(); 3723 3724 LLT S32 = LLT::scalar(32); 3725 LLT S1 = LLT::scalar(1); 3726 3727 auto Abs = B.buildFAbs(S32, RHS, Flags); 3728 const APFloat C0Val(1.0f); 3729 3730 auto C0 = B.buildConstant(S32, 0x6f800000); 3731 auto C1 = B.buildConstant(S32, 0x2f800000); 3732 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3733 3734 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3735 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3736 3737 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3738 3739 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3740 .addUse(Mul0.getReg(0)) 3741 .setMIFlags(Flags); 3742 3743 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3744 3745 B.buildFMul(Res, Sel, Mul1, Flags); 3746 3747 MI.eraseFromParent(); 3748 return true; 3749 } 3750 3751 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3752 // FIXME: Why do we handle this one but not other removed instructions? 3753 // 3754 // Reciprocal square root. The clamp prevents infinite results, clamping 3755 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3756 // +-max_float. 
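//
// Illustrative expansion (a sketch of what the code below builds, assuming an
// f32 or f64 operand):
//
//   %rsq   = llvm.amdgcn.rsq(%src)
//   %upper = fminnum(%rsq, +max_float)    ; fminnum_ieee when MODE.IEEE is set
//   %dst   = fmaxnum(%upper, -max_float)  ; fmaxnum_ieee when MODE.IEEE is set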
3757 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3758 MachineRegisterInfo &MRI, 3759 MachineIRBuilder &B) const { 3760 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3761 return true; 3762 3763 Register Dst = MI.getOperand(0).getReg(); 3764 Register Src = MI.getOperand(2).getReg(); 3765 auto Flags = MI.getFlags(); 3766 3767 LLT Ty = MRI.getType(Dst); 3768 3769 const fltSemantics *FltSemantics; 3770 if (Ty == LLT::scalar(32)) 3771 FltSemantics = &APFloat::IEEEsingle(); 3772 else if (Ty == LLT::scalar(64)) 3773 FltSemantics = &APFloat::IEEEdouble(); 3774 else 3775 return false; 3776 3777 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3778 .addUse(Src) 3779 .setMIFlags(Flags); 3780 3781 // We don't need to concern ourselves with the snan handling difference, since 3782 // the rsq quieted (or not) so use the one which will directly select. 3783 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3784 const bool UseIEEE = MFI->getMode().IEEE; 3785 3786 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3787 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3788 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3789 3790 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3791 3792 if (UseIEEE) 3793 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3794 else 3795 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3796 MI.eraseFromParent(); 3797 return true; 3798 } 3799 3800 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3801 switch (IID) { 3802 case Intrinsic::amdgcn_ds_fadd: 3803 return AMDGPU::G_ATOMICRMW_FADD; 3804 case Intrinsic::amdgcn_ds_fmin: 3805 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3806 case Intrinsic::amdgcn_ds_fmax: 3807 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3808 default: 3809 llvm_unreachable("not a DS FP intrinsic"); 3810 } 3811 } 3812 3813 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3814 MachineInstr &MI, 3815 Intrinsic::ID IID) const { 3816 GISelChangeObserver &Observer = Helper.Observer; 3817 Observer.changingInstr(MI); 3818 3819 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3820 3821 // The remaining operands were used to set fields in the MemOperand on 3822 // construction. 3823 for (int I = 6; I > 3; --I) 3824 MI.removeOperand(I); 3825 3826 MI.removeOperand(1); // Remove the intrinsic ID. 
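  // After the removals above the instruction is left with (dst, ptr, value),
  // which is the operand shape the G_ATOMICRMW_FADD and G_AMDGPU_ATOMIC_FMIN/
  // FMAX pseudos expect; the dropped immediates only influenced the MMO.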
3827 Observer.changedInstr(MI); 3828 return true; 3829 } 3830 3831 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3832 MachineRegisterInfo &MRI, 3833 MachineIRBuilder &B) const { 3834 uint64_t Offset = 3835 ST.getTargetLowering()->getImplicitParameterOffset( 3836 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3837 LLT DstTy = MRI.getType(DstReg); 3838 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3839 3840 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3841 if (!loadInputValue(KernargPtrReg, B, 3842 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3843 return false; 3844 3845 // FIXME: This should be nuw 3846 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3847 return true; 3848 } 3849 3850 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3851 MachineRegisterInfo &MRI, 3852 MachineIRBuilder &B) const { 3853 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3854 if (!MFI->isEntryFunction()) { 3855 return legalizePreloadedArgIntrin(MI, MRI, B, 3856 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3857 } 3858 3859 Register DstReg = MI.getOperand(0).getReg(); 3860 if (!getImplicitArgPtr(DstReg, MRI, B)) 3861 return false; 3862 3863 MI.eraseFromParent(); 3864 return true; 3865 } 3866 3867 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3868 MachineRegisterInfo &MRI, 3869 MachineIRBuilder &B, 3870 unsigned AddrSpace) const { 3871 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3872 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3873 Register Hi32 = Unmerge.getReg(1); 3874 3875 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3876 MI.eraseFromParent(); 3877 return true; 3878 } 3879 3880 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3881 // offset (the offset that is included in bounds checking and swizzling, to be 3882 // split between the instruction's voffset and immoffset fields) and soffset 3883 // (the offset that is excluded from bounds checking and swizzling, to go in 3884 // the instruction's soffset field). This function takes the first kind of 3885 // offset and figures out how to split it between voffset and immoffset. 3886 std::pair<Register, unsigned> 3887 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3888 Register OrigOffset) const { 3889 const unsigned MaxImm = 4095; 3890 Register BaseReg; 3891 unsigned ImmOffset; 3892 const LLT S32 = LLT::scalar(32); 3893 MachineRegisterInfo &MRI = *B.getMRI(); 3894 3895 std::tie(BaseReg, ImmOffset) = 3896 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3897 3898 // If BaseReg is a pointer, convert it to int. 3899 if (MRI.getType(BaseReg).isPointer()) 3900 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3901 3902 // If the immediate value is too big for the immoffset field, put the value 3903 // and -4096 into the immoffset field so that the value that is copied/added 3904 // for the voffset field is a multiple of 4096, and it stands more chance 3905 // of being CSEd with the copy/add for another similar load/store. 3906 // However, do not do that rounding down to a multiple of 4096 if that is a 3907 // negative number, as it appears to be illegal to have a negative offset 3908 // in the vgpr, even if adding the immediate offset makes it positive. 
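  // Worked example (illustrative): a constant OrigOffset of 8196 gives
  // Overflow = 8196 & ~4095 = 8192 and ImmOffset = 4, so the copy/add feeding
  // the voffset operand uses the 4096-aligned value 8192 (more likely to CSE
  // with neighbouring accesses) and the remaining 4 lands in the immoffset
  // field.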
3909 unsigned Overflow = ImmOffset & ~MaxImm; 3910 ImmOffset -= Overflow; 3911 if ((int32_t)Overflow < 0) { 3912 Overflow += ImmOffset; 3913 ImmOffset = 0; 3914 } 3915 3916 if (Overflow != 0) { 3917 if (!BaseReg) { 3918 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3919 } else { 3920 auto OverflowVal = B.buildConstant(S32, Overflow); 3921 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3922 } 3923 } 3924 3925 if (!BaseReg) 3926 BaseReg = B.buildConstant(S32, 0).getReg(0); 3927 3928 return std::make_pair(BaseReg, ImmOffset); 3929 } 3930 3931 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3932 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3933 Register VOffset, Register SOffset, 3934 unsigned ImmOffset, Register VIndex, 3935 MachineRegisterInfo &MRI) const { 3936 Optional<ValueAndVReg> MaybeVOffsetVal = 3937 getIConstantVRegValWithLookThrough(VOffset, MRI); 3938 Optional<ValueAndVReg> MaybeSOffsetVal = 3939 getIConstantVRegValWithLookThrough(SOffset, MRI); 3940 Optional<ValueAndVReg> MaybeVIndexVal = 3941 getIConstantVRegValWithLookThrough(VIndex, MRI); 3942 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3943 // update the MMO with that offset. The stride is unknown so we can only do 3944 // this if VIndex is constant 0. 3945 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3946 MaybeVIndexVal->Value == 0) { 3947 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3948 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3949 MMO->setOffset(TotalOffset); 3950 } else { 3951 // We don't have a constant combined offset to use in the MMO. Give up. 3952 MMO->setValue((Value *)nullptr); 3953 } 3954 } 3955 3956 /// Handle register layout difference for f16 images for some subtargets. 
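///
/// Rough summary of the layouts handled below (a sketch, not exhaustive):
///  - Unpacked-d16 subtargets keep each s16 element in the low half of its own
///    32-bit register, so <N x s16> data is any-extended to <N x s32>.
///  - Subtargets with the image-store-d16 bug pad the packed store data out
///    with trailing undef dwords before the store.
///  - Otherwise a packed <3 x s16> is widened to <4 x s16>; everything else is
///    returned unchanged.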
3957 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3958 MachineRegisterInfo &MRI, 3959 Register Reg, 3960 bool ImageStore) const { 3961 const LLT S16 = LLT::scalar(16); 3962 const LLT S32 = LLT::scalar(32); 3963 LLT StoreVT = MRI.getType(Reg); 3964 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3965 3966 if (ST.hasUnpackedD16VMem()) { 3967 auto Unmerge = B.buildUnmerge(S16, Reg); 3968 3969 SmallVector<Register, 4> WideRegs; 3970 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3971 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3972 3973 int NumElts = StoreVT.getNumElements(); 3974 3975 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3976 .getReg(0); 3977 } 3978 3979 if (ImageStore && ST.hasImageStoreD16Bug()) { 3980 if (StoreVT.getNumElements() == 2) { 3981 SmallVector<Register, 4> PackedRegs; 3982 Reg = B.buildBitcast(S32, Reg).getReg(0); 3983 PackedRegs.push_back(Reg); 3984 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3985 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3986 .getReg(0); 3987 } 3988 3989 if (StoreVT.getNumElements() == 3) { 3990 SmallVector<Register, 4> PackedRegs; 3991 auto Unmerge = B.buildUnmerge(S16, Reg); 3992 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3993 PackedRegs.push_back(Unmerge.getReg(I)); 3994 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3995 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3996 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3997 } 3998 3999 if (StoreVT.getNumElements() == 4) { 4000 SmallVector<Register, 4> PackedRegs; 4001 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4002 auto Unmerge = B.buildUnmerge(S32, Reg); 4003 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4004 PackedRegs.push_back(Unmerge.getReg(I)); 4005 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4006 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4007 .getReg(0); 4008 } 4009 4010 llvm_unreachable("invalid data type"); 4011 } 4012 4013 if (StoreVT == LLT::fixed_vector(3, S16)) { 4014 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4015 .getReg(0); 4016 } 4017 return Reg; 4018 } 4019 4020 Register AMDGPULegalizerInfo::fixStoreSourceType( 4021 MachineIRBuilder &B, Register VData, bool IsFormat) const { 4022 MachineRegisterInfo *MRI = B.getMRI(); 4023 LLT Ty = MRI->getType(VData); 4024 4025 const LLT S16 = LLT::scalar(16); 4026 4027 // Fixup illegal register types for i8 stores. 
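  // The data for byte and short buffer stores is still carried in a 32-bit
  // VGPR, so any-extend s8/s16 sources to s32 here; the store width itself
  // comes from the memory size recorded on the instruction, not from this
  // register type.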
4028 if (Ty == LLT::scalar(8) || Ty == S16) { 4029 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4030 return AnyExt; 4031 } 4032 4033 if (Ty.isVector()) { 4034 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4035 if (IsFormat) 4036 return handleD16VData(B, *MRI, VData); 4037 } 4038 } 4039 4040 return VData; 4041 } 4042 4043 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4044 MachineRegisterInfo &MRI, 4045 MachineIRBuilder &B, 4046 bool IsTyped, 4047 bool IsFormat) const { 4048 Register VData = MI.getOperand(1).getReg(); 4049 LLT Ty = MRI.getType(VData); 4050 LLT EltTy = Ty.getScalarType(); 4051 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4052 const LLT S32 = LLT::scalar(32); 4053 4054 VData = fixStoreSourceType(B, VData, IsFormat); 4055 Register RSrc = MI.getOperand(2).getReg(); 4056 4057 MachineMemOperand *MMO = *MI.memoperands_begin(); 4058 const int MemSize = MMO->getSize(); 4059 4060 unsigned ImmOffset; 4061 4062 // The typed intrinsics add an immediate after the registers. 4063 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4064 4065 // The struct intrinsic variants add one additional operand over raw. 4066 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4067 Register VIndex; 4068 int OpOffset = 0; 4069 if (HasVIndex) { 4070 VIndex = MI.getOperand(3).getReg(); 4071 OpOffset = 1; 4072 } else { 4073 VIndex = B.buildConstant(S32, 0).getReg(0); 4074 } 4075 4076 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4077 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4078 4079 unsigned Format = 0; 4080 if (IsTyped) { 4081 Format = MI.getOperand(5 + OpOffset).getImm(); 4082 ++OpOffset; 4083 } 4084 4085 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4086 4087 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4088 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4089 4090 unsigned Opc; 4091 if (IsTyped) { 4092 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4093 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4094 } else if (IsFormat) { 4095 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4096 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4097 } else { 4098 switch (MemSize) { 4099 case 1: 4100 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4101 break; 4102 case 2: 4103 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4104 break; 4105 default: 4106 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4107 break; 4108 } 4109 } 4110 4111 auto MIB = B.buildInstr(Opc) 4112 .addUse(VData) // vdata 4113 .addUse(RSrc) // rsrc 4114 .addUse(VIndex) // vindex 4115 .addUse(VOffset) // voffset 4116 .addUse(SOffset) // soffset 4117 .addImm(ImmOffset); // offset(imm) 4118 4119 if (IsTyped) 4120 MIB.addImm(Format); 4121 4122 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4123 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4124 .addMemOperand(MMO); 4125 4126 MI.eraseFromParent(); 4127 return true; 4128 } 4129 4130 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4131 MachineRegisterInfo &MRI, 4132 MachineIRBuilder &B, 4133 bool IsFormat, 4134 bool IsTyped) const { 4135 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4136 MachineMemOperand *MMO = *MI.memoperands_begin(); 4137 const LLT MemTy = MMO->getMemoryType(); 4138 const LLT S32 = LLT::scalar(32); 4139 4140 Register Dst = MI.getOperand(0).getReg(); 4141 Register RSrc = MI.getOperand(2).getReg(); 4142 4143 // The typed intrinsics add an immediate after the registers. 
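  // Operand order for these loads: dst, intrinsic id, rsrc, [vindex,] voffset,
  // soffset, [format,] aux. With both the struct variant's vindex and the
  // typed variant's format immediate present that is 8 operands, hence the
  // 8-vs-7 count used to detect vindex below.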
4144 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4145 4146 // The struct intrinsic variants add one additional operand over raw. 4147 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4148 Register VIndex; 4149 int OpOffset = 0; 4150 if (HasVIndex) { 4151 VIndex = MI.getOperand(3).getReg(); 4152 OpOffset = 1; 4153 } else { 4154 VIndex = B.buildConstant(S32, 0).getReg(0); 4155 } 4156 4157 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4158 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4159 4160 unsigned Format = 0; 4161 if (IsTyped) { 4162 Format = MI.getOperand(5 + OpOffset).getImm(); 4163 ++OpOffset; 4164 } 4165 4166 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4167 unsigned ImmOffset; 4168 4169 LLT Ty = MRI.getType(Dst); 4170 LLT EltTy = Ty.getScalarType(); 4171 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4172 const bool Unpacked = ST.hasUnpackedD16VMem(); 4173 4174 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4175 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4176 4177 unsigned Opc; 4178 4179 if (IsTyped) { 4180 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4181 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4182 } else if (IsFormat) { 4183 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4184 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4185 } else { 4186 switch (MemTy.getSizeInBits()) { 4187 case 8: 4188 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4189 break; 4190 case 16: 4191 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4192 break; 4193 default: 4194 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4195 break; 4196 } 4197 } 4198 4199 Register LoadDstReg; 4200 4201 bool IsExtLoad = 4202 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4203 LLT UnpackedTy = Ty.changeElementSize(32); 4204 4205 if (IsExtLoad) 4206 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4207 else if (Unpacked && IsD16 && Ty.isVector()) 4208 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4209 else 4210 LoadDstReg = Dst; 4211 4212 auto MIB = B.buildInstr(Opc) 4213 .addDef(LoadDstReg) // vdata 4214 .addUse(RSrc) // rsrc 4215 .addUse(VIndex) // vindex 4216 .addUse(VOffset) // voffset 4217 .addUse(SOffset) // soffset 4218 .addImm(ImmOffset); // offset(imm) 4219 4220 if (IsTyped) 4221 MIB.addImm(Format); 4222 4223 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4224 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4225 .addMemOperand(MMO); 4226 4227 if (LoadDstReg != Dst) { 4228 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4229 4230 // Widen result for extending loads was widened. 4231 if (IsExtLoad) 4232 B.buildTrunc(Dst, LoadDstReg); 4233 else { 4234 // Repack to original 16-bit vector result 4235 // FIXME: G_TRUNC should work, but legalization currently fails 4236 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4237 SmallVector<Register, 4> Repack; 4238 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4239 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4240 B.buildMerge(Dst, Repack); 4241 } 4242 } 4243 4244 MI.eraseFromParent(); 4245 return true; 4246 } 4247 4248 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4249 MachineIRBuilder &B, 4250 bool IsInc) const { 4251 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4252 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4253 B.buildInstr(Opc) 4254 .addDef(MI.getOperand(0).getReg()) 4255 .addUse(MI.getOperand(2).getReg()) 4256 .addUse(MI.getOperand(3).getReg()) 4257 .cloneMemRefs(MI); 4258 MI.eraseFromParent(); 4259 return true; 4260 } 4261 4262 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4263 switch (IntrID) { 4264 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4265 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4266 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4267 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4268 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4270 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4271 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4273 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4274 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4275 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4276 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4277 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4278 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4279 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4280 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4281 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4282 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4283 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4284 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4285 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4286 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4287 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4288 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4289 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4290 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4291 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4292 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4293 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4294 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4295 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4296 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4297 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4298 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4299 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4300 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4301 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4302 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4303 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4304 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4305 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4306 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4307 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4308 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4309 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4310 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4311 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4312 default: 4313 llvm_unreachable("unhandled atomic opcode"); 4314 } 4315 } 4316 4317 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4318 MachineIRBuilder &B, 4319 Intrinsic::ID IID) const { 4320 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4321 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4322 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4323 4324 Register Dst; 4325 4326 int OpOffset = 0; 4327 if (HasReturn) { 4328 // A few FP atomics do not support return values. 
4329 Dst = MI.getOperand(0).getReg(); 4330 } else { 4331 OpOffset = -1; 4332 } 4333 4334 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4335 Register CmpVal; 4336 4337 if (IsCmpSwap) { 4338 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4339 ++OpOffset; 4340 } 4341 4342 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4343 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4344 4345 // The struct intrinsic variants add one additional operand over raw. 4346 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4347 Register VIndex; 4348 if (HasVIndex) { 4349 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4350 ++OpOffset; 4351 } else { 4352 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4353 } 4354 4355 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4356 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4357 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4358 4359 MachineMemOperand *MMO = *MI.memoperands_begin(); 4360 4361 unsigned ImmOffset; 4362 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4363 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4364 4365 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4366 4367 if (HasReturn) 4368 MIB.addDef(Dst); 4369 4370 MIB.addUse(VData); // vdata 4371 4372 if (IsCmpSwap) 4373 MIB.addReg(CmpVal); 4374 4375 MIB.addUse(RSrc) // rsrc 4376 .addUse(VIndex) // vindex 4377 .addUse(VOffset) // voffset 4378 .addUse(SOffset) // soffset 4379 .addImm(ImmOffset) // offset(imm) 4380 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4381 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4382 .addMemOperand(MMO); 4383 4384 MI.eraseFromParent(); 4385 return true; 4386 } 4387 4388 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4389 /// vector with s16 typed elements. 4390 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4391 SmallVectorImpl<Register> &PackedAddrs, 4392 unsigned ArgOffset, 4393 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4394 bool IsA16, bool IsG16) { 4395 const LLT S16 = LLT::scalar(16); 4396 const LLT V2S16 = LLT::fixed_vector(2, 16); 4397 auto EndIdx = Intr->VAddrEnd; 4398 4399 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4400 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4401 if (!SrcOp.isReg()) 4402 continue; // _L to _LZ may have eliminated this. 4403 4404 Register AddrReg = SrcOp.getReg(); 4405 4406 if ((I < Intr->GradientStart) || 4407 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4408 (I >= Intr->CoordStart && !IsA16)) { 4409 if ((I < Intr->GradientStart) && IsA16 && 4410 (B.getMRI()->getType(AddrReg) == S16)) { 4411 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4412 // Special handling of bias when A16 is on. Bias is of type half but 4413 // occupies full 32-bit. 4414 PackedAddrs.push_back( 4415 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4416 .getReg(0)); 4417 } else { 4418 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4419 "Bias needs to be converted to 16 bit in A16 mode"); 4420 // Handle any gradient or coordinate operands that should not be packed 4421 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4422 PackedAddrs.push_back(AddrReg); 4423 } 4424 } else { 4425 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4426 // derivatives dx/dh and dx/dv are packed with undef. 
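      // Illustration (assumption: a 3D gradient set, NumGradients = 6): the
      // per-direction triples (dx/dh, dy/dh, dz/dh) and (dx/dv, dy/dv, dz/dv)
      // pack pairwise, so dz/dh and dz/dv each end up in their own dword with
      // an undef high half, which is what the odd-count check below detects.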
4427 if (((I + 1) >= EndIdx) || 4428 ((Intr->NumGradients / 2) % 2 == 1 && 4429 (I == static_cast<unsigned>(Intr->GradientStart + 4430 (Intr->NumGradients / 2) - 1) || 4431 I == static_cast<unsigned>(Intr->GradientStart + 4432 Intr->NumGradients - 1))) || 4433 // Check for _L to _LZ optimization 4434 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4435 PackedAddrs.push_back( 4436 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4437 .getReg(0)); 4438 } else { 4439 PackedAddrs.push_back( 4440 B.buildBuildVector( 4441 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4442 .getReg(0)); 4443 ++I; 4444 } 4445 } 4446 } 4447 } 4448 4449 /// Convert from separate vaddr components to a single vector address register, 4450 /// and replace the remaining operands with $noreg. 4451 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4452 int DimIdx, int NumVAddrs) { 4453 const LLT S32 = LLT::scalar(32); 4454 4455 SmallVector<Register, 8> AddrRegs; 4456 for (int I = 0; I != NumVAddrs; ++I) { 4457 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4458 if (SrcOp.isReg()) { 4459 AddrRegs.push_back(SrcOp.getReg()); 4460 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4461 } 4462 } 4463 4464 int NumAddrRegs = AddrRegs.size(); 4465 if (NumAddrRegs != 1) { 4466 // Above 8 elements round up to next power of 2 (i.e. 16). 4467 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4468 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4469 auto Undef = B.buildUndef(S32); 4470 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4471 NumAddrRegs = RoundedNumRegs; 4472 } 4473 4474 auto VAddr = 4475 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4476 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4477 } 4478 4479 for (int I = 1; I != NumVAddrs; ++I) { 4480 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4481 if (SrcOp.isReg()) 4482 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4483 } 4484 } 4485 4486 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4487 /// 4488 /// Depending on the subtarget, load/store with 16-bit element data need to be 4489 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4490 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4491 /// registers. 4492 /// 4493 /// We don't want to directly select image instructions just yet, but also want 4494 /// to exposes all register repacking to the legalizer/combiners. We also don't 4495 /// want a selected instruction entering RegBankSelect. In order to avoid 4496 /// defining a multitude of intermediate image instructions, directly hack on 4497 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4498 /// padding now unnecessary arguments with $noreg. 4499 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4500 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4501 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4502 4503 const unsigned NumDefs = MI.getNumExplicitDefs(); 4504 const unsigned ArgOffset = NumDefs + 1; 4505 bool IsTFE = NumDefs == 2; 4506 // We are only processing the operands of d16 image operations on subtargets 4507 // that use the unpacked register layout, or need to repack the TFE result. 4508 4509 // TODO: Do we need to guard against already legalized intrinsics? 
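  // Rough flow of this function (sketch): decide A16/G16/D16 from the operand
  // types, rewrite the vaddr operands into the packed or NSA layout, repack
  // d16 store data, and finally widen/split the load result to cover the
  // dmask and the optional TFE dword.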
4510 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4511 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4512 4513 MachineRegisterInfo *MRI = B.getMRI(); 4514 const LLT S32 = LLT::scalar(32); 4515 const LLT S16 = LLT::scalar(16); 4516 const LLT V2S16 = LLT::fixed_vector(2, 16); 4517 4518 unsigned DMask = 0; 4519 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4520 LLT Ty = MRI->getType(VData); 4521 4522 // Check for 16 bit addresses and pack if true. 4523 LLT GradTy = 4524 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4525 LLT AddrTy = 4526 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4527 const bool IsG16 = GradTy == S16; 4528 const bool IsA16 = AddrTy == S16; 4529 const bool IsD16 = Ty.getScalarType() == S16; 4530 4531 int DMaskLanes = 0; 4532 if (!BaseOpcode->Atomic) { 4533 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4534 if (BaseOpcode->Gather4) { 4535 DMaskLanes = 4; 4536 } else if (DMask != 0) { 4537 DMaskLanes = countPopulation(DMask); 4538 } else if (!IsTFE && !BaseOpcode->Store) { 4539 // If dmask is 0, this is a no-op load. This can be eliminated. 4540 B.buildUndef(MI.getOperand(0)); 4541 MI.eraseFromParent(); 4542 return true; 4543 } 4544 } 4545 4546 Observer.changingInstr(MI); 4547 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4548 4549 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4550 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4551 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4552 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4553 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4554 4555 // Track that we legalized this 4556 MI.setDesc(B.getTII().get(NewOpcode)); 4557 4558 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4559 // dmask to be at least 1 otherwise the instruction will fail 4560 if (IsTFE && DMask == 0) { 4561 DMask = 0x1; 4562 DMaskLanes = 1; 4563 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4564 } 4565 4566 if (BaseOpcode->Atomic) { 4567 Register VData0 = MI.getOperand(2).getReg(); 4568 LLT Ty = MRI->getType(VData0); 4569 4570 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4571 if (Ty.isVector()) 4572 return false; 4573 4574 if (BaseOpcode->AtomicX2) { 4575 Register VData1 = MI.getOperand(3).getReg(); 4576 // The two values are packed in one register. 4577 LLT PackedTy = LLT::fixed_vector(2, Ty); 4578 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4579 MI.getOperand(2).setReg(Concat.getReg(0)); 4580 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4581 } 4582 } 4583 4584 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4585 4586 // Rewrite the addressing register layout before doing anything else. 
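  // Address layouts we may end up with (sketch): 16-bit components are first
  // packed two per dword; then either each packed dword stays in its own
  // operand for the NSA encoding, or everything is concatenated into a single
  // contiguous vector register for the non-NSA encoding.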
4587 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4588 // 16 bit gradients are supported, but are tied to the A16 control 4589 // so both gradients and addresses must be 16 bit 4590 return false; 4591 } 4592 4593 if (IsA16 && !ST.hasA16()) { 4594 // A16 not supported 4595 return false; 4596 } 4597 4598 if (IsA16 || IsG16) { 4599 if (Intr->NumVAddrs > 1) { 4600 SmallVector<Register, 4> PackedRegs; 4601 4602 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4603 IsG16); 4604 4605 // See also below in the non-a16 branch 4606 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4607 PackedRegs.size() <= ST.getNSAMaxSize(); 4608 4609 if (!UseNSA && PackedRegs.size() > 1) { 4610 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4611 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4612 PackedRegs[0] = Concat.getReg(0); 4613 PackedRegs.resize(1); 4614 } 4615 4616 const unsigned NumPacked = PackedRegs.size(); 4617 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4618 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4619 if (!SrcOp.isReg()) { 4620 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4621 continue; 4622 } 4623 4624 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4625 4626 if (I - Intr->VAddrStart < NumPacked) 4627 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4628 else 4629 SrcOp.setReg(AMDGPU::NoRegister); 4630 } 4631 } 4632 } else { 4633 // If the register allocator cannot place the address registers contiguously 4634 // without introducing moves, then using the non-sequential address encoding 4635 // is always preferable, since it saves VALU instructions and is usually a 4636 // wash in terms of code size or even better. 4637 // 4638 // However, we currently have no way of hinting to the register allocator 4639 // that MIMG addresses should be placed contiguously when it is possible to 4640 // do so, so force non-NSA for the common 2-address case as a heuristic. 4641 // 4642 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4643 // allocation when possible. 4644 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4645 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4646 4647 if (!UseNSA && Intr->NumVAddrs > 1) 4648 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4649 Intr->NumVAddrs); 4650 } 4651 4652 int Flags = 0; 4653 if (IsA16) 4654 Flags |= 1; 4655 if (IsG16) 4656 Flags |= 2; 4657 MI.addOperand(MachineOperand::CreateImm(Flags)); 4658 4659 if (BaseOpcode->Store) { // No TFE for stores? 4660 // TODO: Handle dmask trim 4661 if (!Ty.isVector() || !IsD16) 4662 return true; 4663 4664 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4665 if (RepackedReg != VData) { 4666 MI.getOperand(1).setReg(RepackedReg); 4667 } 4668 4669 return true; 4670 } 4671 4672 Register DstReg = MI.getOperand(0).getReg(); 4673 const LLT EltTy = Ty.getScalarType(); 4674 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4675 4676 // Confirm that the return type is large enough for the dmask specified 4677 if (NumElts < DMaskLanes) 4678 return false; 4679 4680 if (NumElts > 4 || DMaskLanes > 4) 4681 return false; 4682 4683 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4684 const LLT AdjustedTy = 4685 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4686 4687 // The raw dword aligned data component of the load. 
The only legal cases 4688 // where this matters should be when using the packed D16 format, for 4689 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4690 LLT RoundedTy; 4691 4692 // S32 vector to to cover all data, plus TFE result element. 4693 LLT TFETy; 4694 4695 // Register type to use for each loaded component. Will be S32 or V2S16. 4696 LLT RegTy; 4697 4698 if (IsD16 && ST.hasUnpackedD16VMem()) { 4699 RoundedTy = 4700 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4701 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4702 RegTy = S32; 4703 } else { 4704 unsigned EltSize = EltTy.getSizeInBits(); 4705 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4706 unsigned RoundedSize = 32 * RoundedElts; 4707 RoundedTy = LLT::scalarOrVector( 4708 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4709 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4710 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4711 } 4712 4713 // The return type does not need adjustment. 4714 // TODO: Should we change s16 case to s32 or <2 x s16>? 4715 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4716 return true; 4717 4718 Register Dst1Reg; 4719 4720 // Insert after the instruction. 4721 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4722 4723 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4724 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4725 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4726 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4727 4728 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4729 4730 MI.getOperand(0).setReg(NewResultReg); 4731 4732 // In the IR, TFE is supposed to be used with a 2 element struct return 4733 // type. The instruction really returns these two values in one contiguous 4734 // register, with one additional dword beyond the loaded data. Rewrite the 4735 // return type to use a single register result. 4736 4737 if (IsTFE) { 4738 Dst1Reg = MI.getOperand(1).getReg(); 4739 if (MRI->getType(Dst1Reg) != S32) 4740 return false; 4741 4742 // TODO: Make sure the TFE operand bit is set. 4743 MI.removeOperand(1); 4744 4745 // Handle the easy case that requires no repack instructions. 4746 if (Ty == S32) { 4747 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4748 return true; 4749 } 4750 } 4751 4752 // Now figure out how to copy the new result register back into the old 4753 // result. 4754 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4755 4756 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4757 4758 if (ResultNumRegs == 1) { 4759 assert(!IsTFE); 4760 ResultRegs[0] = NewResultReg; 4761 } else { 4762 // We have to repack into a new vector of some kind. 4763 for (int I = 0; I != NumDataRegs; ++I) 4764 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4765 B.buildUnmerge(ResultRegs, NewResultReg); 4766 4767 // Drop the final TFE element to get the data part. The TFE result is 4768 // directly written to the right place already. 4769 if (IsTFE) 4770 ResultRegs.resize(NumDataRegs); 4771 } 4772 4773 // For an s16 scalar result, we form an s32 result with a truncate regardless 4774 // of packed vs. unpacked. 4775 if (IsD16 && !Ty.isVector()) { 4776 B.buildTrunc(DstReg, ResultRegs[0]); 4777 return true; 4778 } 4779 4780 // Avoid a build/concat_vector of 1 entry. 
4781 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4782 B.buildBitcast(DstReg, ResultRegs[0]); 4783 return true; 4784 } 4785 4786 assert(Ty.isVector()); 4787 4788 if (IsD16) { 4789 // For packed D16 results with TFE enabled, all the data components are 4790 // S32. Cast back to the expected type. 4791 // 4792 // TODO: We don't really need to use load s32 elements. We would only need one 4793 // cast for the TFE result if a multiple of v2s16 was used. 4794 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4795 for (Register &Reg : ResultRegs) 4796 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4797 } else if (ST.hasUnpackedD16VMem()) { 4798 for (Register &Reg : ResultRegs) 4799 Reg = B.buildTrunc(S16, Reg).getReg(0); 4800 } 4801 } 4802 4803 auto padWithUndef = [&](LLT Ty, int NumElts) { 4804 if (NumElts == 0) 4805 return; 4806 Register Undef = B.buildUndef(Ty).getReg(0); 4807 for (int I = 0; I != NumElts; ++I) 4808 ResultRegs.push_back(Undef); 4809 }; 4810 4811 // Pad out any elements eliminated due to the dmask. 4812 LLT ResTy = MRI->getType(ResultRegs[0]); 4813 if (!ResTy.isVector()) { 4814 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4815 B.buildBuildVector(DstReg, ResultRegs); 4816 return true; 4817 } 4818 4819 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4820 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4821 4822 // Deal with the one annoying legal case. 4823 const LLT V3S16 = LLT::fixed_vector(3, 16); 4824 if (Ty == V3S16) { 4825 if (IsTFE) { 4826 if (ResultRegs.size() == 1) { 4827 NewResultReg = ResultRegs[0]; 4828 } else if (ResultRegs.size() == 2) { 4829 LLT V4S16 = LLT::fixed_vector(4, 16); 4830 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4831 } else { 4832 return false; 4833 } 4834 } 4835 4836 if (MRI->getType(DstReg).getNumElements() < 4837 MRI->getType(NewResultReg).getNumElements()) { 4838 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4839 } else { 4840 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4841 } 4842 return true; 4843 } 4844 4845 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4846 B.buildConcatVectors(DstReg, ResultRegs); 4847 return true; 4848 } 4849 4850 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4851 LegalizerHelper &Helper, MachineInstr &MI) const { 4852 MachineIRBuilder &B = Helper.MIRBuilder; 4853 GISelChangeObserver &Observer = Helper.Observer; 4854 4855 Register Dst = MI.getOperand(0).getReg(); 4856 LLT Ty = B.getMRI()->getType(Dst); 4857 unsigned Size = Ty.getSizeInBits(); 4858 MachineFunction &MF = B.getMF(); 4859 4860 Observer.changingInstr(MI); 4861 4862 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4863 Ty = getBitcastRegisterType(Ty); 4864 Helper.bitcastDst(MI, Ty, 0); 4865 Dst = MI.getOperand(0).getReg(); 4866 B.setInsertPt(B.getMBB(), MI); 4867 } 4868 4869 // FIXME: We don't really need this intermediate instruction. The intrinsic 4870 // should be fixed to have a memory operand. Since it's readnone, we're not 4871 // allowed to add one. 4872 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4873 MI.removeOperand(1); // Remove intrinsic ID 4874 4875 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4876 // TODO: Should this use datalayout alignment? 
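  // Size is in bits here, so e.g. a <3 x s32> result gets a 12-byte memory
  // operand even though the result register is widened to 128 bits further
  // down.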
4877 const unsigned MemSize = (Size + 7) / 8; 4878 const Align MemAlign(4); 4879 MachineMemOperand *MMO = MF.getMachineMemOperand( 4880 MachinePointerInfo(), 4881 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4882 MachineMemOperand::MOInvariant, 4883 MemSize, MemAlign); 4884 MI.addMemOperand(MF, MMO); 4885 4886 // There are no 96-bit result scalar loads, but widening to 128-bit should 4887 // always be legal. We may need to restore this to a 96-bit result if it turns 4888 // out this needs to be converted to a vector load during RegBankSelect. 4889 if (!isPowerOf2_32(Size)) { 4890 if (Ty.isVector()) 4891 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4892 else 4893 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4894 } 4895 4896 Observer.changedInstr(MI); 4897 return true; 4898 } 4899 4900 // TODO: Move to selection 4901 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4902 MachineRegisterInfo &MRI, 4903 MachineIRBuilder &B) const { 4904 if (!ST.isTrapHandlerEnabled() || 4905 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4906 return legalizeTrapEndpgm(MI, MRI, B); 4907 4908 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4909 switch (*HsaAbiVer) { 4910 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4911 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4912 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4913 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4914 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4915 return ST.supportsGetDoorbellID() ? 4916 legalizeTrapHsa(MI, MRI, B) : 4917 legalizeTrapHsaQueuePtr(MI, MRI, B); 4918 } 4919 } 4920 4921 llvm_unreachable("Unknown trap handler"); 4922 } 4923 4924 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4925 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4926 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4927 MI.eraseFromParent(); 4928 return true; 4929 } 4930 4931 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4932 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4933 MachineFunction &MF = B.getMF(); 4934 const LLT S64 = LLT::scalar(64); 4935 4936 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4937 // For code object version 5, queue_ptr is passed through implicit kernarg. 4938 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 4939 AMDGPUTargetLowering::ImplicitParameter Param = 4940 AMDGPUTargetLowering::QUEUE_PTR; 4941 uint64_t Offset = 4942 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 4943 4944 Register KernargPtrReg = MRI.createGenericVirtualRegister( 4945 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4946 4947 if (!loadInputValue(KernargPtrReg, B, 4948 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4949 return false; 4950 4951 // TODO: can we be smarter about machine pointer info? 
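    // The queue pointer sits at a fixed offset inside the implicit kernarg
    // block for code object v5, so an invariant, dereferenceable load from the
    // constant address space is sufficient here.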
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Compute the address of the queue_ptr slot in the implicit kernargs.
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load the queue pointer and pass it to the trap handler in SGPR0_SGPR1.
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

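// For ABIs where the trap handler can recover the queue information on its own
// (the supportsGetDoorbellID() check in legalizeTrapIntrinsic above), no queue
// pointer is passed in SGPR0_SGPR1 and a bare S_TRAP is sufficient.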
bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning and drop the debugtrap rather than emitting a trap instruction.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (Is64) {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  auto packLanes = [&Ops, &S32, &B](Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
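  // With 16-bit addresses, the ray direction and inverse direction components
  // are packed in pairs into 32-bit registers:
  //   (dir.x, dir.y), (dir.z, invdir.x), (invdir.y, invdir.z)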
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

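// Lower the rounding-mode-carrying fptrunc intrinsic to the target's
// round-upward / round-downward pseudos; any other rounding mode is rejected
// and legalization fails.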
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
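  // The legacy r600 read intrinsics are lowered to loads from fixed offsets
  // within the kernel argument segment.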
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
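  // All raw/struct buffer atomic variants share one lowering; the fadd forms
  // get an extra check below because some subtargets lack a return variant.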
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}