1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 #include "llvm/IR/IntrinsicsR600.h" 30 31 #define DEBUG_TYPE "amdgpu-legalinfo" 32 33 using namespace llvm; 34 using namespace LegalizeActions; 35 using namespace LegalizeMutations; 36 using namespace LegalityPredicates; 37 using namespace MIPatternMatch; 38 39 // Hack until load/store selection patterns support any tuple of legal types. 40 static cl::opt<bool> EnableNewLegality( 41 "amdgpu-global-isel-new-legality", 42 cl::desc("Use GlobalISel desired legality, rather than try to use" 43 "rules compatible with selection patterns"), 44 cl::init(false), 45 cl::ReallyHidden); 46 47 static constexpr unsigned MaxRegisterSize = 1024; 48 49 // Round the number of elements to the next power of two elements 50 static LLT getPow2VectorType(LLT Ty) { 51 unsigned NElts = Ty.getNumElements(); 52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 54 } 55 56 // Round the number of bits to the next power of two bits 57 static LLT getPow2ScalarType(LLT Ty) { 58 unsigned Bits = Ty.getSizeInBits(); 59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 60 return LLT::scalar(Pow2Bits); 61 } 62 63 /// \returns true if this is an odd sized vector which should widen by adding an 64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 65 /// excludes s1 vectors, which should always be scalarized. 
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, 100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 101 }; 102 } 103 104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 105 return [=](const LegalityQuery &Query) { 106 const LLT Ty = Query.Types[TypeIdx]; 107 const LLT EltTy = Ty.getElementType(); 108 unsigned Size = Ty.getSizeInBits(); 109 unsigned Pieces = (Size + 63) / 64; 110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 111 return std::make_pair( 112 TypeIdx, 113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 114 }; 115 } 116 117 // Increase the number of vector elements to reach the next multiple of 32-bit 118 // type. 119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 120 return [=](const LegalityQuery &Query) { 121 const LLT Ty = Query.Types[TypeIdx]; 122 123 const LLT EltTy = Ty.getElementType(); 124 const int Size = Ty.getSizeInBits(); 125 const int EltSize = EltTy.getSizeInBits(); 126 const int NextMul32 = (Size + 31) / 32; 127 128 assert(EltSize < 32); 129 130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 132 }; 133 } 134 135 static LLT getBitcastRegisterType(const LLT Ty) { 136 const unsigned Size = Ty.getSizeInBits(); 137 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT QueryTy = 
Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 uint64_t AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElementsStrict(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElementsStrict(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalar(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElementsStrict(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 843 .customFor({S16, S32}) 844 .scalarize(0) 845 .lower(); 846 847 // Lower roundeven into G_FRINT 848 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 849 .scalarize(0) 850 .lower(); 851 852 if (ST.has16BitInsts()) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S16, S32, S64}) 855 .clampScalar(0, S16, S64) 856 .scalarize(0); 857 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32, S64}) 860 .clampScalar(0, S32, S64) 861 .scalarize(0); 862 } else { 863 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 864 .legalFor({S32}) 865 .customFor({S64}) 866 .clampScalar(0, S32, S64) 867 .scalarize(0); 868 } 869 870 getActionDefinitionsBuilder(G_PTR_ADD) 871 .legalIf(all(isPointer(0), sameSize(0, 1))) 872 .scalarize(0) 873 .scalarSameSizeAs(1, 0); 874 875 getActionDefinitionsBuilder(G_PTRMASK) 876 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 877 .scalarSameSizeAs(1, 0) 878 .scalarize(0); 879 880 auto &CmpBuilder = 881 getActionDefinitionsBuilder(G_ICMP) 882 // The compare output type differs based on the register bank of the output, 883 // so make both s1 and s32 legal. 
884 // 885 // Scalar compares producing output in scc will be promoted to s32, as that 886 // is the allocatable register type that will be needed for the copy from 887 // scc. This will be promoted during RegBankSelect, and we assume something 888 // before that won't try to use s32 result types. 889 // 890 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 891 // bank. 892 .legalForCartesianProduct( 893 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 894 .legalForCartesianProduct( 895 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 896 if (ST.has16BitInsts()) { 897 CmpBuilder.legalFor({{S1, S16}}); 898 } 899 900 CmpBuilder 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0) 904 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 905 906 getActionDefinitionsBuilder(G_FCMP) 907 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 908 .widenScalarToNextPow2(1) 909 .clampScalar(1, S32, S64) 910 .scalarize(0); 911 912 // FIXME: fpow has a selection pattern that should move to custom lowering. 913 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 914 if (ST.has16BitInsts()) 915 Exp2Ops.legalFor({S32, S16}); 916 else 917 Exp2Ops.legalFor({S32}); 918 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 919 Exp2Ops.scalarize(0); 920 921 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 922 if (ST.has16BitInsts()) 923 ExpOps.customFor({{S32}, {S16}}); 924 else 925 ExpOps.customFor({S32}); 926 ExpOps.clampScalar(0, MinScalarFPTy, S32) 927 .scalarize(0); 928 929 getActionDefinitionsBuilder(G_FPOWI) 930 .clampScalar(0, MinScalarFPTy, S32) 931 .lower(); 932 933 // The 64-bit versions produce 32-bit results, but only on the SALU. 934 getActionDefinitionsBuilder(G_CTPOP) 935 .legalFor({{S32, S32}, {S32, S64}}) 936 .clampScalar(0, S32, S32) 937 .widenScalarToNextPow2(1, 32) 938 .clampScalar(1, S32, S64) 939 .scalarize(0) 940 .widenScalarToNextPow2(0, 32); 941 942 943 // The hardware instructions return a different result on 0 than the generic 944 // instructions expect. The hardware produces -1, but these produce the 945 // bitwidth. 946 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 947 .scalarize(0) 948 .clampScalar(0, S32, S32) 949 .clampScalar(1, S32, S64) 950 .widenScalarToNextPow2(0, 32) 951 .widenScalarToNextPow2(1, 32) 952 .custom(); 953 954 // The 64-bit versions produce 32-bit results, but only on the SALU. 955 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 956 .legalFor({{S32, S32}, {S32, S64}}) 957 .clampScalar(0, S32, S32) 958 .clampScalar(1, S32, S64) 959 .scalarize(0) 960 .widenScalarToNextPow2(0, 32) 961 .widenScalarToNextPow2(1, 32); 962 963 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 964 // RegBankSelect. 965 getActionDefinitionsBuilder(G_BITREVERSE) 966 .legalFor({S32, S64}) 967 .clampScalar(0, S32, S64) 968 .scalarize(0) 969 .widenScalarToNextPow2(0); 970 971 if (ST.has16BitInsts()) { 972 getActionDefinitionsBuilder(G_BSWAP) 973 .legalFor({S16, S32, V2S16}) 974 .clampMaxNumElementsStrict(0, S16, 2) 975 // FIXME: Fixing non-power-of-2 before clamp is workaround for 976 // narrowScalar limitation. 
977 .widenScalarToNextPow2(0) 978 .clampScalar(0, S16, S32) 979 .scalarize(0); 980 981 if (ST.hasVOP3PInsts()) { 982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 983 .legalFor({S32, S16, V2S16}) 984 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 985 .clampMaxNumElements(0, S16, 2) 986 .minScalar(0, S16) 987 .widenScalarToNextPow2(0) 988 .scalarize(0) 989 .lower(); 990 } else { 991 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 992 .legalFor({S32, S16}) 993 .widenScalarToNextPow2(0) 994 .minScalar(0, S16) 995 .scalarize(0) 996 .lower(); 997 } 998 } else { 999 // TODO: Should have same legality without v_perm_b32 1000 getActionDefinitionsBuilder(G_BSWAP) 1001 .legalFor({S32}) 1002 .lowerIf(scalarNarrowerThan(0, 32)) 1003 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1004 // narrowScalar limitation. 1005 .widenScalarToNextPow2(0) 1006 .maxScalar(0, S32) 1007 .scalarize(0) 1008 .lower(); 1009 1010 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1011 .legalFor({S32}) 1012 .minScalar(0, S32) 1013 .widenScalarToNextPow2(0) 1014 .scalarize(0) 1015 .lower(); 1016 } 1017 1018 getActionDefinitionsBuilder(G_INTTOPTR) 1019 // List the common cases 1020 .legalForCartesianProduct(AddrSpaces64, {S64}) 1021 .legalForCartesianProduct(AddrSpaces32, {S32}) 1022 .scalarize(0) 1023 // Accept any address space as long as the size matches 1024 .legalIf(sameSize(0, 1)) 1025 .widenScalarIf(smallerThan(1, 0), 1026 [](const LegalityQuery &Query) { 1027 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1028 }) 1029 .narrowScalarIf(largerThan(1, 0), 1030 [](const LegalityQuery &Query) { 1031 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1032 }); 1033 1034 getActionDefinitionsBuilder(G_PTRTOINT) 1035 // List the common cases 1036 .legalForCartesianProduct(AddrSpaces64, {S64}) 1037 .legalForCartesianProduct(AddrSpaces32, {S32}) 1038 .scalarize(0) 1039 // Accept any address space as long as the size matches 1040 .legalIf(sameSize(0, 1)) 1041 .widenScalarIf(smallerThan(0, 1), 1042 [](const LegalityQuery &Query) { 1043 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1044 }) 1045 .narrowScalarIf( 1046 largerThan(0, 1), 1047 [](const LegalityQuery &Query) { 1048 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1049 }); 1050 1051 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1052 .scalarize(0) 1053 .custom(); 1054 1055 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1056 bool IsLoad) -> bool { 1057 const LLT DstTy = Query.Types[0]; 1058 1059 // Split vector extloads. 1060 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1061 1062 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1063 return true; 1064 1065 const LLT PtrTy = Query.Types[1]; 1066 unsigned AS = PtrTy.getAddressSpace(); 1067 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1068 return true; 1069 1070 // Catch weird sized loads that don't evenly divide into the access sizes 1071 // TODO: May be able to widen depending on alignment etc. 1072 unsigned NumRegs = (MemSize + 31) / 32; 1073 if (NumRegs == 3) { 1074 if (!ST.hasDwordx3LoadStores()) 1075 return true; 1076 } else { 1077 // If the alignment allows, these should have been widened. 1078 if (!isPowerOf2_32(NumRegs)) 1079 return true; 1080 } 1081 1082 return false; 1083 }; 1084 1085 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1086 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1087 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1088 1089 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1090 // LDS 1091 // TODO: Unsupported flat for SI. 1092 1093 for (unsigned Op : {G_LOAD, G_STORE}) { 1094 const bool IsStore = Op == G_STORE; 1095 1096 auto &Actions = getActionDefinitionsBuilder(Op); 1097 // Explicitly list some common cases. 1098 // TODO: Does this help compile time at all? 1099 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1100 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1101 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1102 {S64, GlobalPtr, S64, GlobalAlign32}, 1103 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1104 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1105 {S32, GlobalPtr, S8, GlobalAlign8}, 1106 {S32, GlobalPtr, S16, GlobalAlign16}, 1107 1108 {S32, LocalPtr, S32, 32}, 1109 {S64, LocalPtr, S64, 32}, 1110 {V2S32, LocalPtr, V2S32, 32}, 1111 {S32, LocalPtr, S8, 8}, 1112 {S32, LocalPtr, S16, 16}, 1113 {V2S16, LocalPtr, S32, 32}, 1114 1115 {S32, PrivatePtr, S32, 32}, 1116 {S32, PrivatePtr, S8, 8}, 1117 {S32, PrivatePtr, S16, 16}, 1118 {V2S16, PrivatePtr, S32, 32}, 1119 1120 {S32, ConstantPtr, S32, GlobalAlign32}, 1121 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1122 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1123 {S64, ConstantPtr, S64, GlobalAlign32}, 1124 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1125 Actions.legalIf( 1126 [=](const LegalityQuery &Query) -> bool { 1127 return isLoadStoreLegal(ST, Query); 1128 }); 1129 1130 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1131 // 64-bits. 1132 // 1133 // TODO: Should generalize bitcast action into coerce, which will also cover 1134 // inserting addrspacecasts. 1135 Actions.customIf(typeIs(1, Constant32Ptr)); 1136 1137 // Turn any illegal element vectors into something easier to deal 1138 // with. These will ultimately produce 32-bit scalar shifts to extract the 1139 // parts anyway. 1140 // 1141 // For odd 16-bit element vectors, prefer to split those into pieces with 1142 // 16-bit vector parts. 1143 Actions.bitcastIf( 1144 [=](const LegalityQuery &Query) -> bool { 1145 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1146 Query.MMODescrs[0].MemoryTy); 1147 }, bitcastToRegisterType(0)); 1148 1149 if (!IsStore) { 1150 // Widen suitably aligned loads by loading extra bytes. The standard 1151 // legalization actions can't properly express widening memory operands. 1152 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1153 return shouldWidenLoad(ST, Query, G_LOAD); 1154 }); 1155 } 1156 1157 // FIXME: load/store narrowing should be moved to lower action 1158 Actions 1159 .narrowScalarIf( 1160 [=](const LegalityQuery &Query) -> bool { 1161 return !Query.Types[0].isVector() && 1162 needToSplitMemOp(Query, Op == G_LOAD); 1163 }, 1164 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1165 const LLT DstTy = Query.Types[0]; 1166 const LLT PtrTy = Query.Types[1]; 1167 1168 const unsigned DstSize = DstTy.getSizeInBits(); 1169 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1170 1171 // Split extloads. 
1172 if (DstSize > MemSize) 1173 return std::make_pair(0, LLT::scalar(MemSize)); 1174 1175 unsigned MaxSize = maxSizeForAddrSpace(ST, 1176 PtrTy.getAddressSpace(), 1177 Op == G_LOAD); 1178 if (MemSize > MaxSize) 1179 return std::make_pair(0, LLT::scalar(MaxSize)); 1180 1181 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1182 return std::make_pair(0, LLT::scalar(Align)); 1183 }) 1184 .fewerElementsIf( 1185 [=](const LegalityQuery &Query) -> bool { 1186 return Query.Types[0].isVector() && 1187 needToSplitMemOp(Query, Op == G_LOAD); 1188 }, 1189 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1190 const LLT DstTy = Query.Types[0]; 1191 const LLT PtrTy = Query.Types[1]; 1192 1193 LLT EltTy = DstTy.getElementType(); 1194 unsigned MaxSize = maxSizeForAddrSpace(ST, 1195 PtrTy.getAddressSpace(), 1196 Op == G_LOAD); 1197 1198 // FIXME: Handle widened to power of 2 results better. This ends 1199 // up scalarizing. 1200 // FIXME: 3 element stores scalarized on SI 1201 1202 // Split if it's too large for the address space. 1203 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1204 if (MemSize > MaxSize) { 1205 unsigned NumElts = DstTy.getNumElements(); 1206 unsigned EltSize = EltTy.getSizeInBits(); 1207 1208 if (MaxSize % EltSize == 0) { 1209 return std::make_pair( 1210 0, LLT::scalarOrVector( 1211 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1212 } 1213 1214 unsigned NumPieces = MemSize / MaxSize; 1215 1216 // FIXME: Refine when odd breakdowns handled 1217 // The scalars will need to be re-legalized. 1218 if (NumPieces == 1 || NumPieces >= NumElts || 1219 NumElts % NumPieces != 0) 1220 return std::make_pair(0, EltTy); 1221 1222 return std::make_pair( 1223 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1224 } 1225 1226 // FIXME: We could probably handle weird extending loads better. 1227 if (DstTy.getSizeInBits() > MemSize) 1228 return std::make_pair(0, EltTy); 1229 1230 unsigned EltSize = EltTy.getSizeInBits(); 1231 unsigned DstSize = DstTy.getSizeInBits(); 1232 if (!isPowerOf2_32(DstSize)) { 1233 // We're probably decomposing an odd sized store. Try to split 1234 // to the widest type. TODO: Account for alignment. As-is it 1235 // should be OK, since the new parts will be further legalized. 1236 unsigned FloorSize = PowerOf2Floor(DstSize); 1237 return std::make_pair( 1238 0, LLT::scalarOrVector( 1239 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1240 } 1241 1242 // May need relegalization for the scalars. 1243 return std::make_pair(0, EltTy); 1244 }) 1245 .minScalar(0, S32) 1246 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1247 .widenScalarToNextPow2(0) 1248 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1249 .lower(); 1250 } 1251 1252 // FIXME: Unaligned accesses not lowered. 1253 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1254 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1255 {S32, GlobalPtr, S16, 2 * 8}, 1256 {S32, LocalPtr, S8, 8}, 1257 {S32, LocalPtr, S16, 16}, 1258 {S32, PrivatePtr, S8, 8}, 1259 {S32, PrivatePtr, S16, 16}, 1260 {S32, ConstantPtr, S8, 8}, 1261 {S32, ConstantPtr, S16, 2 * 8}}) 1262 .legalIf( 1263 [=](const LegalityQuery &Query) -> bool { 1264 return isLoadStoreLegal(ST, Query); 1265 }); 1266 1267 if (ST.hasFlatAddressSpace()) { 1268 ExtLoads.legalForTypesWithMemDesc( 1269 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1270 } 1271 1272 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1273 // 64-bits. 
1274 // 1275 // TODO: Should generalize bitcast action into coerce, which will also cover 1276 // inserting addrspacecasts. 1277 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1278 1279 ExtLoads.clampScalar(0, S32, S32) 1280 .widenScalarToNextPow2(0) 1281 .lower(); 1282 1283 auto &Atomics = getActionDefinitionsBuilder( 1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1287 G_ATOMICRMW_UMIN}) 1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1289 {S64, GlobalPtr}, {S64, LocalPtr}, 1290 {S32, RegionPtr}, {S64, RegionPtr}}); 1291 if (ST.hasFlatAddressSpace()) { 1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1293 } 1294 1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1296 if (ST.hasLDSFPAtomicAdd()) { 1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1298 if (ST.hasGFX90AInsts()) 1299 Atomic.legalFor({{S64, LocalPtr}}); 1300 if (ST.hasGFX940Insts()) 1301 Atomic.legalFor({{V2S16, LocalPtr}}); 1302 } 1303 if (ST.hasAtomicFaddInsts()) 1304 Atomic.legalFor({{S32, GlobalPtr}}); 1305 1306 if (ST.hasGFX90AInsts()) { 1307 // These are legal with some caveats, and should have undergone expansion in 1308 // the IR in most situations 1309 // TODO: Move atomic expansion into legalizer 1310 // TODO: Also supports <2 x f16> 1311 Atomic.legalFor({ 1312 {S32, GlobalPtr}, 1313 {S64, GlobalPtr}, 1314 {S64, FlatPtr} 1315 }); 1316 } 1317 1318 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1319 // demarshalling 1320 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1321 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1322 {S32, FlatPtr}, {S64, FlatPtr}}) 1323 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1324 {S32, RegionPtr}, {S64, RegionPtr}}); 1325 // TODO: Pointer types, any 32-bit or 64-bit vector 1326 1327 // Condition should be s32 for scalar, s1 for vector. 1328 getActionDefinitionsBuilder(G_SELECT) 1329 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1330 LocalPtr, FlatPtr, PrivatePtr, 1331 LLT::fixed_vector(2, LocalPtr), 1332 LLT::fixed_vector(2, PrivatePtr)}, 1333 {S1, S32}) 1334 .clampScalar(0, S16, S64) 1335 .scalarize(1) 1336 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1337 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1338 .clampMaxNumElements(0, S32, 2) 1339 .clampMaxNumElements(0, LocalPtr, 2) 1340 .clampMaxNumElements(0, PrivatePtr, 2) 1341 .scalarize(0) 1342 .widenScalarToNextPow2(0) 1343 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1344 1345 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1346 // be more flexible with the shift amount type. 1347 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1348 .legalFor({{S32, S32}, {S64, S32}}); 1349 if (ST.has16BitInsts()) { 1350 if (ST.hasVOP3PInsts()) { 1351 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1352 .clampMaxNumElements(0, S16, 2); 1353 } else 1354 Shifts.legalFor({{S16, S16}}); 1355 1356 // TODO: Support 16-bit shift amounts for all types 1357 Shifts.widenScalarIf( 1358 [=](const LegalityQuery &Query) { 1359 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1360 // 32-bit amount. 
1361 const LLT ValTy = Query.Types[0]; 1362 const LLT AmountTy = Query.Types[1]; 1363 return ValTy.getSizeInBits() <= 16 && 1364 AmountTy.getSizeInBits() < 16; 1365 }, changeTo(1, S16)); 1366 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1367 Shifts.clampScalar(1, S32, S32); 1368 Shifts.widenScalarToNextPow2(0, 16); 1369 Shifts.clampScalar(0, S16, S64); 1370 1371 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1372 .minScalar(0, S16) 1373 .scalarize(0) 1374 .lower(); 1375 } else { 1376 // Make sure we legalize the shift amount type first, as the general 1377 // expansion for the shifted type will produce much worse code if it hasn't 1378 // been truncated already. 1379 Shifts.clampScalar(1, S32, S32); 1380 Shifts.widenScalarToNextPow2(0, 32); 1381 Shifts.clampScalar(0, S32, S64); 1382 1383 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1384 .minScalar(0, S32) 1385 .scalarize(0) 1386 .lower(); 1387 } 1388 Shifts.scalarize(0); 1389 1390 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1391 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1392 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1393 unsigned IdxTypeIdx = 2; 1394 1395 getActionDefinitionsBuilder(Op) 1396 .customIf([=](const LegalityQuery &Query) { 1397 const LLT EltTy = Query.Types[EltTypeIdx]; 1398 const LLT VecTy = Query.Types[VecTypeIdx]; 1399 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1400 const unsigned EltSize = EltTy.getSizeInBits(); 1401 return (EltSize == 32 || EltSize == 64) && 1402 VecTy.getSizeInBits() % 32 == 0 && 1403 VecTy.getSizeInBits() <= MaxRegisterSize && 1404 IdxTy.getSizeInBits() == 32; 1405 }) 1406 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1407 bitcastToVectorElement32(VecTypeIdx)) 1408 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1409 .bitcastIf( 1410 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1411 [=](const LegalityQuery &Query) { 1412 // For > 64-bit element types, try to turn this into a 64-bit 1413 // element vector since we may be able to do better indexing 1414 // if this is scalar. If not, fall back to 32. 1415 const LLT EltTy = Query.Types[EltTypeIdx]; 1416 const LLT VecTy = Query.Types[VecTypeIdx]; 1417 const unsigned DstEltSize = EltTy.getSizeInBits(); 1418 const unsigned VecSize = VecTy.getSizeInBits(); 1419 1420 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1421 return std::make_pair( 1422 VecTypeIdx, 1423 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1424 }) 1425 .clampScalar(EltTypeIdx, S32, S64) 1426 .clampScalar(VecTypeIdx, S32, S64) 1427 .clampScalar(IdxTypeIdx, S32, S32) 1428 .clampMaxNumElements(VecTypeIdx, S32, 32) 1429 // TODO: Clamp elements for 64-bit vectors? 1430 // It should only be necessary with variable indexes. 1431 // As a last resort, lower to the stack 1432 .lower(); 1433 } 1434 1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1436 .unsupportedIf([=](const LegalityQuery &Query) { 1437 const LLT &EltTy = Query.Types[1].getElementType(); 1438 return Query.Types[0] != EltTy; 1439 }); 1440 1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1444 1445 // FIXME: Doesn't handle extract of illegal sizes. 1446 getActionDefinitionsBuilder(Op) 1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1448 .lowerIf([=](const LegalityQuery &Query) { 1449 // Sub-vector(or single element) insert and extract. 
1450 // TODO: verify immediate offset here since lower only works with 1451 // whole elements. 1452 const LLT BigTy = Query.Types[BigTyIdx]; 1453 return BigTy.isVector(); 1454 }) 1455 // FIXME: Multiples of 16 should not be legal. 1456 .legalIf([=](const LegalityQuery &Query) { 1457 const LLT BigTy = Query.Types[BigTyIdx]; 1458 const LLT LitTy = Query.Types[LitTyIdx]; 1459 return (BigTy.getSizeInBits() % 32 == 0) && 1460 (LitTy.getSizeInBits() % 16 == 0); 1461 }) 1462 .widenScalarIf( 1463 [=](const LegalityQuery &Query) { 1464 const LLT BigTy = Query.Types[BigTyIdx]; 1465 return (BigTy.getScalarSizeInBits() < 16); 1466 }, 1467 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1468 .widenScalarIf( 1469 [=](const LegalityQuery &Query) { 1470 const LLT LitTy = Query.Types[LitTyIdx]; 1471 return (LitTy.getScalarSizeInBits() < 16); 1472 }, 1473 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1474 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1475 .widenScalarToNextPow2(BigTyIdx, 32); 1476 1477 } 1478 1479 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1480 .legalForCartesianProduct(AllS32Vectors, {S32}) 1481 .legalForCartesianProduct(AllS64Vectors, {S64}) 1482 .clampNumElements(0, V16S32, V32S32) 1483 .clampNumElements(0, V2S64, V16S64) 1484 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1485 1486 if (ST.hasScalarPackInsts()) { 1487 BuildVector 1488 // FIXME: Should probably widen s1 vectors straight to s32 1489 .minScalarOrElt(0, S16) 1490 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1491 .minScalar(1, S32); 1492 1493 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1494 .legalFor({V2S16, S32}) 1495 .lower(); 1496 BuildVector.minScalarOrElt(0, S32); 1497 } else { 1498 BuildVector.customFor({V2S16, S16}); 1499 BuildVector.minScalarOrElt(0, S32); 1500 1501 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1502 .customFor({V2S16, S32}) 1503 .lower(); 1504 } 1505 1506 BuildVector.legalIf(isRegisterType(0)); 1507 1508 // FIXME: Clamp maximum size 1509 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1510 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1511 .clampMaxNumElements(0, S32, 32) 1512 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1513 .clampMaxNumElements(0, S16, 64); 1514 1515 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1516 // pre-legalize. 1517 if (ST.hasVOP3PInsts()) { 1518 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1519 .customFor({V2S16, V2S16}) 1520 .lower(); 1521 } else 1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1523 1524 // Merge/Unmerge 1525 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1526 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1527 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1528 1529 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1530 const LLT Ty = Query.Types[TypeIdx]; 1531 if (Ty.isVector()) { 1532 const LLT &EltTy = Ty.getElementType(); 1533 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1534 return true; 1535 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1536 return true; 1537 } 1538 return false; 1539 }; 1540 1541 auto &Builder = getActionDefinitionsBuilder(Op) 1542 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1543 .lowerFor({{S16, V2S16}}) 1544 .lowerIf([=](const LegalityQuery &Query) { 1545 const LLT BigTy = Query.Types[BigTyIdx]; 1546 return BigTy.getSizeInBits() == 32; 1547 }) 1548 // Try to widen to s16 first for small types. 
1549 // TODO: Only do this on targets with legal s16 shifts 1550 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1551 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1552 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1553 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1554 elementTypeIs(1, S16)), 1555 changeTo(1, V2S16)) 1556 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1557 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1558 // valid. 1559 .clampScalar(LitTyIdx, S32, S512) 1560 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1561 // Break up vectors with weird elements into scalars 1562 .fewerElementsIf( 1563 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1564 scalarize(0)) 1565 .fewerElementsIf( 1566 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1567 scalarize(1)) 1568 .clampScalar(BigTyIdx, S32, MaxScalar); 1569 1570 if (Op == G_MERGE_VALUES) { 1571 Builder.widenScalarIf( 1572 // TODO: Use 16-bit shifts if legal for 8-bit values? 1573 [=](const LegalityQuery &Query) { 1574 const LLT Ty = Query.Types[LitTyIdx]; 1575 return Ty.getSizeInBits() < 32; 1576 }, 1577 changeTo(LitTyIdx, S32)); 1578 } 1579 1580 Builder.widenScalarIf( 1581 [=](const LegalityQuery &Query) { 1582 const LLT Ty = Query.Types[BigTyIdx]; 1583 return !isPowerOf2_32(Ty.getSizeInBits()) && 1584 Ty.getSizeInBits() % 16 != 0; 1585 }, 1586 [=](const LegalityQuery &Query) { 1587 // Pick the next power of 2, or a multiple of 64 over 128. 1588 // Whichever is smaller. 1589 const LLT &Ty = Query.Types[BigTyIdx]; 1590 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1591 if (NewSizeInBits >= 256) { 1592 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1593 if (RoundedTo < NewSizeInBits) 1594 NewSizeInBits = RoundedTo; 1595 } 1596 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1597 }) 1598 // Any vectors left are the wrong size. Scalarize them. 1599 .scalarize(0) 1600 .scalarize(1); 1601 } 1602 1603 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1604 // RegBankSelect. 1605 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1606 .legalFor({{S32}, {S64}}); 1607 1608 if (ST.hasVOP3PInsts()) { 1609 SextInReg.lowerFor({{V2S16}}) 1610 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1611 // get more vector shift opportunities, since we'll get those when 1612 // expanded. 1613 .clampMaxNumElementsStrict(0, S16, 2); 1614 } else if (ST.has16BitInsts()) { 1615 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1616 } else { 1617 // Prefer to promote to s32 before lowering if we don't have 16-bit 1618 // shifts. This avoid a lot of intermediate truncate and extend operations. 1619 SextInReg.lowerFor({{S32}, {S64}}); 1620 } 1621 1622 SextInReg 1623 .scalarize(0) 1624 .clampScalar(0, S32, S64) 1625 .lower(); 1626 1627 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1628 .scalarize(0) 1629 .lower(); 1630 1631 // TODO: Only Try to form v2s16 with legal packed instructions. 
1632 getActionDefinitionsBuilder(G_FSHR) 1633 .legalFor({{S32, S32}}) 1634 .lowerFor({{V2S16, V2S16}}) 1635 .clampMaxNumElementsStrict(0, S16, 2) 1636 .scalarize(0) 1637 .lower(); 1638 1639 if (ST.hasVOP3PInsts()) { 1640 getActionDefinitionsBuilder(G_FSHL) 1641 .lowerFor({{V2S16, V2S16}}) 1642 .clampMaxNumElementsStrict(0, S16, 2) 1643 .scalarize(0) 1644 .lower(); 1645 } else { 1646 getActionDefinitionsBuilder(G_FSHL) 1647 .scalarize(0) 1648 .lower(); 1649 } 1650 1651 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1652 .legalFor({S64}); 1653 1654 getActionDefinitionsBuilder(G_FENCE) 1655 .alwaysLegal(); 1656 1657 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1658 .scalarize(0) 1659 .minScalar(0, S32) 1660 .lower(); 1661 1662 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1663 .legalFor({{S32, S32}, {S64, S32}}) 1664 .clampScalar(1, S32, S32) 1665 .clampScalar(0, S32, S64) 1666 .widenScalarToNextPow2(0) 1667 .scalarize(0); 1668 1669 getActionDefinitionsBuilder({ 1670 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1671 G_FCOPYSIGN, 1672 1673 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1674 G_ATOMICRMW_NAND, 1675 G_ATOMICRMW_FSUB, 1676 G_READ_REGISTER, 1677 G_WRITE_REGISTER, 1678 1679 G_SADDO, G_SSUBO, 1680 1681 // TODO: Implement 1682 G_FMINIMUM, G_FMAXIMUM}).lower(); 1683 1684 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1685 .lower(); 1686 1687 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1688 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1689 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1690 .unsupported(); 1691 1692 getLegacyLegalizerInfo().computeTables(); 1693 verify(*ST.getInstrInfo()); 1694 } 1695 1696 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1697 MachineInstr &MI) const { 1698 MachineIRBuilder &B = Helper.MIRBuilder; 1699 MachineRegisterInfo &MRI = *B.getMRI(); 1700 1701 switch (MI.getOpcode()) { 1702 case TargetOpcode::G_ADDRSPACE_CAST: 1703 return legalizeAddrSpaceCast(MI, MRI, B); 1704 case TargetOpcode::G_FRINT: 1705 return legalizeFrint(MI, MRI, B); 1706 case TargetOpcode::G_FCEIL: 1707 return legalizeFceil(MI, MRI, B); 1708 case TargetOpcode::G_FREM: 1709 return legalizeFrem(MI, MRI, B); 1710 case TargetOpcode::G_INTRINSIC_TRUNC: 1711 return legalizeIntrinsicTrunc(MI, MRI, B); 1712 case TargetOpcode::G_SITOFP: 1713 return legalizeITOFP(MI, MRI, B, true); 1714 case TargetOpcode::G_UITOFP: 1715 return legalizeITOFP(MI, MRI, B, false); 1716 case TargetOpcode::G_FPTOSI: 1717 return legalizeFPTOI(MI, MRI, B, true); 1718 case TargetOpcode::G_FPTOUI: 1719 return legalizeFPTOI(MI, MRI, B, false); 1720 case TargetOpcode::G_FMINNUM: 1721 case TargetOpcode::G_FMAXNUM: 1722 case TargetOpcode::G_FMINNUM_IEEE: 1723 case TargetOpcode::G_FMAXNUM_IEEE: 1724 return legalizeMinNumMaxNum(Helper, MI); 1725 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1726 return legalizeExtractVectorElt(MI, MRI, B); 1727 case TargetOpcode::G_INSERT_VECTOR_ELT: 1728 return legalizeInsertVectorElt(MI, MRI, B); 1729 case TargetOpcode::G_SHUFFLE_VECTOR: 1730 return legalizeShuffleVector(MI, MRI, B); 1731 case TargetOpcode::G_FSIN: 1732 case TargetOpcode::G_FCOS: 1733 return legalizeSinCos(MI, MRI, B); 1734 case TargetOpcode::G_GLOBAL_VALUE: 1735 return legalizeGlobalValue(MI, MRI, B); 1736 case TargetOpcode::G_LOAD: 1737 case TargetOpcode::G_SEXTLOAD: 1738 case TargetOpcode::G_ZEXTLOAD: 1739 return legalizeLoad(Helper, MI); 1740 case TargetOpcode::G_FMAD: 1741 return legalizeFMad(MI, MRI, B); 1742 case TargetOpcode::G_FDIV: 1743 return legalizeFDIV(MI, 
MRI, B); 1744 case TargetOpcode::G_UDIV: 1745 case TargetOpcode::G_UREM: 1746 case TargetOpcode::G_UDIVREM: 1747 return legalizeUnsignedDIV_REM(MI, MRI, B); 1748 case TargetOpcode::G_SDIV: 1749 case TargetOpcode::G_SREM: 1750 case TargetOpcode::G_SDIVREM: 1751 return legalizeSignedDIV_REM(MI, MRI, B); 1752 case TargetOpcode::G_ATOMIC_CMPXCHG: 1753 return legalizeAtomicCmpXChg(MI, MRI, B); 1754 case TargetOpcode::G_FLOG: 1755 return legalizeFlog(MI, B, numbers::ln2f); 1756 case TargetOpcode::G_FLOG10: 1757 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1758 case TargetOpcode::G_FEXP: 1759 return legalizeFExp(MI, B); 1760 case TargetOpcode::G_FPOW: 1761 return legalizeFPow(MI, B); 1762 case TargetOpcode::G_FFLOOR: 1763 return legalizeFFloor(MI, MRI, B); 1764 case TargetOpcode::G_BUILD_VECTOR: 1765 return legalizeBuildVector(MI, MRI, B); 1766 case TargetOpcode::G_CTLZ: 1767 case TargetOpcode::G_CTTZ: 1768 return legalizeCTLZ_CTTZ(MI, MRI, B); 1769 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1770 return legalizeFPTruncRound(MI, B); 1771 default: 1772 return false; 1773 } 1774 1775 llvm_unreachable("expected switch to return"); 1776 } 1777 1778 Register AMDGPULegalizerInfo::getSegmentAperture( 1779 unsigned AS, 1780 MachineRegisterInfo &MRI, 1781 MachineIRBuilder &B) const { 1782 MachineFunction &MF = B.getMF(); 1783 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1784 const LLT S32 = LLT::scalar(32); 1785 1786 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1787 1788 if (ST.hasApertureRegs()) { 1789 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1790 // getreg. 1791 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1792 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1793 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1794 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1795 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1796 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1797 unsigned Encoding = 1798 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1799 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1800 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1801 1802 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1803 1804 B.buildInstr(AMDGPU::S_GETREG_B32) 1805 .addDef(GetReg) 1806 .addImm(Encoding); 1807 MRI.setType(GetReg, S32); 1808 1809 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1810 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1811 } 1812 1813 // TODO: can we be smarter about machine pointer info? 1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1815 Register LoadAddr = MRI.createGenericVirtualRegister( 1816 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1817 // For code object version 5, private_base and shared_base are passed through 1818 // implicit kernargs. 1819 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1820 AMDGPUTargetLowering::ImplicitParameter Param = 1821 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1822 : AMDGPUTargetLowering::PRIVATE_BASE; 1823 uint64_t Offset = 1824 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1825 1826 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1827 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1828 1829 if (!loadInputValue(KernargPtrReg, B, 1830 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1831 return Register(); 1832 1833 MachineMemOperand *MMO = MF.getMachineMemOperand( 1834 PtrInfo, 1835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1836 MachineMemOperand::MOInvariant, 1837 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1838 1839 // Pointer address 1840 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1841 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1842 // Load address 1843 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1844 } 1845 1846 Register QueuePtr = MRI.createGenericVirtualRegister( 1847 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1848 1849 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1850 return Register(); 1851 1852 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1853 // private_segment_aperture_base_hi. 1854 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1855 1856 MachineMemOperand *MMO = MF.getMachineMemOperand( 1857 PtrInfo, 1858 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1859 MachineMemOperand::MOInvariant, 1860 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1861 1862 B.buildPtrAdd(LoadAddr, QueuePtr, 1863 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1864 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1865 } 1866 1867 /// Return true if the value is a known valid address, such that a null check is 1868 /// not necessary. 1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1870 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1871 MachineInstr *Def = MRI.getVRegDef(Val); 1872 switch (Def->getOpcode()) { 1873 case AMDGPU::G_FRAME_INDEX: 1874 case AMDGPU::G_GLOBAL_VALUE: 1875 case AMDGPU::G_BLOCK_ADDR: 1876 return true; 1877 case AMDGPU::G_CONSTANT: { 1878 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1879 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1880 } 1881 default: 1882 return false; 1883 } 1884 1885 return false; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 MachineFunction &MF = B.getMF(); 1892 1893 const LLT S32 = LLT::scalar(32); 1894 Register Dst = MI.getOperand(0).getReg(); 1895 Register Src = MI.getOperand(1).getReg(); 1896 1897 LLT DstTy = MRI.getType(Dst); 1898 LLT SrcTy = MRI.getType(Src); 1899 unsigned DestAS = DstTy.getAddressSpace(); 1900 unsigned SrcAS = SrcTy.getAddressSpace(); 1901 1902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1903 // vector element. 1904 assert(!DstTy.isVector()); 1905 1906 const AMDGPUTargetMachine &TM 1907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1908 1909 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1910 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1911 return true; 1912 } 1913 1914 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1915 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1916 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1917 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1918 // Extract low 32-bits of the pointer. 
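// A flat pointer known to be non-null can never compare equal to the flat
// null value, so the compare-and-select against the segment null value is
// skipped and the segment address is simply the low 32 bits of the flat
// address.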
1919 B.buildExtract(Dst, Src, 0); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 unsigned NullVal = TM.getNullPointerValue(DestAS); 1925 1926 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1927 auto FlatNull = B.buildConstant(SrcTy, 0); 1928 1929 // Extract low 32-bits of the pointer. 1930 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1931 1932 auto CmpRes = 1933 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1934 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1935 1936 MI.eraseFromParent(); 1937 return true; 1938 } 1939 1940 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1941 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1942 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1943 if (!ST.hasFlatAddressSpace()) 1944 return false; 1945 1946 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1947 if (!ApertureReg.isValid()) 1948 return false; 1949 1950 // Coerce the type of the low half of the result so we can use merge_values. 1951 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1952 1953 // TODO: Should we allow mismatched types but matching sizes in merges to 1954 // avoid the ptrtoint? 1955 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1956 1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1958 B.buildCopy(Dst, BuildPtr); 1959 MI.eraseFromParent(); 1960 return true; 1961 } 1962 1963 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1964 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1965 1966 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1967 SegmentNull.getReg(0)); 1968 1969 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1970 1971 MI.eraseFromParent(); 1972 return true; 1973 } 1974 1975 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1976 SrcTy.getSizeInBits() == 64) { 1977 // Truncate. 1978 B.buildExtract(Dst, Src, 0); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1984 DstTy.getSizeInBits() == 64) { 1985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1986 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1987 1988 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1989 // another. Merge operands are required to be the same type, but creating an 1990 // extra ptrtoint would be kind of pointless. 1991 auto HighAddr = B.buildConstant( 1992 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1993 B.buildMerge(Dst, {Src, HighAddr}); 1994 MI.eraseFromParent(); 1995 return true; 1996 } 1997 1998 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1999 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2000 2001 LLVMContext &Ctx = MF.getFunction().getContext(); 2002 Ctx.diagnose(InvalidAddrSpaceCast); 2003 B.buildUndef(Dst); 2004 MI.eraseFromParent(); 2005 return true; 2006 } 2007 2008 bool AMDGPULegalizerInfo::legalizeFrint( 2009 MachineInstr &MI, MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B) const { 2011 Register Src = MI.getOperand(1).getReg(); 2012 LLT Ty = MRI.getType(Src); 2013 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2014 2015 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2016 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2017 2018 auto C1 = B.buildFConstant(Ty, C1Val); 2019 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2020 2021 // TODO: Should this propagate fast-math-flags? 
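// Adding and then subtracting 2^52 (with the sign of the source copied onto
// it) rounds the value to the nearest integer, since for |x| < 2^52 the
// addition discards all fractional bits. E.g. for Src = 2.3, Tmp1 rounds to
// 2^52 + 2 and Tmp2 = 2.0. Values with |Src| > C2 (just under 2^52) are
// already integral, so the final select returns the source unchanged.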
2022 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2023 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2024 2025 auto C2 = B.buildFConstant(Ty, C2Val); 2026 auto Fabs = B.buildFAbs(Ty, Src); 2027 2028 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2029 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2030 MI.eraseFromParent(); 2031 return true; 2032 } 2033 2034 bool AMDGPULegalizerInfo::legalizeFceil( 2035 MachineInstr &MI, MachineRegisterInfo &MRI, 2036 MachineIRBuilder &B) const { 2037 2038 const LLT S1 = LLT::scalar(1); 2039 const LLT S64 = LLT::scalar(64); 2040 2041 Register Src = MI.getOperand(1).getReg(); 2042 assert(MRI.getType(Src) == S64); 2043 2044 // result = trunc(src) 2045 // if (src > 0.0 && src != result) 2046 // result += 1.0 2047 2048 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2049 2050 const auto Zero = B.buildFConstant(S64, 0.0); 2051 const auto One = B.buildFConstant(S64, 1.0); 2052 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2053 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2054 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2055 auto Add = B.buildSelect(S64, And, One, Zero); 2056 2057 // TODO: Should this propagate fast-math-flags? 2058 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeFrem( 2064 MachineInstr &MI, MachineRegisterInfo &MRI, 2065 MachineIRBuilder &B) const { 2066 Register DstReg = MI.getOperand(0).getReg(); 2067 Register Src0Reg = MI.getOperand(1).getReg(); 2068 Register Src1Reg = MI.getOperand(2).getReg(); 2069 auto Flags = MI.getFlags(); 2070 LLT Ty = MRI.getType(DstReg); 2071 2072 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2073 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2074 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2075 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 static MachineInstrBuilder extractF64Exponent(Register Hi, 2081 MachineIRBuilder &B) { 2082 const unsigned FractBits = 52; 2083 const unsigned ExpBits = 11; 2084 LLT S32 = LLT::scalar(32); 2085 2086 auto Const0 = B.buildConstant(S32, FractBits - 32); 2087 auto Const1 = B.buildConstant(S32, ExpBits); 2088 2089 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2090 .addUse(Hi) 2091 .addUse(Const0.getReg(0)) 2092 .addUse(Const1.getReg(0)); 2093 2094 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2095 } 2096 2097 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2098 MachineInstr &MI, MachineRegisterInfo &MRI, 2099 MachineIRBuilder &B) const { 2100 const LLT S1 = LLT::scalar(1); 2101 const LLT S32 = LLT::scalar(32); 2102 const LLT S64 = LLT::scalar(64); 2103 2104 Register Src = MI.getOperand(1).getReg(); 2105 assert(MRI.getType(Src) == S64); 2106 2107 // TODO: Should this use extract since the low half is unused? 2108 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2109 Register Hi = Unmerge.getReg(1); 2110 2111 // Extract the upper half, since this is where we will find the sign and 2112 // exponent. 2113 auto Exp = extractF64Exponent(Hi, B); 2114 2115 const unsigned FractBits = 52; 2116 2117 // Extract the sign bit. 2118 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2119 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2120 2121 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2122 2123 const auto Zero32 = B.buildConstant(S32, 0); 2124 2125 // Extend back to 64-bits. 
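// SignBit64 below is just the original sign bit placed in bit 63 (a signed
// zero). For exponents in [0, 51], clearing the low (52 - Exp) fraction bits
// of the source truncates it toward zero; Exp < 0 means |Src| < 1 and the
// result is the signed zero, while Exp > 51 means Src is already an integer
// and is returned unchanged by the final select.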
2126 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2127 2128 auto Shr = B.buildAShr(S64, FractMask, Exp); 2129 auto Not = B.buildNot(S64, Shr); 2130 auto Tmp0 = B.buildAnd(S64, Src, Not); 2131 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2132 2133 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2134 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2135 2136 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2137 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2138 MI.eraseFromParent(); 2139 return true; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeITOFP( 2143 MachineInstr &MI, MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B, bool Signed) const { 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src = MI.getOperand(1).getReg(); 2148 2149 const LLT S64 = LLT::scalar(64); 2150 const LLT S32 = LLT::scalar(32); 2151 2152 assert(MRI.getType(Src) == S64); 2153 2154 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2155 auto ThirtyTwo = B.buildConstant(S32, 32); 2156 2157 if (MRI.getType(Dst) == S64) { 2158 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2159 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2160 2161 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2162 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2163 .addUse(CvtHi.getReg(0)) 2164 .addUse(ThirtyTwo.getReg(0)); 2165 2166 // TODO: Should this propagate fast-math-flags? 2167 B.buildFAdd(Dst, LdExp, CvtLo); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 assert(MRI.getType(Dst) == S32); 2173 2174 auto One = B.buildConstant(S32, 1); 2175 2176 MachineInstrBuilder ShAmt; 2177 if (Signed) { 2178 auto ThirtyOne = B.buildConstant(S32, 31); 2179 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2180 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2181 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2182 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2183 /*HasSideEffects=*/false) 2184 .addUse(Unmerge.getReg(1)); 2185 auto LS2 = B.buildSub(S32, LS, One); 2186 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2187 } else 2188 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2189 auto Norm = B.buildShl(S64, Src, ShAmt); 2190 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2191 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2192 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2193 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2194 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2195 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2196 /*HasSideEffects=*/false) 2197 .addUse(FVal.getReg(0)) 2198 .addUse(Scale.getReg(0)); 2199 MI.eraseFromParent(); 2200 return true; 2201 } 2202 2203 // TODO: Copied from DAG implementation. Verify logic and document how this 2204 // actually works. 
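// The lowering below splits the value into 32-bit halves using FP arithmetic.
// For example, with val = 2^40 + 7 (exactly representable as f64), the
// sequence computes hif = floor(val * 2^-32) = 256 and lof = val - 256 * 2^32
// = 7; merging {lo, hi} then reconstructs the 64-bit integer 2^40 + 7.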
2205 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2206 MachineRegisterInfo &MRI, 2207 MachineIRBuilder &B, 2208 bool Signed) const { 2209 2210 Register Dst = MI.getOperand(0).getReg(); 2211 Register Src = MI.getOperand(1).getReg(); 2212 2213 const LLT S64 = LLT::scalar(64); 2214 const LLT S32 = LLT::scalar(32); 2215 2216 const LLT SrcLT = MRI.getType(Src); 2217 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2218 2219 unsigned Flags = MI.getFlags(); 2220 2221 // The basic idea of converting a floating point number into a pair of 32-bit 2222 // integers is illustrated as follows: 2223 // 2224 // tf := trunc(val); 2225 // hif := floor(tf * 2^-32); 2226 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2227 // hi := fptoi(hif); 2228 // lo := fptoi(lof); 2229 // 2230 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2231 MachineInstrBuilder Sign; 2232 if (Signed && SrcLT == S32) { 2233 // However, a 32-bit floating point number has only 23 bits mantissa and 2234 // it's not enough to hold all the significant bits of `lof` if val is 2235 // negative. To avoid the loss of precision, We need to take the absolute 2236 // value after truncating and flip the result back based on the original 2237 // signedness. 2238 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2239 Trunc = B.buildFAbs(S32, Trunc, Flags); 2240 } 2241 MachineInstrBuilder K0, K1; 2242 if (SrcLT == S64) { 2243 K0 = B.buildFConstant(S64, 2244 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2245 K1 = B.buildFConstant(S64, 2246 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2247 } else { 2248 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2249 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2250 } 2251 2252 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2253 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2254 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2255 2256 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2257 : B.buildFPTOUI(S32, FloorMul); 2258 auto Lo = B.buildFPTOUI(S32, Fma); 2259 2260 if (Signed && SrcLT == S32) { 2261 // Flip the result based on the signedness, which is either all 0s or 1s. 2262 Sign = B.buildMerge(S64, {Sign, Sign}); 2263 // r := xor({lo, hi}, sign) - sign; 2264 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2265 } else 2266 B.buildMerge(Dst, {Lo, Hi}); 2267 MI.eraseFromParent(); 2268 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2273 MachineInstr &MI) const { 2274 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2275 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2276 2277 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2278 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2279 2280 // With ieee_mode disabled, the instructions have the correct behavior 2281 // already for G_FMINNUM/G_FMAXNUM 2282 if (!MFI->getMode().IEEE) 2283 return !IsIEEEOp; 2284 2285 if (IsIEEEOp) 2286 return true; 2287 2288 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2289 } 2290 2291 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2292 MachineInstr &MI, MachineRegisterInfo &MRI, 2293 MachineIRBuilder &B) const { 2294 // TODO: Should move some of this into LegalizerHelper. 
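// Only constant indices are handled here: the vector is fully unmerged and
// the selected element copied out, while a constant index past the end of the
// vector folds to an implicit_def. Dynamic indices are left in place and
// later selected via register indexing.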
2295 2296 // TODO: Promote dynamic indexing of s16 to s32 2297 2298 // FIXME: Artifact combiner probably should have replaced the truncated 2299 // constant before this, so we shouldn't need 2300 // getIConstantVRegValWithLookThrough. 2301 Optional<ValueAndVReg> MaybeIdxVal = 2302 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2303 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2304 return true; 2305 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2306 2307 Register Dst = MI.getOperand(0).getReg(); 2308 Register Vec = MI.getOperand(1).getReg(); 2309 2310 LLT VecTy = MRI.getType(Vec); 2311 LLT EltTy = VecTy.getElementType(); 2312 assert(EltTy == MRI.getType(Dst)); 2313 2314 if (IdxVal < VecTy.getNumElements()) { 2315 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2316 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2317 } else { 2318 B.buildUndef(Dst); 2319 } 2320 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 // TODO: Should move some of this into LegalizerHelper. 2329 2330 // TODO: Promote dynamic indexing of s16 to s32 2331 2332 // FIXME: Artifact combiner probably should have replaced the truncated 2333 // constant before this, so we shouldn't need 2334 // getIConstantVRegValWithLookThrough. 2335 Optional<ValueAndVReg> MaybeIdxVal = 2336 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2338 return true; 2339 2340 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Vec = MI.getOperand(1).getReg(); 2343 Register Ins = MI.getOperand(2).getReg(); 2344 2345 LLT VecTy = MRI.getType(Vec); 2346 LLT EltTy = VecTy.getElementType(); 2347 assert(EltTy == MRI.getType(Ins)); 2348 (void)Ins; 2349 2350 unsigned NumElts = VecTy.getNumElements(); 2351 if (IdxVal < NumElts) { 2352 SmallVector<Register, 8> SrcRegs; 2353 for (unsigned i = 0; i < NumElts; ++i) 2354 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2355 B.buildUnmerge(SrcRegs, Vec); 2356 2357 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2358 B.buildMerge(Dst, SrcRegs); 2359 } else { 2360 B.buildUndef(Dst); 2361 } 2362 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2368 MachineInstr &MI, MachineRegisterInfo &MRI, 2369 MachineIRBuilder &B) const { 2370 const LLT V2S16 = LLT::fixed_vector(2, 16); 2371 2372 Register Dst = MI.getOperand(0).getReg(); 2373 Register Src0 = MI.getOperand(1).getReg(); 2374 LLT DstTy = MRI.getType(Dst); 2375 LLT SrcTy = MRI.getType(Src0); 2376 2377 if (SrcTy == V2S16 && DstTy == V2S16 && 2378 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2379 return true; 2380 2381 MachineIRBuilder HelperBuilder(MI); 2382 GISelObserverWrapper DummyObserver; 2383 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2384 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizeSinCos( 2388 MachineInstr &MI, MachineRegisterInfo &MRI, 2389 MachineIRBuilder &B) const { 2390 2391 Register DstReg = MI.getOperand(0).getReg(); 2392 Register SrcReg = MI.getOperand(1).getReg(); 2393 LLT Ty = MRI.getType(DstReg); 2394 unsigned Flags = MI.getFlags(); 2395 2396 Register TrigVal; 2397 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2398 if (ST.hasTrigReducedRange()) { 2399 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2400 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2401 .addUse(MulVal.getReg(0)) 2402 .setMIFlags(Flags).getReg(0); 2403 } else 2404 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2405 2406 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2407 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2408 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2409 .addUse(TrigVal) 2410 .setMIFlags(Flags); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2416 MachineIRBuilder &B, 2417 const GlobalValue *GV, 2418 int64_t Offset, 2419 unsigned GAFlags) const { 2420 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2421 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2422 // to the following code sequence: 2423 // 2424 // For constant address space: 2425 // s_getpc_b64 s[0:1] 2426 // s_add_u32 s0, s0, $symbol 2427 // s_addc_u32 s1, s1, 0 2428 // 2429 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2430 // a fixup or relocation is emitted to replace $symbol with a literal 2431 // constant, which is a pc-relative offset from the encoding of the $symbol 2432 // operand to the global variable. 2433 // 2434 // For global address space: 2435 // s_getpc_b64 s[0:1] 2436 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2437 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2438 // 2439 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2440 // fixups or relocations are emitted to replace $symbol@*@lo and 2441 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2442 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2443 // operand to the global variable. 2444 // 2445 // What we want here is an offset from the value returned by s_getpc 2446 // (which is the address of the s_add_u32 instruction) to the global 2447 // variable, but since the encoding of $symbol starts 4 bytes after the start 2448 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2449 // small. This requires us to add 4 to the global variable offset in order to 2450 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2451 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2452 // instruction. 2453 2454 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2455 2456 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2457 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2458 2459 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2460 .addDef(PCReg); 2461 2462 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2463 if (GAFlags == SIInstrInfo::MO_NONE) 2464 MIB.addImm(0); 2465 else 2466 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2467 2468 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2469 2470 if (PtrTy.getSizeInBits() == 32) 2471 B.buildExtract(DstReg, PCReg, 0); 2472 return true; 2473 } 2474 2475 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2476 MachineInstr &MI, MachineRegisterInfo &MRI, 2477 MachineIRBuilder &B) const { 2478 Register DstReg = MI.getOperand(0).getReg(); 2479 LLT Ty = MRI.getType(DstReg); 2480 unsigned AS = Ty.getAddressSpace(); 2481 2482 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2483 MachineFunction &MF = B.getMF(); 2484 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2485 2486 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2487 if (!MFI->isModuleEntryFunction() && 2488 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2489 const Function &Fn = MF.getFunction(); 2490 DiagnosticInfoUnsupported BadLDSDecl( 2491 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2492 DS_Warning); 2493 Fn.getContext().diagnose(BadLDSDecl); 2494 2495 // We currently don't have a way to correctly allocate LDS objects that 2496 // aren't directly associated with a kernel. We do force inlining of 2497 // functions that use local objects. However, if these dead functions are 2498 // not eliminated, we don't want a compile time error. Just emit a warning 2499 // and a trap, since there should be no callable path here. 2500 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2501 B.buildUndef(DstReg); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 // TODO: We could emit code to handle the initialization somewhere. 2507 // We ignore the initializer for now and legalize it to allow selection. 2508 // The initializer will anyway get errored out during assembly emission. 2509 const SITargetLowering *TLI = ST.getTargetLowering(); 2510 if (!TLI->shouldUseLDSConstAddress(GV)) { 2511 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2512 return true; // Leave in place; 2513 } 2514 2515 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2516 Type *Ty = GV->getValueType(); 2517 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2518 // zero-sized type in other languages to declare the dynamic shared 2519 // memory which size is not known at the compile time. They will be 2520 // allocated by the runtime and placed directly after the static 2521 // allocated ones. They all share the same offset. 2522 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2523 // Adjust alignment for that dynamic shared memory array. 
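// The dynamic LDS block is laid out immediately after all statically
// allocated LDS, so its address is the total static LDS size, which is what
// llvm.amdgcn.groupstaticsize evaluates to below.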
2524 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2525 LLT S32 = LLT::scalar(32); 2526 auto Sz = 2527 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2528 B.buildIntToPtr(DstReg, Sz); 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 } 2533 2534 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2535 *cast<GlobalVariable>(GV))); 2536 MI.eraseFromParent(); 2537 return true; 2538 } 2539 2540 const SITargetLowering *TLI = ST.getTargetLowering(); 2541 2542 if (TLI->shouldEmitFixup(GV)) { 2543 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 if (TLI->shouldEmitPCReloc(GV)) { 2549 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2555 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2556 2557 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2558 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2559 MachinePointerInfo::getGOT(MF), 2560 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2561 MachineMemOperand::MOInvariant, 2562 LoadTy, Align(8)); 2563 2564 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2565 2566 if (Ty.getSizeInBits() == 32) { 2567 // Truncate if this is a 32-bit constant address. 2568 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2569 B.buildExtract(DstReg, Load, 0); 2570 } else 2571 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 2577 static LLT widenToNextPowerOf2(LLT Ty) { 2578 if (Ty.isVector()) 2579 return Ty.changeElementCount( 2580 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2581 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2582 } 2583 2584 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2585 MachineInstr &MI) const { 2586 MachineIRBuilder &B = Helper.MIRBuilder; 2587 MachineRegisterInfo &MRI = *B.getMRI(); 2588 GISelChangeObserver &Observer = Helper.Observer; 2589 2590 Register PtrReg = MI.getOperand(1).getReg(); 2591 LLT PtrTy = MRI.getType(PtrReg); 2592 unsigned AddrSpace = PtrTy.getAddressSpace(); 2593 2594 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2595 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2597 Observer.changingInstr(MI); 2598 MI.getOperand(1).setReg(Cast.getReg(0)); 2599 Observer.changedInstr(MI); 2600 return true; 2601 } 2602 2603 if (MI.getOpcode() != AMDGPU::G_LOAD) 2604 return false; 2605 2606 Register ValReg = MI.getOperand(0).getReg(); 2607 LLT ValTy = MRI.getType(ValReg); 2608 2609 MachineMemOperand *MMO = *MI.memoperands_begin(); 2610 const unsigned ValSize = ValTy.getSizeInBits(); 2611 const LLT MemTy = MMO->getMemoryType(); 2612 const Align MemAlign = MMO->getAlign(); 2613 const unsigned MemSize = MemTy.getSizeInBits(); 2614 const uint64_t AlignInBits = 8 * MemAlign.value(); 2615 2616 // Widen non-power-of-2 loads to the alignment if needed 2617 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2618 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2619 2620 // This was already the correct extending load result type, so just adjust 2621 // the memory type. 
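// For instance, a zero-extending load of an s24 memory value into an s32
// result: the memory access is rounded up to 32 bits, which already matches
// the register type, so only the MachineMemOperand needs to be rewritten.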
2622 if (WideMemSize == ValSize) { 2623 MachineFunction &MF = B.getMF(); 2624 2625 MachineMemOperand *WideMMO = 2626 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2627 Observer.changingInstr(MI); 2628 MI.setMemRefs(MF, {WideMMO}); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 // Don't bother handling edge case that should probably never be produced. 2634 if (ValSize > WideMemSize) 2635 return false; 2636 2637 LLT WideTy = widenToNextPowerOf2(ValTy); 2638 2639 Register WideLoad; 2640 if (!WideTy.isVector()) { 2641 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2642 B.buildTrunc(ValReg, WideLoad).getReg(0); 2643 } else { 2644 // Extract the subvector. 2645 2646 if (isRegisterType(ValTy)) { 2647 // If this a case where G_EXTRACT is legal, use it. 2648 // (e.g. <3 x s32> -> <4 x s32>) 2649 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2650 B.buildExtract(ValReg, WideLoad, 0); 2651 } else { 2652 // For cases where the widened type isn't a nice register value, unmerge 2653 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2654 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2655 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2656 } 2657 } 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 return false; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFMad( 2667 MachineInstr &MI, MachineRegisterInfo &MRI, 2668 MachineIRBuilder &B) const { 2669 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2670 assert(Ty.isScalar()); 2671 2672 MachineFunction &MF = B.getMF(); 2673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2674 2675 // TODO: Always legal with future ftz flag. 2676 // FIXME: Do we need just output? 2677 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2678 return true; 2679 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2680 return true; 2681 2682 MachineIRBuilder HelperBuilder(MI); 2683 GISelObserverWrapper DummyObserver; 2684 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2685 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2686 } 2687 2688 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2690 Register DstReg = MI.getOperand(0).getReg(); 2691 Register PtrReg = MI.getOperand(1).getReg(); 2692 Register CmpVal = MI.getOperand(2).getReg(); 2693 Register NewVal = MI.getOperand(3).getReg(); 2694 2695 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2696 "this should not have been custom lowered"); 2697 2698 LLT ValTy = MRI.getType(CmpVal); 2699 LLT VecTy = LLT::fixed_vector(2, ValTy); 2700 2701 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2702 2703 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2704 .addDef(DstReg) 2705 .addUse(PtrReg) 2706 .addUse(PackedVal) 2707 .setMemRefs(MI.memoperands()); 2708 2709 MI.eraseFromParent(); 2710 return true; 2711 } 2712 2713 bool AMDGPULegalizerInfo::legalizeFlog( 2714 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2715 Register Dst = MI.getOperand(0).getReg(); 2716 Register Src = MI.getOperand(1).getReg(); 2717 LLT Ty = B.getMRI()->getType(Dst); 2718 unsigned Flags = MI.getFlags(); 2719 2720 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2721 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2722 2723 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2724 
MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2729 MachineIRBuilder &B) const { 2730 Register Dst = MI.getOperand(0).getReg(); 2731 Register Src = MI.getOperand(1).getReg(); 2732 unsigned Flags = MI.getFlags(); 2733 LLT Ty = B.getMRI()->getType(Dst); 2734 2735 auto K = B.buildFConstant(Ty, numbers::log2e); 2736 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2737 B.buildFExp2(Dst, Mul, Flags); 2738 MI.eraseFromParent(); 2739 return true; 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2743 MachineIRBuilder &B) const { 2744 Register Dst = MI.getOperand(0).getReg(); 2745 Register Src0 = MI.getOperand(1).getReg(); 2746 Register Src1 = MI.getOperand(2).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 LLT Ty = B.getMRI()->getType(Dst); 2749 const LLT S16 = LLT::scalar(16); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 if (Ty == S32) { 2753 auto Log = B.buildFLog2(S32, Src0, Flags); 2754 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2755 .addUse(Log.getReg(0)) 2756 .addUse(Src1) 2757 .setMIFlags(Flags); 2758 B.buildFExp2(Dst, Mul, Flags); 2759 } else if (Ty == S16) { 2760 // There's no f16 fmul_legacy, so we need to convert for it. 2761 auto Log = B.buildFLog2(S16, Src0, Flags); 2762 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2763 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2764 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2765 .addUse(Ext0.getReg(0)) 2766 .addUse(Ext1.getReg(0)) 2767 .setMIFlags(Flags); 2768 2769 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2770 } else 2771 return false; 2772 2773 MI.eraseFromParent(); 2774 return true; 2775 } 2776 2777 // Find a source register, ignoring any possible source modifiers. 2778 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2779 Register ModSrc = OrigSrc; 2780 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2781 ModSrc = SrcFNeg->getOperand(1).getReg(); 2782 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2783 ModSrc = SrcFAbs->getOperand(1).getReg(); 2784 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2785 ModSrc = SrcFAbs->getOperand(1).getReg(); 2786 return ModSrc; 2787 } 2788 2789 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2790 MachineRegisterInfo &MRI, 2791 MachineIRBuilder &B) const { 2792 2793 const LLT S1 = LLT::scalar(1); 2794 const LLT S64 = LLT::scalar(64); 2795 Register Dst = MI.getOperand(0).getReg(); 2796 Register OrigSrc = MI.getOperand(1).getReg(); 2797 unsigned Flags = MI.getFlags(); 2798 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2799 "this should not have been custom lowered"); 2800 2801 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2802 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2803 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2804 // V_FRACT bug is: 2805 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2806 // 2807 // Convert floor(x) to (x - fract(x)) 2808 2809 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2810 .addUse(OrigSrc) 2811 .setMIFlags(Flags); 2812 2813 // Give source modifier matching some assistance before obscuring a foldable 2814 // pattern. 2815 2816 // TODO: We can avoid the neg on the fract? The input sign to fract 2817 // shouldn't matter? 
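// The nan check is insensitive to fneg/fabs, so it operates on the stripped
// source to avoid obscuring a pattern the selector could fold as source
// modifiers. The clamp constant used below, 0x3fefffffffffffff, is the
// largest double strictly less than 1.0, matching the
// min(..., 0.99999999999999999) in the workaround formula above.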
2818 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2819 2820 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2821 2822 Register Min = MRI.createGenericVirtualRegister(S64); 2823 2824 // We don't need to concern ourselves with the snan handling difference, so 2825 // use the one which will directly select. 2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2827 if (MFI->getMode().IEEE) 2828 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2829 else 2830 B.buildFMinNum(Min, Fract, Const, Flags); 2831 2832 Register CorrectedFract = Min; 2833 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2834 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2835 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2836 } 2837 2838 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2839 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2840 2841 MI.eraseFromParent(); 2842 return true; 2843 } 2844 2845 // Turn an illegal packed v2s16 build vector into bit operations. 2846 // TODO: This should probably be a bitcast action in LegalizerHelper. 2847 bool AMDGPULegalizerInfo::legalizeBuildVector( 2848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2849 Register Dst = MI.getOperand(0).getReg(); 2850 const LLT S32 = LLT::scalar(32); 2851 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2852 2853 Register Src0 = MI.getOperand(1).getReg(); 2854 Register Src1 = MI.getOperand(2).getReg(); 2855 assert(MRI.getType(Src0) == LLT::scalar(16)); 2856 2857 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2858 B.buildBitcast(Dst, Merge); 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 } 2863 2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2866 // case with a single min instruction instead of a compare+select. 2867 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2868 MachineRegisterInfo &MRI, 2869 MachineIRBuilder &B) const { 2870 Register Dst = MI.getOperand(0).getReg(); 2871 Register Src = MI.getOperand(1).getReg(); 2872 LLT DstTy = MRI.getType(Dst); 2873 LLT SrcTy = MRI.getType(Src); 2874 2875 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2876 ? AMDGPU::G_AMDGPU_FFBH_U32 2877 : AMDGPU::G_AMDGPU_FFBL_B32; 2878 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2879 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2880 2881 MI.eraseFromParent(); 2882 return true; 2883 } 2884 2885 // Check that this is a G_XOR x, -1 2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2887 if (MI.getOpcode() != TargetOpcode::G_XOR) 2888 return false; 2889 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2890 return ConstVal && *ConstVal == -1; 2891 } 2892 2893 // Return the use branch instruction, otherwise null if the usage is invalid. 
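// The condition may also be consumed through a single (G_XOR x, -1); in that
// case the xor is erased, \p Negated is set, and the G_BRCOND using the
// negated value is checked instead. \p UncondBrTarget is set to the target of
// the following G_BR, or to the layout successor when the G_BRCOND is the
// last instruction in the block.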
2894 static MachineInstr * 2895 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2896 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2897 Register CondDef = MI.getOperand(0).getReg(); 2898 if (!MRI.hasOneNonDBGUse(CondDef)) 2899 return nullptr; 2900 2901 MachineBasicBlock *Parent = MI.getParent(); 2902 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2903 2904 if (isNot(MRI, *UseMI)) { 2905 Register NegatedCond = UseMI->getOperand(0).getReg(); 2906 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2907 return nullptr; 2908 2909 // We're deleting the def of this value, so we need to remove it. 2910 eraseInstr(*UseMI, MRI); 2911 2912 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2913 Negated = true; 2914 } 2915 2916 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2917 return nullptr; 2918 2919 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2920 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2921 if (Next == Parent->end()) { 2922 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2923 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2924 return nullptr; 2925 UncondBrTarget = &*NextMBB; 2926 } else { 2927 if (Next->getOpcode() != AMDGPU::G_BR) 2928 return nullptr; 2929 Br = &*Next; 2930 UncondBrTarget = Br->getOperand(0).getMBB(); 2931 } 2932 2933 return UseMI; 2934 } 2935 2936 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2937 const ArgDescriptor *Arg, 2938 const TargetRegisterClass *ArgRC, 2939 LLT ArgTy) const { 2940 MCRegister SrcReg = Arg->getRegister(); 2941 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2942 assert(DstReg.isVirtual() && "Virtual register expected"); 2943 2944 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2945 *ArgRC, B.getDebugLoc(), ArgTy); 2946 if (Arg->isMasked()) { 2947 // TODO: Should we try to emit this once in the entry block? 2948 const LLT S32 = LLT::scalar(32); 2949 const unsigned Mask = Arg->getMask(); 2950 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2951 2952 Register AndMaskSrc = LiveIn; 2953 2954 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2955 // 0. 2956 if (Shift != 0) { 2957 auto ShiftAmt = B.buildConstant(S32, Shift); 2958 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2959 } 2960 2961 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2962 } else { 2963 B.buildCopy(DstReg, LiveIn); 2964 } 2965 2966 return true; 2967 } 2968 2969 bool AMDGPULegalizerInfo::loadInputValue( 2970 Register DstReg, MachineIRBuilder &B, 2971 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2972 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2973 const ArgDescriptor *Arg; 2974 const TargetRegisterClass *ArgRC; 2975 LLT ArgTy; 2976 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2977 2978 if (!Arg) { 2979 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2980 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2981 // case the pointer argument may be missing and we use null. 2982 B.buildConstant(DstReg, 0); 2983 return true; 2984 } 2985 2986 // It's undefined behavior if a function marked with the amdgpu-no-* 2987 // attributes uses the corresponding intrinsic. 
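// Lower the missing input to an implicit_def rather than reporting an error;
// any use of the value is already undefined behavior.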
2988 B.buildUndef(DstReg); 2989 return true; 2990 } 2991 2992 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2993 return false; // TODO: Handle these 2994 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2995 } 2996 2997 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2999 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3000 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3001 return false; 3002 3003 MI.eraseFromParent(); 3004 return true; 3005 } 3006 3007 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3008 int64_t Offset) const { 3009 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3010 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3011 3012 // TODO: If we passed in the base kernel offset we could have a better 3013 // alignment than 4, but we don't really need it. 3014 if (!loadInputValue(KernArgReg, B, 3015 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3016 llvm_unreachable("failed to find kernarg segment ptr"); 3017 3018 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3019 // TODO: Should get nuw 3020 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3021 } 3022 3023 /// Legalize a value that's loaded from kernel arguments. This is only used by 3024 /// legacy intrinsics. 3025 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3026 MachineIRBuilder &B, 3027 uint64_t Offset, 3028 Align Alignment) const { 3029 Register DstReg = MI.getOperand(0).getReg(); 3030 3031 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3032 "unexpected kernarg parameter type"); 3033 3034 Register Ptr = getKernargParameterPtr(B, Offset); 3035 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3036 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3037 MachineMemOperand::MODereferenceable | 3038 MachineMemOperand::MOInvariant); 3039 MI.eraseFromParent(); 3040 return true; 3041 } 3042 3043 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3044 MachineRegisterInfo &MRI, 3045 MachineIRBuilder &B) const { 3046 Register Dst = MI.getOperand(0).getReg(); 3047 LLT DstTy = MRI.getType(Dst); 3048 LLT S16 = LLT::scalar(16); 3049 LLT S32 = LLT::scalar(32); 3050 LLT S64 = LLT::scalar(64); 3051 3052 if (DstTy == S16) 3053 return legalizeFDIV16(MI, MRI, B); 3054 if (DstTy == S32) 3055 return legalizeFDIV32(MI, MRI, B); 3056 if (DstTy == S64) 3057 return legalizeFDIV64(MI, MRI, B); 3058 3059 return false; 3060 } 3061 3062 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3063 Register DstDivReg, 3064 Register DstRemReg, 3065 Register X, 3066 Register Y) const { 3067 const LLT S1 = LLT::scalar(1); 3068 const LLT S32 = LLT::scalar(32); 3069 3070 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3071 // algorithm used here. 3072 3073 // Initial estimate of inv(y). 3074 auto FloatY = B.buildUITOFP(S32, Y); 3075 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3076 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3077 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3078 auto Z = B.buildFPTOUI(S32, ScaledY); 3079 3080 // One round of UNR. 3081 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3082 auto NegYZ = B.buildMul(S32, NegY, Z); 3083 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3084 3085 // Quotient/remainder estimate. 
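// Z now approximates 2^32 / Y, so the high half of X * Z is an estimate of
// the quotient that may fall slightly short of the true value; the two
// refinement steps below each conditionally bump Q by one and reduce R by Y.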
3086 auto Q = B.buildUMulH(S32, X, Z); 3087 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3088 3089 // First quotient/remainder refinement. 3090 auto One = B.buildConstant(S32, 1); 3091 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3092 if (DstDivReg) 3093 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3094 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3095 3096 // Second quotient/remainder refinement. 3097 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3098 if (DstDivReg) 3099 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3100 3101 if (DstRemReg) 3102 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3103 } 3104 3105 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3106 // 3107 // Return lo, hi of result 3108 // 3109 // %cvt.lo = G_UITOFP Val.lo 3110 // %cvt.hi = G_UITOFP Val.hi 3111 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3112 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3113 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3114 // %mul2 = G_FMUL %mul1, 2**(-32) 3115 // %trunc = G_INTRINSIC_TRUNC %mul2 3116 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3117 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3118 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3119 Register Val) { 3120 const LLT S32 = LLT::scalar(32); 3121 auto Unmerge = B.buildUnmerge(S32, Val); 3122 3123 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3124 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3125 3126 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3127 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3128 3129 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3130 auto Mul1 = 3131 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3132 3133 // 2**(-32) 3134 auto Mul2 = 3135 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3136 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3137 3138 // -(2**32) 3139 auto Mad2 = B.buildFMAD(S32, Trunc, 3140 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3141 3142 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3143 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3144 3145 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3146 } 3147 3148 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3149 Register DstDivReg, 3150 Register DstRemReg, 3151 Register Numer, 3152 Register Denom) const { 3153 const LLT S32 = LLT::scalar(32); 3154 const LLT S64 = LLT::scalar(64); 3155 const LLT S1 = LLT::scalar(1); 3156 Register RcpLo, RcpHi; 3157 3158 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3159 3160 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3161 3162 auto Zero64 = B.buildConstant(S64, 0); 3163 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3164 3165 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3166 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3167 3168 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3169 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3170 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3171 3172 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3173 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3174 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3175 3176 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3177 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3178 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3179 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3180 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3181 3182 auto Zero32 = B.buildConstant(S32, 0); 3183 auto 
Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3184 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3185 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3186 3187 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3188 Register NumerLo = UnmergeNumer.getReg(0); 3189 Register NumerHi = UnmergeNumer.getReg(1); 3190 3191 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3192 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3193 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3194 Register Mul3_Lo = UnmergeMul3.getReg(0); 3195 Register Mul3_Hi = UnmergeMul3.getReg(1); 3196 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3197 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3198 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3199 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3200 3201 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3202 Register DenomLo = UnmergeDenom.getReg(0); 3203 Register DenomHi = UnmergeDenom.getReg(1); 3204 3205 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3206 auto C1 = B.buildSExt(S32, CmpHi); 3207 3208 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3209 auto C2 = B.buildSExt(S32, CmpLo); 3210 3211 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3212 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3213 3214 // TODO: Here and below portions of the code can be enclosed into if/endif. 3215 // Currently control flow is unconditional and we have 4 selects after 3216 // potential endif to substitute PHIs. 3217 3218 // if C3 != 0 ... 3219 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3220 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3221 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3222 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3223 3224 auto One64 = B.buildConstant(S64, 1); 3225 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3226 3227 auto C4 = 3228 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3229 auto C5 = 3230 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3231 auto C6 = B.buildSelect( 3232 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3233 3234 // if (C6 != 0) 3235 auto Add4 = B.buildAdd(S64, Add3, One64); 3236 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3237 3238 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3239 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3240 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3241 3242 // endif C6 3243 // endif C3 3244 3245 if (DstDivReg) { 3246 auto Sel1 = B.buildSelect( 3247 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3248 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3249 Sel1, MulHi3); 3250 } 3251 3252 if (DstRemReg) { 3253 auto Sel2 = B.buildSelect( 3254 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3255 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3256 Sel2, Sub1); 3257 } 3258 } 3259 3260 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3261 MachineRegisterInfo &MRI, 3262 MachineIRBuilder &B) const { 3263 Register DstDivReg, DstRemReg; 3264 switch (MI.getOpcode()) { 3265 default: 3266 llvm_unreachable("Unexpected opcode!"); 3267 case AMDGPU::G_UDIV: { 3268 DstDivReg = MI.getOperand(0).getReg(); 3269 break; 3270 } 3271 case AMDGPU::G_UREM: { 3272 DstRemReg = MI.getOperand(0).getReg(); 
3273 break; 3274 } 3275 case AMDGPU::G_UDIVREM: { 3276 DstDivReg = MI.getOperand(0).getReg(); 3277 DstRemReg = MI.getOperand(1).getReg(); 3278 break; 3279 } 3280 } 3281 3282 const LLT S64 = LLT::scalar(64); 3283 const LLT S32 = LLT::scalar(32); 3284 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3285 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3286 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3287 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3288 3289 if (Ty == S32) 3290 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3291 else if (Ty == S64) 3292 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3293 else 3294 return false; 3295 3296 MI.eraseFromParent(); 3297 return true; 3298 } 3299 3300 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3301 MachineRegisterInfo &MRI, 3302 MachineIRBuilder &B) const { 3303 const LLT S64 = LLT::scalar(64); 3304 const LLT S32 = LLT::scalar(32); 3305 3306 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3307 if (Ty != S32 && Ty != S64) 3308 return false; 3309 3310 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3311 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3312 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3313 3314 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3315 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3316 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3317 3318 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3319 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3320 3321 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3322 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3323 3324 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3325 switch (MI.getOpcode()) { 3326 default: 3327 llvm_unreachable("Unexpected opcode!"); 3328 case AMDGPU::G_SDIV: { 3329 DstDivReg = MI.getOperand(0).getReg(); 3330 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3331 break; 3332 } 3333 case AMDGPU::G_SREM: { 3334 DstRemReg = MI.getOperand(0).getReg(); 3335 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3336 break; 3337 } 3338 case AMDGPU::G_SDIVREM: { 3339 DstDivReg = MI.getOperand(0).getReg(); 3340 DstRemReg = MI.getOperand(1).getReg(); 3341 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3342 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3343 break; 3344 } 3345 } 3346 3347 if (Ty == S32) 3348 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3349 else 3350 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3351 3352 if (DstDivReg) { 3353 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3354 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3355 B.buildSub(DstDivReg, SignXor, Sign); 3356 } 3357 3358 if (DstRemReg) { 3359 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3360 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3361 B.buildSub(DstRemReg, SignXor, Sign); 3362 } 3363 3364 MI.eraseFromParent(); 3365 return true; 3366 } 3367 3368 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3369 MachineRegisterInfo &MRI, 3370 MachineIRBuilder &B) const { 3371 Register Res = MI.getOperand(0).getReg(); 3372 Register LHS = MI.getOperand(1).getReg(); 3373 Register RHS = MI.getOperand(2).getReg(); 3374 uint16_t Flags = MI.getFlags(); 3375 LLT ResTy = MRI.getType(Res); 3376 3377 const MachineFunction &MF = B.getMF(); 3378 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3379 MI.getFlag(MachineInstr::FmAfn); 3380 3381 if 
(!AllowInaccurateRcp) 3382 return false; 3383 3384 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3385 // 1 / x -> RCP(x) 3386 if (CLHS->isExactlyValue(1.0)) { 3387 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3388 .addUse(RHS) 3389 .setMIFlags(Flags); 3390 3391 MI.eraseFromParent(); 3392 return true; 3393 } 3394 3395 // -1 / x -> RCP( FNEG(x) ) 3396 if (CLHS->isExactlyValue(-1.0)) { 3397 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3398 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3399 .addUse(FNeg.getReg(0)) 3400 .setMIFlags(Flags); 3401 3402 MI.eraseFromParent(); 3403 return true; 3404 } 3405 } 3406 3407 // x / y -> x * (1.0 / y) 3408 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3409 .addUse(RHS) 3410 .setMIFlags(Flags); 3411 B.buildFMul(Res, LHS, RCP, Flags); 3412 3413 MI.eraseFromParent(); 3414 return true; 3415 } 3416 3417 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3418 MachineRegisterInfo &MRI, 3419 MachineIRBuilder &B) const { 3420 Register Res = MI.getOperand(0).getReg(); 3421 Register X = MI.getOperand(1).getReg(); 3422 Register Y = MI.getOperand(2).getReg(); 3423 uint16_t Flags = MI.getFlags(); 3424 LLT ResTy = MRI.getType(Res); 3425 3426 const MachineFunction &MF = B.getMF(); 3427 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3428 MI.getFlag(MachineInstr::FmAfn); 3429 3430 if (!AllowInaccurateRcp) 3431 return false; 3432 3433 auto NegY = B.buildFNeg(ResTy, Y); 3434 auto One = B.buildFConstant(ResTy, 1.0); 3435 3436 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3437 .addUse(Y) 3438 .setMIFlags(Flags); 3439 3440 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3441 R = B.buildFMA(ResTy, Tmp0, R, R); 3442 3443 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3444 R = B.buildFMA(ResTy, Tmp1, R, R); 3445 3446 auto Ret = B.buildFMul(ResTy, X, R); 3447 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3448 3449 B.buildFMA(Res, Tmp2, R, Ret); 3450 MI.eraseFromParent(); 3451 return true; 3452 } 3453 3454 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3455 MachineRegisterInfo &MRI, 3456 MachineIRBuilder &B) const { 3457 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3458 return true; 3459 3460 Register Res = MI.getOperand(0).getReg(); 3461 Register LHS = MI.getOperand(1).getReg(); 3462 Register RHS = MI.getOperand(2).getReg(); 3463 3464 uint16_t Flags = MI.getFlags(); 3465 3466 LLT S16 = LLT::scalar(16); 3467 LLT S32 = LLT::scalar(32); 3468 3469 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3470 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3471 3472 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3473 .addUse(RHSExt.getReg(0)) 3474 .setMIFlags(Flags); 3475 3476 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3477 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3478 3479 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3480 .addUse(RDst.getReg(0)) 3481 .addUse(RHS) 3482 .addUse(LHS) 3483 .setMIFlags(Flags); 3484 3485 MI.eraseFromParent(); 3486 return true; 3487 } 3488 3489 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3490 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3491 static void toggleSPDenormMode(bool Enable, 3492 MachineIRBuilder &B, 3493 const GCNSubtarget &ST, 3494 AMDGPU::SIModeRegisterDefaults Mode) { 3495 // Set SP denorm mode to this value. 3496 unsigned SPDenormMode = 3497 Enable ? 
FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3498 3499 if (ST.hasDenormModeInst()) { 3500 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3501 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3502 3503 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3504 B.buildInstr(AMDGPU::S_DENORM_MODE) 3505 .addImm(NewDenormModeValue); 3506 3507 } else { 3508 // Select FP32 bit field in mode register. 3509 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3510 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3511 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3512 3513 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3514 .addImm(SPDenormMode) 3515 .addImm(SPDenormModeBitField); 3516 } 3517 } 3518 3519 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3520 MachineRegisterInfo &MRI, 3521 MachineIRBuilder &B) const { 3522 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3523 return true; 3524 3525 Register Res = MI.getOperand(0).getReg(); 3526 Register LHS = MI.getOperand(1).getReg(); 3527 Register RHS = MI.getOperand(2).getReg(); 3528 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3529 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3530 3531 uint16_t Flags = MI.getFlags(); 3532 3533 LLT S32 = LLT::scalar(32); 3534 LLT S1 = LLT::scalar(1); 3535 3536 auto One = B.buildFConstant(S32, 1.0f); 3537 3538 auto DenominatorScaled = 3539 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3540 .addUse(LHS) 3541 .addUse(RHS) 3542 .addImm(0) 3543 .setMIFlags(Flags); 3544 auto NumeratorScaled = 3545 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3546 .addUse(LHS) 3547 .addUse(RHS) 3548 .addImm(1) 3549 .setMIFlags(Flags); 3550 3551 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3552 .addUse(DenominatorScaled.getReg(0)) 3553 .setMIFlags(Flags); 3554 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3555 3556 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3557 // aren't modeled as reading it. 
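  // Note on the sequence below: Fma0/Fma1 perform one Newton-Raphson step on
  // the approximate reciprocal, Mul forms an initial quotient, and Fma2..Fma4
  // compute residual-based refinements of it. amdgcn.div.fmas then folds in
  // the div_scale result and amdgcn.div.fixup restores the special-case
  // behavior. FP32 denormals are enabled around the FMAs so the intermediate
  // values are not flushed.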
3558 if (!Mode.allFP32Denormals()) 3559 toggleSPDenormMode(true, B, ST, Mode); 3560 3561 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3562 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3563 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3564 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3565 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3566 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3567 3568 if (!Mode.allFP32Denormals()) 3569 toggleSPDenormMode(false, B, ST, Mode); 3570 3571 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3572 .addUse(Fma4.getReg(0)) 3573 .addUse(Fma1.getReg(0)) 3574 .addUse(Fma3.getReg(0)) 3575 .addUse(NumeratorScaled.getReg(1)) 3576 .setMIFlags(Flags); 3577 3578 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3579 .addUse(Fmas.getReg(0)) 3580 .addUse(RHS) 3581 .addUse(LHS) 3582 .setMIFlags(Flags); 3583 3584 MI.eraseFromParent(); 3585 return true; 3586 } 3587 3588 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3589 MachineRegisterInfo &MRI, 3590 MachineIRBuilder &B) const { 3591 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3592 return true; 3593 3594 Register Res = MI.getOperand(0).getReg(); 3595 Register LHS = MI.getOperand(1).getReg(); 3596 Register RHS = MI.getOperand(2).getReg(); 3597 3598 uint16_t Flags = MI.getFlags(); 3599 3600 LLT S64 = LLT::scalar(64); 3601 LLT S1 = LLT::scalar(1); 3602 3603 auto One = B.buildFConstant(S64, 1.0); 3604 3605 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3606 .addUse(LHS) 3607 .addUse(RHS) 3608 .addImm(0) 3609 .setMIFlags(Flags); 3610 3611 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3612 3613 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3614 .addUse(DivScale0.getReg(0)) 3615 .setMIFlags(Flags); 3616 3617 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3618 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3619 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3620 3621 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3622 .addUse(LHS) 3623 .addUse(RHS) 3624 .addImm(1) 3625 .setMIFlags(Flags); 3626 3627 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3628 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3629 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3630 3631 Register Scale; 3632 if (!ST.hasUsableDivScaleConditionOutput()) { 3633 // Workaround a hardware bug on SI where the condition output from div_scale 3634 // is not usable. 
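    // Instead, compare the high halves of the original operands against the
    // high halves of the div_scale results and XOR the two compares; this
    // reconstructs the condition that is fed to amdgcn.div.fmas below.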
3635 3636 LLT S32 = LLT::scalar(32); 3637 3638 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3639 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3640 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3641 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3642 3643 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3644 Scale1Unmerge.getReg(1)); 3645 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3646 Scale0Unmerge.getReg(1)); 3647 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3648 } else { 3649 Scale = DivScale1.getReg(1); 3650 } 3651 3652 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3653 .addUse(Fma4.getReg(0)) 3654 .addUse(Fma3.getReg(0)) 3655 .addUse(Mul.getReg(0)) 3656 .addUse(Scale) 3657 .setMIFlags(Flags); 3658 3659 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3660 .addUse(Fmas.getReg(0)) 3661 .addUse(RHS) 3662 .addUse(LHS) 3663 .setMIFlags(Flags); 3664 3665 MI.eraseFromParent(); 3666 return true; 3667 } 3668 3669 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3670 MachineRegisterInfo &MRI, 3671 MachineIRBuilder &B) const { 3672 Register Res = MI.getOperand(0).getReg(); 3673 Register LHS = MI.getOperand(2).getReg(); 3674 Register RHS = MI.getOperand(3).getReg(); 3675 uint16_t Flags = MI.getFlags(); 3676 3677 LLT S32 = LLT::scalar(32); 3678 LLT S1 = LLT::scalar(1); 3679 3680 auto Abs = B.buildFAbs(S32, RHS, Flags); 3681 const APFloat C0Val(1.0f); 3682 3683 auto C0 = B.buildConstant(S32, 0x6f800000); 3684 auto C1 = B.buildConstant(S32, 0x2f800000); 3685 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3686 3687 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3688 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3689 3690 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3691 3692 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3693 .addUse(Mul0.getReg(0)) 3694 .setMIFlags(Flags); 3695 3696 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3697 3698 B.buildFMul(Res, Sel, Mul1, Flags); 3699 3700 MI.eraseFromParent(); 3701 return true; 3702 } 3703 3704 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3705 // FIXME: Why do we handle this one but not other removed instructions? 3706 // 3707 // Reciprocal square root. The clamp prevents infinite results, clamping 3708 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3709 // +-max_float. 3710 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3711 MachineRegisterInfo &MRI, 3712 MachineIRBuilder &B) const { 3713 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3714 return true; 3715 3716 Register Dst = MI.getOperand(0).getReg(); 3717 Register Src = MI.getOperand(2).getReg(); 3718 auto Flags = MI.getFlags(); 3719 3720 LLT Ty = MRI.getType(Dst); 3721 3722 const fltSemantics *FltSemantics; 3723 if (Ty == LLT::scalar(32)) 3724 FltSemantics = &APFloat::IEEEsingle(); 3725 else if (Ty == LLT::scalar(64)) 3726 FltSemantics = &APFloat::IEEEdouble(); 3727 else 3728 return false; 3729 3730 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3731 .addUse(Src) 3732 .setMIFlags(Flags); 3733 3734 // We don't need to concern ourselves with the snan handling difference, since 3735 // the rsq quieted (or not) so use the one which will directly select. 
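  // Clamp the rsq result into [-max_float, +max_float], using the IEEE
  // variants of fminnum/fmaxnum when the function runs in IEEE mode so the
  // clamp maps onto the instructions that will be selected.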
3736 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3737 const bool UseIEEE = MFI->getMode().IEEE; 3738 3739 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3740 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3741 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3742 3743 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3744 3745 if (UseIEEE) 3746 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3747 else 3748 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3749 MI.eraseFromParent(); 3750 return true; 3751 } 3752 3753 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3754 switch (IID) { 3755 case Intrinsic::amdgcn_ds_fadd: 3756 return AMDGPU::G_ATOMICRMW_FADD; 3757 case Intrinsic::amdgcn_ds_fmin: 3758 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3759 case Intrinsic::amdgcn_ds_fmax: 3760 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3761 default: 3762 llvm_unreachable("not a DS FP intrinsic"); 3763 } 3764 } 3765 3766 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3767 MachineInstr &MI, 3768 Intrinsic::ID IID) const { 3769 GISelChangeObserver &Observer = Helper.Observer; 3770 Observer.changingInstr(MI); 3771 3772 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3773 3774 // The remaining operands were used to set fields in the MemOperand on 3775 // construction. 3776 for (int I = 6; I > 3; --I) 3777 MI.removeOperand(I); 3778 3779 MI.removeOperand(1); // Remove the intrinsic ID. 3780 Observer.changedInstr(MI); 3781 return true; 3782 } 3783 3784 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3785 MachineRegisterInfo &MRI, 3786 MachineIRBuilder &B) const { 3787 uint64_t Offset = 3788 ST.getTargetLowering()->getImplicitParameterOffset( 3789 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3790 LLT DstTy = MRI.getType(DstReg); 3791 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3792 3793 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3794 if (!loadInputValue(KernargPtrReg, B, 3795 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3796 return false; 3797 3798 // FIXME: This should be nuw 3799 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3800 return true; 3801 } 3802 3803 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3804 MachineRegisterInfo &MRI, 3805 MachineIRBuilder &B) const { 3806 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3807 if (!MFI->isEntryFunction()) { 3808 return legalizePreloadedArgIntrin(MI, MRI, B, 3809 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3810 } 3811 3812 Register DstReg = MI.getOperand(0).getReg(); 3813 if (!getImplicitArgPtr(DstReg, MRI, B)) 3814 return false; 3815 3816 MI.eraseFromParent(); 3817 return true; 3818 } 3819 3820 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3821 MachineRegisterInfo &MRI, 3822 MachineIRBuilder &B, 3823 unsigned AddrSpace) const { 3824 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3825 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3826 Register Hi32 = Unmerge.getReg(1); 3827 3828 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3829 MI.eraseFromParent(); 3830 return true; 3831 } 3832 3833 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3834 // offset (the offset that is included in bounds checking and swizzling, to be 3835 // split between the instruction's voffset and immoffset fields) 
and soffset 3836 // (the offset that is excluded from bounds checking and swizzling, to go in 3837 // the instruction's soffset field). This function takes the first kind of 3838 // offset and figures out how to split it between voffset and immoffset. 3839 std::pair<Register, unsigned> 3840 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3841 Register OrigOffset) const { 3842 const unsigned MaxImm = 4095; 3843 Register BaseReg; 3844 unsigned ImmOffset; 3845 const LLT S32 = LLT::scalar(32); 3846 MachineRegisterInfo &MRI = *B.getMRI(); 3847 3848 std::tie(BaseReg, ImmOffset) = 3849 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3850 3851 // If BaseReg is a pointer, convert it to int. 3852 if (MRI.getType(BaseReg).isPointer()) 3853 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3854 3855 // If the immediate value is too big for the immoffset field, put the value 3856 // and -4096 into the immoffset field so that the value that is copied/added 3857 // for the voffset field is a multiple of 4096, and it stands more chance 3858 // of being CSEd with the copy/add for another similar load/store. 3859 // However, do not do that rounding down to a multiple of 4096 if that is a 3860 // negative number, as it appears to be illegal to have a negative offset 3861 // in the vgpr, even if adding the immediate offset makes it positive. 3862 unsigned Overflow = ImmOffset & ~MaxImm; 3863 ImmOffset -= Overflow; 3864 if ((int32_t)Overflow < 0) { 3865 Overflow += ImmOffset; 3866 ImmOffset = 0; 3867 } 3868 3869 if (Overflow != 0) { 3870 if (!BaseReg) { 3871 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3872 } else { 3873 auto OverflowVal = B.buildConstant(S32, Overflow); 3874 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3875 } 3876 } 3877 3878 if (!BaseReg) 3879 BaseReg = B.buildConstant(S32, 0).getReg(0); 3880 3881 return std::make_pair(BaseReg, ImmOffset); 3882 } 3883 3884 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3885 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3886 Register VOffset, Register SOffset, 3887 unsigned ImmOffset, Register VIndex, 3888 MachineRegisterInfo &MRI) const { 3889 Optional<ValueAndVReg> MaybeVOffsetVal = 3890 getIConstantVRegValWithLookThrough(VOffset, MRI); 3891 Optional<ValueAndVReg> MaybeSOffsetVal = 3892 getIConstantVRegValWithLookThrough(SOffset, MRI); 3893 Optional<ValueAndVReg> MaybeVIndexVal = 3894 getIConstantVRegValWithLookThrough(VIndex, MRI); 3895 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3896 // update the MMO with that offset. The stride is unknown so we can only do 3897 // this if VIndex is constant 0. 3898 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3899 MaybeVIndexVal->Value == 0) { 3900 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3901 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3902 MMO->setOffset(TotalOffset); 3903 } else { 3904 // We don't have a constant combined offset to use in the MMO. Give up. 3905 MMO->setValue((Value *)nullptr); 3906 } 3907 } 3908 3909 /// Handle register layout difference for f16 images for some subtargets. 
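/// On subtargets with unpacked D16 memory instructions each s16 element is
/// widened to s32. On subtargets with the image-store D16 bug, the data is
/// padded and bitcast into the wider layout the hardware expects (for example
/// <3 x s16> is padded to <6 x s16> and bitcast to <3 x s32>). Otherwise a
/// <3 x s16> value is simply padded to <4 x s16>.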
3910 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3911 MachineRegisterInfo &MRI, 3912 Register Reg, 3913 bool ImageStore) const { 3914 const LLT S16 = LLT::scalar(16); 3915 const LLT S32 = LLT::scalar(32); 3916 LLT StoreVT = MRI.getType(Reg); 3917 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3918 3919 if (ST.hasUnpackedD16VMem()) { 3920 auto Unmerge = B.buildUnmerge(S16, Reg); 3921 3922 SmallVector<Register, 4> WideRegs; 3923 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3924 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3925 3926 int NumElts = StoreVT.getNumElements(); 3927 3928 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3929 .getReg(0); 3930 } 3931 3932 if (ImageStore && ST.hasImageStoreD16Bug()) { 3933 if (StoreVT.getNumElements() == 2) { 3934 SmallVector<Register, 4> PackedRegs; 3935 Reg = B.buildBitcast(S32, Reg).getReg(0); 3936 PackedRegs.push_back(Reg); 3937 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3938 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3939 .getReg(0); 3940 } 3941 3942 if (StoreVT.getNumElements() == 3) { 3943 SmallVector<Register, 4> PackedRegs; 3944 auto Unmerge = B.buildUnmerge(S16, Reg); 3945 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3946 PackedRegs.push_back(Unmerge.getReg(I)); 3947 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3948 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3949 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3950 } 3951 3952 if (StoreVT.getNumElements() == 4) { 3953 SmallVector<Register, 4> PackedRegs; 3954 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3955 auto Unmerge = B.buildUnmerge(S32, Reg); 3956 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3957 PackedRegs.push_back(Unmerge.getReg(I)); 3958 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3959 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3960 .getReg(0); 3961 } 3962 3963 llvm_unreachable("invalid data type"); 3964 } 3965 3966 if (StoreVT == LLT::fixed_vector(3, S16)) { 3967 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3968 .getReg(0); 3969 } 3970 return Reg; 3971 } 3972 3973 Register AMDGPULegalizerInfo::fixStoreSourceType( 3974 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3975 MachineRegisterInfo *MRI = B.getMRI(); 3976 LLT Ty = MRI->getType(VData); 3977 3978 const LLT S16 = LLT::scalar(16); 3979 3980 // Fixup illegal register types for i8 stores. 
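  // s8 and s16 sources are anyextended to s32; the byte/short buffer store
  // instructions only consume the low bits of the 32-bit source register.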
3981 if (Ty == LLT::scalar(8) || Ty == S16) { 3982 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3983 return AnyExt; 3984 } 3985 3986 if (Ty.isVector()) { 3987 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3988 if (IsFormat) 3989 return handleD16VData(B, *MRI, VData); 3990 } 3991 } 3992 3993 return VData; 3994 } 3995 3996 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3997 MachineRegisterInfo &MRI, 3998 MachineIRBuilder &B, 3999 bool IsTyped, 4000 bool IsFormat) const { 4001 Register VData = MI.getOperand(1).getReg(); 4002 LLT Ty = MRI.getType(VData); 4003 LLT EltTy = Ty.getScalarType(); 4004 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4005 const LLT S32 = LLT::scalar(32); 4006 4007 VData = fixStoreSourceType(B, VData, IsFormat); 4008 Register RSrc = MI.getOperand(2).getReg(); 4009 4010 MachineMemOperand *MMO = *MI.memoperands_begin(); 4011 const int MemSize = MMO->getSize(); 4012 4013 unsigned ImmOffset; 4014 4015 // The typed intrinsics add an immediate after the registers. 4016 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4017 4018 // The struct intrinsic variants add one additional operand over raw. 4019 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4020 Register VIndex; 4021 int OpOffset = 0; 4022 if (HasVIndex) { 4023 VIndex = MI.getOperand(3).getReg(); 4024 OpOffset = 1; 4025 } else { 4026 VIndex = B.buildConstant(S32, 0).getReg(0); 4027 } 4028 4029 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4030 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4031 4032 unsigned Format = 0; 4033 if (IsTyped) { 4034 Format = MI.getOperand(5 + OpOffset).getImm(); 4035 ++OpOffset; 4036 } 4037 4038 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4039 4040 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4041 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4042 4043 unsigned Opc; 4044 if (IsTyped) { 4045 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4046 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4047 } else if (IsFormat) { 4048 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4049 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4050 } else { 4051 switch (MemSize) { 4052 case 1: 4053 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4054 break; 4055 case 2: 4056 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4057 break; 4058 default: 4059 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4060 break; 4061 } 4062 } 4063 4064 auto MIB = B.buildInstr(Opc) 4065 .addUse(VData) // vdata 4066 .addUse(RSrc) // rsrc 4067 .addUse(VIndex) // vindex 4068 .addUse(VOffset) // voffset 4069 .addUse(SOffset) // soffset 4070 .addImm(ImmOffset); // offset(imm) 4071 4072 if (IsTyped) 4073 MIB.addImm(Format); 4074 4075 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4076 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4077 .addMemOperand(MMO); 4078 4079 MI.eraseFromParent(); 4080 return true; 4081 } 4082 4083 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4084 MachineRegisterInfo &MRI, 4085 MachineIRBuilder &B, 4086 bool IsFormat, 4087 bool IsTyped) const { 4088 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4089 MachineMemOperand *MMO = *MI.memoperands_begin(); 4090 const LLT MemTy = MMO->getMemoryType(); 4091 const LLT S32 = LLT::scalar(32); 4092 4093 Register Dst = MI.getOperand(0).getReg(); 4094 Register RSrc = MI.getOperand(2).getReg(); 4095 4096 // The typed intrinsics add an immediate after the registers. 
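  // Operand layout (after the result and intrinsic ID): rsrc, [vindex for the
  // struct variants], voffset, soffset, [format for the typed variants],
  // auxiliary data. HasVIndex is inferred purely from the operand count.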
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg) // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result register was widened for an extending load; truncate it back
    // to the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4205 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4206 B.buildInstr(Opc) 4207 .addDef(MI.getOperand(0).getReg()) 4208 .addUse(MI.getOperand(2).getReg()) 4209 .addUse(MI.getOperand(3).getReg()) 4210 .cloneMemRefs(MI); 4211 MI.eraseFromParent(); 4212 return true; 4213 } 4214 4215 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4216 switch (IntrID) { 4217 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4218 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4220 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4221 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4223 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4224 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4226 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4227 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4228 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4229 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4230 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4231 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4232 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4233 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4235 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4236 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4237 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4238 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4239 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4240 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4241 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4242 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4243 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4244 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4245 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4246 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4247 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4248 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4249 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4250 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4251 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4252 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4253 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4254 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4255 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4256 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4257 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4258 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4259 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4260 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4261 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4262 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4263 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4264 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4265 default: 4266 llvm_unreachable("unhandled atomic opcode"); 4267 } 4268 } 4269 4270 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4271 MachineIRBuilder &B, 4272 Intrinsic::ID IID) const { 4273 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4274 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4275 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4276 4277 Register Dst; 4278 4279 int OpOffset = 0; 4280 if (HasReturn) { 4281 // A few FP atomics do not support return values. 
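    // With a result, operand 0 is the destination and the sources start at
    // operand 2; without one, OpOffset shifts every source index down by one.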
4282 Dst = MI.getOperand(0).getReg(); 4283 } else { 4284 OpOffset = -1; 4285 } 4286 4287 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4288 Register CmpVal; 4289 4290 if (IsCmpSwap) { 4291 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4292 ++OpOffset; 4293 } 4294 4295 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4296 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4297 4298 // The struct intrinsic variants add one additional operand over raw. 4299 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4300 Register VIndex; 4301 if (HasVIndex) { 4302 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4303 ++OpOffset; 4304 } else { 4305 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4306 } 4307 4308 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4309 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4310 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4311 4312 MachineMemOperand *MMO = *MI.memoperands_begin(); 4313 4314 unsigned ImmOffset; 4315 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4316 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4317 4318 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4319 4320 if (HasReturn) 4321 MIB.addDef(Dst); 4322 4323 MIB.addUse(VData); // vdata 4324 4325 if (IsCmpSwap) 4326 MIB.addReg(CmpVal); 4327 4328 MIB.addUse(RSrc) // rsrc 4329 .addUse(VIndex) // vindex 4330 .addUse(VOffset) // voffset 4331 .addUse(SOffset) // soffset 4332 .addImm(ImmOffset) // offset(imm) 4333 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4334 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4335 .addMemOperand(MMO); 4336 4337 MI.eraseFromParent(); 4338 return true; 4339 } 4340 4341 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4342 /// vector with s16 typed elements. 4343 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4344 SmallVectorImpl<Register> &PackedAddrs, 4345 unsigned ArgOffset, 4346 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4347 bool IsA16, bool IsG16) { 4348 const LLT S16 = LLT::scalar(16); 4349 const LLT V2S16 = LLT::fixed_vector(2, 16); 4350 auto EndIdx = Intr->VAddrEnd; 4351 4352 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4353 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4354 if (!SrcOp.isReg()) 4355 continue; // _L to _LZ may have eliminated this. 4356 4357 Register AddrReg = SrcOp.getReg(); 4358 4359 if ((I < Intr->GradientStart) || 4360 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4361 (I >= Intr->CoordStart && !IsA16)) { 4362 if ((I < Intr->GradientStart) && IsA16 && 4363 (B.getMRI()->getType(AddrReg) == S16)) { 4364 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4365 // Special handling of bias when A16 is on. Bias is of type half but 4366 // occupies full 32-bit. 4367 PackedAddrs.push_back( 4368 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4369 .getReg(0)); 4370 } else { 4371 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4372 "Bias needs to be converted to 16 bit in A16 mode"); 4373 // Handle any gradient or coordinate operands that should not be packed 4374 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4375 PackedAddrs.push_back(AddrReg); 4376 } 4377 } else { 4378 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4379 // derivatives dx/dh and dx/dv are packed with undef. 
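      // Pack consecutive 16-bit operands two at a time into <2 x s16>; an
      // operand with no partner (end of the list, the gradient boundary noted
      // above, or a following non-register operand) is paired with undef.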
4380 if (((I + 1) >= EndIdx) || 4381 ((Intr->NumGradients / 2) % 2 == 1 && 4382 (I == static_cast<unsigned>(Intr->GradientStart + 4383 (Intr->NumGradients / 2) - 1) || 4384 I == static_cast<unsigned>(Intr->GradientStart + 4385 Intr->NumGradients - 1))) || 4386 // Check for _L to _LZ optimization 4387 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4388 PackedAddrs.push_back( 4389 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4390 .getReg(0)); 4391 } else { 4392 PackedAddrs.push_back( 4393 B.buildBuildVector( 4394 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4395 .getReg(0)); 4396 ++I; 4397 } 4398 } 4399 } 4400 } 4401 4402 /// Convert from separate vaddr components to a single vector address register, 4403 /// and replace the remaining operands with $noreg. 4404 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4405 int DimIdx, int NumVAddrs) { 4406 const LLT S32 = LLT::scalar(32); 4407 4408 SmallVector<Register, 8> AddrRegs; 4409 for (int I = 0; I != NumVAddrs; ++I) { 4410 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4411 if (SrcOp.isReg()) { 4412 AddrRegs.push_back(SrcOp.getReg()); 4413 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4414 } 4415 } 4416 4417 int NumAddrRegs = AddrRegs.size(); 4418 if (NumAddrRegs != 1) { 4419 // Above 8 elements round up to next power of 2 (i.e. 16). 4420 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4421 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4422 auto Undef = B.buildUndef(S32); 4423 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4424 NumAddrRegs = RoundedNumRegs; 4425 } 4426 4427 auto VAddr = 4428 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4429 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4430 } 4431 4432 for (int I = 1; I != NumVAddrs; ++I) { 4433 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4434 if (SrcOp.isReg()) 4435 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4436 } 4437 } 4438 4439 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4440 /// 4441 /// Depending on the subtarget, load/store with 16-bit element data need to be 4442 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4443 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4444 /// registers. 4445 /// 4446 /// We don't want to directly select image instructions just yet, but also want 4447 /// to exposes all register repacking to the legalizer/combiners. We also don't 4448 /// want a selected instruction entering RegBankSelect. In order to avoid 4449 /// defining a multitude of intermediate image instructions, directly hack on 4450 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4451 /// padding now unnecessary arguments with $noreg. 4452 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4453 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4454 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4455 4456 const unsigned NumDefs = MI.getNumExplicitDefs(); 4457 const unsigned ArgOffset = NumDefs + 1; 4458 bool IsTFE = NumDefs == 2; 4459 // We are only processing the operands of d16 image operations on subtargets 4460 // that use the unpacked register layout, or need to repack the TFE result. 4461 4462 // TODO: Do we need to guard against already legalized intrinsics? 
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
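  // 16-bit addresses are only usable when the subtarget has A16, and mixing
  // 16-bit gradients with 32-bit coordinates additionally requires G16. Once
  // the 16-bit operands are packed into dwords, they are concatenated into a
  // single vector address operand unless the NSA encoding is used.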
4540 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4541 // 16 bit gradients are supported, but are tied to the A16 control 4542 // so both gradients and addresses must be 16 bit 4543 return false; 4544 } 4545 4546 if (IsA16 && !ST.hasA16()) { 4547 // A16 not supported 4548 return false; 4549 } 4550 4551 if (IsA16 || IsG16) { 4552 if (Intr->NumVAddrs > 1) { 4553 SmallVector<Register, 4> PackedRegs; 4554 4555 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4556 IsG16); 4557 4558 // See also below in the non-a16 branch 4559 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4560 PackedRegs.size() <= ST.getNSAMaxSize(); 4561 4562 if (!UseNSA && PackedRegs.size() > 1) { 4563 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4564 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4565 PackedRegs[0] = Concat.getReg(0); 4566 PackedRegs.resize(1); 4567 } 4568 4569 const unsigned NumPacked = PackedRegs.size(); 4570 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4571 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4572 if (!SrcOp.isReg()) { 4573 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4574 continue; 4575 } 4576 4577 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4578 4579 if (I - Intr->VAddrStart < NumPacked) 4580 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4581 else 4582 SrcOp.setReg(AMDGPU::NoRegister); 4583 } 4584 } 4585 } else { 4586 // If the register allocator cannot place the address registers contiguously 4587 // without introducing moves, then using the non-sequential address encoding 4588 // is always preferable, since it saves VALU instructions and is usually a 4589 // wash in terms of code size or even better. 4590 // 4591 // However, we currently have no way of hinting to the register allocator 4592 // that MIMG addresses should be placed contiguously when it is possible to 4593 // do so, so force non-NSA for the common 2-address case as a heuristic. 4594 // 4595 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4596 // allocation when possible. 4597 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4598 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4599 4600 if (!UseNSA && Intr->NumVAddrs > 1) 4601 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4602 Intr->NumVAddrs); 4603 } 4604 4605 int Flags = 0; 4606 if (IsA16) 4607 Flags |= 1; 4608 if (IsG16) 4609 Flags |= 2; 4610 MI.addOperand(MachineOperand::CreateImm(Flags)); 4611 4612 if (BaseOpcode->Store) { // No TFE for stores? 4613 // TODO: Handle dmask trim 4614 if (!Ty.isVector() || !IsD16) 4615 return true; 4616 4617 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4618 if (RepackedReg != VData) { 4619 MI.getOperand(1).setReg(RepackedReg); 4620 } 4621 4622 return true; 4623 } 4624 4625 Register DstReg = MI.getOperand(0).getReg(); 4626 const LLT EltTy = Ty.getScalarType(); 4627 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4628 4629 // Confirm that the return type is large enough for the dmask specified 4630 if (NumElts < DMaskLanes) 4631 return false; 4632 4633 if (NumElts > 4 || DMaskLanes > 4) 4634 return false; 4635 4636 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4637 const LLT AdjustedTy = 4638 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4639 4640 // The raw dword aligned data component of the load. 
The only legal cases 4641 // where this matters should be when using the packed D16 format, for 4642 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4643 LLT RoundedTy; 4644 4645 // S32 vector to to cover all data, plus TFE result element. 4646 LLT TFETy; 4647 4648 // Register type to use for each loaded component. Will be S32 or V2S16. 4649 LLT RegTy; 4650 4651 if (IsD16 && ST.hasUnpackedD16VMem()) { 4652 RoundedTy = 4653 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4654 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4655 RegTy = S32; 4656 } else { 4657 unsigned EltSize = EltTy.getSizeInBits(); 4658 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4659 unsigned RoundedSize = 32 * RoundedElts; 4660 RoundedTy = LLT::scalarOrVector( 4661 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4662 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4663 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4664 } 4665 4666 // The return type does not need adjustment. 4667 // TODO: Should we change s16 case to s32 or <2 x s16>? 4668 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4669 return true; 4670 4671 Register Dst1Reg; 4672 4673 // Insert after the instruction. 4674 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4675 4676 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4677 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4678 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4679 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4680 4681 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4682 4683 MI.getOperand(0).setReg(NewResultReg); 4684 4685 // In the IR, TFE is supposed to be used with a 2 element struct return 4686 // type. The instruction really returns these two values in one contiguous 4687 // register, with one additional dword beyond the loaded data. Rewrite the 4688 // return type to use a single register result. 4689 4690 if (IsTFE) { 4691 Dst1Reg = MI.getOperand(1).getReg(); 4692 if (MRI->getType(Dst1Reg) != S32) 4693 return false; 4694 4695 // TODO: Make sure the TFE operand bit is set. 4696 MI.removeOperand(1); 4697 4698 // Handle the easy case that requires no repack instructions. 4699 if (Ty == S32) { 4700 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4701 return true; 4702 } 4703 } 4704 4705 // Now figure out how to copy the new result register back into the old 4706 // result. 4707 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4708 4709 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4710 4711 if (ResultNumRegs == 1) { 4712 assert(!IsTFE); 4713 ResultRegs[0] = NewResultReg; 4714 } else { 4715 // We have to repack into a new vector of some kind. 4716 for (int I = 0; I != NumDataRegs; ++I) 4717 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4718 B.buildUnmerge(ResultRegs, NewResultReg); 4719 4720 // Drop the final TFE element to get the data part. The TFE result is 4721 // directly written to the right place already. 4722 if (IsTFE) 4723 ResultRegs.resize(NumDataRegs); 4724 } 4725 4726 // For an s16 scalar result, we form an s32 result with a truncate regardless 4727 // of packed vs. unpacked. 4728 if (IsD16 && !Ty.isVector()) { 4729 B.buildTrunc(DstReg, ResultRegs[0]); 4730 return true; 4731 } 4732 4733 // Avoid a build/concat_vector of 1 entry. 
4734 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4735 B.buildBitcast(DstReg, ResultRegs[0]); 4736 return true; 4737 } 4738 4739 assert(Ty.isVector()); 4740 4741 if (IsD16) { 4742 // For packed D16 results with TFE enabled, all the data components are 4743 // S32. Cast back to the expected type. 4744 // 4745 // TODO: We don't really need to use load s32 elements. We would only need one 4746 // cast for the TFE result if a multiple of v2s16 was used. 4747 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4748 for (Register &Reg : ResultRegs) 4749 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4750 } else if (ST.hasUnpackedD16VMem()) { 4751 for (Register &Reg : ResultRegs) 4752 Reg = B.buildTrunc(S16, Reg).getReg(0); 4753 } 4754 } 4755 4756 auto padWithUndef = [&](LLT Ty, int NumElts) { 4757 if (NumElts == 0) 4758 return; 4759 Register Undef = B.buildUndef(Ty).getReg(0); 4760 for (int I = 0; I != NumElts; ++I) 4761 ResultRegs.push_back(Undef); 4762 }; 4763 4764 // Pad out any elements eliminated due to the dmask. 4765 LLT ResTy = MRI->getType(ResultRegs[0]); 4766 if (!ResTy.isVector()) { 4767 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4768 B.buildBuildVector(DstReg, ResultRegs); 4769 return true; 4770 } 4771 4772 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4773 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4774 4775 // Deal with the one annoying legal case. 4776 const LLT V3S16 = LLT::fixed_vector(3, 16); 4777 if (Ty == V3S16) { 4778 if (IsTFE) { 4779 if (ResultRegs.size() == 1) { 4780 NewResultReg = ResultRegs[0]; 4781 } else if (ResultRegs.size() == 2) { 4782 LLT V4S16 = LLT::fixed_vector(4, 16); 4783 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4784 } else { 4785 return false; 4786 } 4787 } 4788 4789 if (MRI->getType(DstReg).getNumElements() < 4790 MRI->getType(NewResultReg).getNumElements()) { 4791 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4792 } else { 4793 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4794 } 4795 return true; 4796 } 4797 4798 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4799 B.buildConcatVectors(DstReg, ResultRegs); 4800 return true; 4801 } 4802 4803 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4804 LegalizerHelper &Helper, MachineInstr &MI) const { 4805 MachineIRBuilder &B = Helper.MIRBuilder; 4806 GISelChangeObserver &Observer = Helper.Observer; 4807 4808 Register Dst = MI.getOperand(0).getReg(); 4809 LLT Ty = B.getMRI()->getType(Dst); 4810 unsigned Size = Ty.getSizeInBits(); 4811 MachineFunction &MF = B.getMF(); 4812 4813 Observer.changingInstr(MI); 4814 4815 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4816 Ty = getBitcastRegisterType(Ty); 4817 Helper.bitcastDst(MI, Ty, 0); 4818 Dst = MI.getOperand(0).getReg(); 4819 B.setInsertPt(B.getMBB(), MI); 4820 } 4821 4822 // FIXME: We don't really need this intermediate instruction. The intrinsic 4823 // should be fixed to have a memory operand. Since it's readnone, we're not 4824 // allowed to add one. 4825 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4826 MI.removeOperand(1); // Remove intrinsic ID 4827 4828 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4829 // TODO: Should this use datalayout alignment? 
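  // Synthesize a load/dereferenceable/invariant memory operand sized to the
  // result, with 4-byte alignment. Non-power-of-2 results (e.g. 96-bit loads)
  // are widened below since there are no 96-bit scalar result loads.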
4830 const unsigned MemSize = (Size + 7) / 8; 4831 const Align MemAlign(4); 4832 MachineMemOperand *MMO = MF.getMachineMemOperand( 4833 MachinePointerInfo(), 4834 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4835 MachineMemOperand::MOInvariant, 4836 MemSize, MemAlign); 4837 MI.addMemOperand(MF, MMO); 4838 4839 // There are no 96-bit result scalar loads, but widening to 128-bit should 4840 // always be legal. We may need to restore this to a 96-bit result if it turns 4841 // out this needs to be converted to a vector load during RegBankSelect. 4842 if (!isPowerOf2_32(Size)) { 4843 if (Ty.isVector()) 4844 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4845 else 4846 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4847 } 4848 4849 Observer.changedInstr(MI); 4850 return true; 4851 } 4852 4853 // TODO: Move to selection 4854 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4855 MachineRegisterInfo &MRI, 4856 MachineIRBuilder &B) const { 4857 if (!ST.isTrapHandlerEnabled() || 4858 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4859 return legalizeTrapEndpgm(MI, MRI, B); 4860 4861 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4862 switch (*HsaAbiVer) { 4863 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4864 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4865 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4866 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4867 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4868 return ST.supportsGetDoorbellID() ? 4869 legalizeTrapHsa(MI, MRI, B) : 4870 legalizeTrapHsaQueuePtr(MI, MRI, B); 4871 } 4872 } 4873 4874 llvm_unreachable("Unknown trap handler"); 4875 } 4876 4877 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4878 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4879 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4880 MI.eraseFromParent(); 4881 return true; 4882 } 4883 4884 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4885 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4886 MachineFunction &MF = B.getMF(); 4887 const LLT S64 = LLT::scalar(64); 4888 4889 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4890 // For code object version 5, queue_ptr is passed through implicit kernarg. 4891 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 4892 AMDGPUTargetLowering::ImplicitParameter Param = 4893 AMDGPUTargetLowering::QUEUE_PTR; 4894 uint64_t Offset = 4895 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 4896 4897 Register KernargPtrReg = MRI.createGenericVirtualRegister( 4898 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4899 4900 if (!loadInputValue(KernargPtrReg, B, 4901 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4902 return false; 4903 4904 // TODO: can we be smarter about machine pointer info? 
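    // For code object v5 the queue pointer is not preloaded; load it from the
    // implicit kernarg segment at the QUEUE_PTR offset and copy it into
    // SGPR0_SGPR1, which the trap handler ABI expects.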
4905 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4906 MachineMemOperand *MMO = MF.getMachineMemOperand( 4907 PtrInfo, 4908 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4909 MachineMemOperand::MOInvariant, 4910 LLT::scalar(64), commonAlignment(Align(64), Offset)); 4911 4912 // Pointer address 4913 Register LoadAddr = MRI.createGenericVirtualRegister( 4914 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4915 B.buildPtrAdd(LoadAddr, KernargPtrReg, 4916 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 4917 // Load address 4918 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 4919 B.buildCopy(SGPR01, Temp); 4920 B.buildInstr(AMDGPU::S_TRAP) 4921 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4922 .addReg(SGPR01, RegState::Implicit); 4923 MI.eraseFromParent(); 4924 return true; 4925 } 4926 4927 // Pass queue pointer to trap handler as input, and insert trap instruction 4928 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4929 Register LiveIn = 4930 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4931 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4932 return false; 4933 4934 B.buildCopy(SGPR01, LiveIn); 4935 B.buildInstr(AMDGPU::S_TRAP) 4936 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4937 .addReg(SGPR01, RegState::Implicit); 4938 4939 MI.eraseFromParent(); 4940 return true; 4941 } 4942 4943 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4944 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4945 B.buildInstr(AMDGPU::S_TRAP) 4946 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4947 MI.eraseFromParent(); 4948 return true; 4949 } 4950 4951 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4952 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4953 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4954 // accordingly 4955 if (!ST.isTrapHandlerEnabled() || 4956 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4957 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4958 "debugtrap handler not supported", 4959 MI.getDebugLoc(), DS_Warning); 4960 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4961 Ctx.diagnose(NoTrap); 4962 } else { 4963 // Insert debug-trap instruction 4964 B.buildInstr(AMDGPU::S_TRAP) 4965 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4966 } 4967 4968 MI.eraseFromParent(); 4969 return true; 4970 } 4971 4972 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4973 MachineIRBuilder &B) const { 4974 MachineRegisterInfo &MRI = *B.getMRI(); 4975 const LLT S16 = LLT::scalar(16); 4976 const LLT S32 = LLT::scalar(32); 4977 4978 Register DstReg = MI.getOperand(0).getReg(); 4979 Register NodePtr = MI.getOperand(2).getReg(); 4980 Register RayExtent = MI.getOperand(3).getReg(); 4981 Register RayOrigin = MI.getOperand(4).getReg(); 4982 Register RayDir = MI.getOperand(5).getReg(); 4983 Register RayInvDir = MI.getOperand(6).getReg(); 4984 Register TDescr = MI.getOperand(7).getReg(); 4985 4986 if (!ST.hasGFX10_AEncoding()) { 4987 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4988 "intrinsic not supported on subtarget", 4989 MI.getDebugLoc()); 4990 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4991 return false; 4992 } 4993 4994 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4995 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4996 const unsigned NumVDataDwords = 4; 4997 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4998 const bool UseNSA = 4999 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 5000 const unsigned BaseOpcodes[2][2] = { 5001 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 5002 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 5003 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 5004 int Opcode; 5005 if (UseNSA) { 5006 Opcode = 5007 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 5008 NumVDataDwords, NumVAddrDwords); 5009 } else { 5010 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 5011 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 5012 PowerOf2Ceil(NumVAddrDwords)); 5013 } 5014 assert(Opcode != -1); 5015 5016 SmallVector<Register, 12> Ops; 5017 if (Is64) { 5018 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 5019 Ops.push_back(Unmerge.getReg(0)); 5020 Ops.push_back(Unmerge.getReg(1)); 5021 } else { 5022 Ops.push_back(NodePtr); 5023 } 5024 Ops.push_back(RayExtent); 5025 5026 auto packLanes = [&Ops, &S32, &B](Register Src) { 5027 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5028 Ops.push_back(Unmerge.getReg(0)); 5029 Ops.push_back(Unmerge.getReg(1)); 5030 Ops.push_back(Unmerge.getReg(2)); 5031 }; 5032 5033 packLanes(RayOrigin); 5034 if (IsA16) { 5035 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5036 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5037 Register R1 = MRI.createGenericVirtualRegister(S32); 5038 Register R2 = MRI.createGenericVirtualRegister(S32); 5039 Register R3 = MRI.createGenericVirtualRegister(S32); 5040 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5041 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5042 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5043 Ops.push_back(R1); 5044 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }
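
      // At this point the rewrite is roughly, for the non-negated case
      // (illustrative sketch only; the exact MIR depends on what the
      // IRTranslator emitted for the branch):
      //   G_BRCOND <intrinsic result>, %CondBrTarget
      // has become
      //   %Def = SI_IF/SI_ELSE %Use, %UncondBrTarget
      //   G_BR %CondBrTarget        ; retargeted or rebuilt just below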
      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
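  // The r600 read_* intrinsics below are lowered by loading from fixed offsets
  // in the kernarg segment (see SI::KernelInputOffsets). Rough shape of the
  // resulting MIR for one of these (illustrative sketch only):
  //   %kernarg:_(p4) = <preloaded KERNARG_SEGMENT_PTR>
  //   %addr:_(p4)    = G_PTR_ADD %kernarg, <offset>
  //   %val:_(s32)    = G_LOAD %addr :: (invariant load)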
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit an error for HSA targets.
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
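  // All of the buffer atomic intrinsics below share a single legalization
  // path; the intrinsic ID is forwarded so legalizeBufferAtomic can pick the
  // matching buffer-atomic operation. The fadd variants are special-cased
  // afterwards to diagnose uses of the return value on subtargets that only
  // support the no-return form.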
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}