//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
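///
/// Illustration (derived from the checks below, and from how the predicate is
/// paired with oneMoreElement() throughout this file; not exhaustive):
///   <3 x s16> (48 bits, odd count, 16-bit elements) -> widened to <4 x s16>
///   <5 x s8>  (40 bits)                             -> widened to <6 x s8>
///   <2 x s16>, <4 x s16>, <3 x s32>, <3 x s1>       -> not matched here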
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, 100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 101 }; 102 } 103 104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 105 return [=](const LegalityQuery &Query) { 106 const LLT Ty = Query.Types[TypeIdx]; 107 const LLT EltTy = Ty.getElementType(); 108 unsigned Size = Ty.getSizeInBits(); 109 unsigned Pieces = (Size + 63) / 64; 110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 111 return std::make_pair( 112 TypeIdx, 113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 114 }; 115 } 116 117 // Increase the number of vector elements to reach the next multiple of 32-bit 118 // type. 119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 120 return [=](const LegalityQuery &Query) { 121 const LLT Ty = Query.Types[TypeIdx]; 122 123 const LLT EltTy = Ty.getElementType(); 124 const int Size = Ty.getSizeInBits(); 125 const int EltSize = EltTy.getSizeInBits(); 126 const int NextMul32 = (Size + 31) / 32; 127 128 assert(EltSize < 32); 129 130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 132 }; 133 } 134 135 static LLT getBitcastRegisterType(const LLT Ty) { 136 const unsigned Size = Ty.getSizeInBits(); 137 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT QueryTy = 
Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
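    // (Summary of the limits returned by this helper, in bits: private is 32,
    // or 128 with flat scratch; local is 64, or 128 with DS128; global and
    // constant are 512 for loads and 128 for stores; everything else,
    // including flat, falls through to the 128 below.)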
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
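  // For example, an <8 x s8> access whose memory type matches (64 bits)
  // returns true here and is bitcast (to <2 x s32>, via
  // getBitcastRegisterType), while <4 x s16> returns false because s16 is
  // already a valid register element type.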
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 uint64_t AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElementsStrict(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElementsStrict(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
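    // (Roughly: a legal s16/s32 G_UADDSAT or G_USUBSAT is expected to select
    // to the corresponding add/sub with the clamp bit set, which saturates
    // the result instead of wrapping.)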
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalar(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 
669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElementsStrict(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
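  // (Roughly: G_FMAD is only kept when the function's denormal mode flushes
  // the relevant denormals, since mad/mac do not respect them; the custom
  // action defers that per-function check to legalizeFMad.)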
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 843 .customFor({S16, S32}) 844 .scalarize(0) 845 .lower(); 846 847 // Lower roundeven into G_FRINT 848 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 849 .scalarize(0) 850 .lower(); 851 852 if (ST.has16BitInsts()) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S16, S32, S64}) 855 .clampScalar(0, S16, S64) 856 .scalarize(0); 857 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32, S64}) 860 .clampScalar(0, S32, S64) 861 .scalarize(0); 862 } else { 863 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 864 .legalFor({S32}) 865 .customFor({S64}) 866 .clampScalar(0, S32, S64) 867 .scalarize(0); 868 } 869 870 getActionDefinitionsBuilder(G_PTR_ADD) 871 .legalIf(all(isPointer(0), sameSize(0, 1))) 872 .scalarize(0) 873 .scalarSameSizeAs(1, 0); 874 875 getActionDefinitionsBuilder(G_PTRMASK) 876 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 877 .scalarSameSizeAs(1, 0) 878 .scalarize(0); 879 880 auto &CmpBuilder = 881 getActionDefinitionsBuilder(G_ICMP) 882 // The compare output type differs based on the register bank of the output, 883 // so make both s1 and s32 legal. 
884 // 885 // Scalar compares producing output in scc will be promoted to s32, as that 886 // is the allocatable register type that will be needed for the copy from 887 // scc. This will be promoted during RegBankSelect, and we assume something 888 // before that won't try to use s32 result types. 889 // 890 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 891 // bank. 892 .legalForCartesianProduct( 893 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 894 .legalForCartesianProduct( 895 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 896 if (ST.has16BitInsts()) { 897 CmpBuilder.legalFor({{S1, S16}}); 898 } 899 900 CmpBuilder 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0) 904 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 905 906 getActionDefinitionsBuilder(G_FCMP) 907 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 908 .widenScalarToNextPow2(1) 909 .clampScalar(1, S32, S64) 910 .scalarize(0); 911 912 // FIXME: fpow has a selection pattern that should move to custom lowering. 913 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 914 if (ST.has16BitInsts()) 915 Exp2Ops.legalFor({S32, S16}); 916 else 917 Exp2Ops.legalFor({S32}); 918 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 919 Exp2Ops.scalarize(0); 920 921 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 922 if (ST.has16BitInsts()) 923 ExpOps.customFor({{S32}, {S16}}); 924 else 925 ExpOps.customFor({S32}); 926 ExpOps.clampScalar(0, MinScalarFPTy, S32) 927 .scalarize(0); 928 929 getActionDefinitionsBuilder(G_FPOWI) 930 .clampScalar(0, MinScalarFPTy, S32) 931 .lower(); 932 933 // The 64-bit versions produce 32-bit results, but only on the SALU. 934 getActionDefinitionsBuilder(G_CTPOP) 935 .legalFor({{S32, S32}, {S32, S64}}) 936 .clampScalar(0, S32, S32) 937 .widenScalarToNextPow2(1, 32) 938 .clampScalar(1, S32, S64) 939 .scalarize(0) 940 .widenScalarToNextPow2(0, 32); 941 942 943 // The hardware instructions return a different result on 0 than the generic 944 // instructions expect. The hardware produces -1, but these produce the 945 // bitwidth. 946 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 947 .scalarize(0) 948 .clampScalar(0, S32, S32) 949 .clampScalar(1, S32, S64) 950 .widenScalarToNextPow2(0, 32) 951 .widenScalarToNextPow2(1, 32) 952 .custom(); 953 954 // The 64-bit versions produce 32-bit results, but only on the SALU. 955 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 956 .legalFor({{S32, S32}, {S32, S64}}) 957 .clampScalar(0, S32, S32) 958 .clampScalar(1, S32, S64) 959 .scalarize(0) 960 .widenScalarToNextPow2(0, 32) 961 .widenScalarToNextPow2(1, 32); 962 963 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 964 // RegBankSelect. 965 getActionDefinitionsBuilder(G_BITREVERSE) 966 .legalFor({S32, S64}) 967 .clampScalar(0, S32, S64) 968 .scalarize(0) 969 .widenScalarToNextPow2(0); 970 971 if (ST.has16BitInsts()) { 972 getActionDefinitionsBuilder(G_BSWAP) 973 .legalFor({S16, S32, V2S16}) 974 .clampMaxNumElementsStrict(0, S16, 2) 975 // FIXME: Fixing non-power-of-2 before clamp is workaround for 976 // narrowScalar limitation. 
977 .widenScalarToNextPow2(0) 978 .clampScalar(0, S16, S32) 979 .scalarize(0); 980 981 if (ST.hasVOP3PInsts()) { 982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 983 .legalFor({S32, S16, V2S16}) 984 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 985 .clampMaxNumElements(0, S16, 2) 986 .minScalar(0, S16) 987 .widenScalarToNextPow2(0) 988 .scalarize(0) 989 .lower(); 990 } else { 991 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 992 .legalFor({S32, S16}) 993 .widenScalarToNextPow2(0) 994 .minScalar(0, S16) 995 .scalarize(0) 996 .lower(); 997 } 998 } else { 999 // TODO: Should have same legality without v_perm_b32 1000 getActionDefinitionsBuilder(G_BSWAP) 1001 .legalFor({S32}) 1002 .lowerIf(scalarNarrowerThan(0, 32)) 1003 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1004 // narrowScalar limitation. 1005 .widenScalarToNextPow2(0) 1006 .maxScalar(0, S32) 1007 .scalarize(0) 1008 .lower(); 1009 1010 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1011 .legalFor({S32}) 1012 .minScalar(0, S32) 1013 .widenScalarToNextPow2(0) 1014 .scalarize(0) 1015 .lower(); 1016 } 1017 1018 getActionDefinitionsBuilder(G_INTTOPTR) 1019 // List the common cases 1020 .legalForCartesianProduct(AddrSpaces64, {S64}) 1021 .legalForCartesianProduct(AddrSpaces32, {S32}) 1022 .scalarize(0) 1023 // Accept any address space as long as the size matches 1024 .legalIf(sameSize(0, 1)) 1025 .widenScalarIf(smallerThan(1, 0), 1026 [](const LegalityQuery &Query) { 1027 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1028 }) 1029 .narrowScalarIf(largerThan(1, 0), 1030 [](const LegalityQuery &Query) { 1031 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1032 }); 1033 1034 getActionDefinitionsBuilder(G_PTRTOINT) 1035 // List the common cases 1036 .legalForCartesianProduct(AddrSpaces64, {S64}) 1037 .legalForCartesianProduct(AddrSpaces32, {S32}) 1038 .scalarize(0) 1039 // Accept any address space as long as the size matches 1040 .legalIf(sameSize(0, 1)) 1041 .widenScalarIf(smallerThan(0, 1), 1042 [](const LegalityQuery &Query) { 1043 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1044 }) 1045 .narrowScalarIf( 1046 largerThan(0, 1), 1047 [](const LegalityQuery &Query) { 1048 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1049 }); 1050 1051 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1052 .scalarize(0) 1053 .custom(); 1054 1055 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1056 bool IsLoad) -> bool { 1057 const LLT DstTy = Query.Types[0]; 1058 1059 // Split vector extloads. 1060 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1061 1062 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1063 return true; 1064 1065 const LLT PtrTy = Query.Types[1]; 1066 unsigned AS = PtrTy.getAddressSpace(); 1067 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1068 return true; 1069 1070 // Catch weird sized loads that don't evenly divide into the access sizes 1071 // TODO: May be able to widen depending on alignment etc. 1072 unsigned NumRegs = (MemSize + 31) / 32; 1073 if (NumRegs == 3) { 1074 if (!ST.hasDwordx3LoadStores()) 1075 return true; 1076 } else { 1077 // If the alignment allows, these should have been widened. 1078 if (!isPowerOf2_32(NumRegs)) 1079 return true; 1080 } 1081 1082 return false; 1083 }; 1084 1085 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1086 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1087 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1088 1089 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1090 // LDS 1091 // TODO: Unsupported flat for SI. 1092 1093 for (unsigned Op : {G_LOAD, G_STORE}) { 1094 const bool IsStore = Op == G_STORE; 1095 1096 auto &Actions = getActionDefinitionsBuilder(Op); 1097 // Explicitly list some common cases. 1098 // TODO: Does this help compile time at all? 1099 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1100 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1101 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1102 {S64, GlobalPtr, S64, GlobalAlign32}, 1103 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1104 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1105 {S32, GlobalPtr, S8, GlobalAlign8}, 1106 {S32, GlobalPtr, S16, GlobalAlign16}, 1107 1108 {S32, LocalPtr, S32, 32}, 1109 {S64, LocalPtr, S64, 32}, 1110 {V2S32, LocalPtr, V2S32, 32}, 1111 {S32, LocalPtr, S8, 8}, 1112 {S32, LocalPtr, S16, 16}, 1113 {V2S16, LocalPtr, S32, 32}, 1114 1115 {S32, PrivatePtr, S32, 32}, 1116 {S32, PrivatePtr, S8, 8}, 1117 {S32, PrivatePtr, S16, 16}, 1118 {V2S16, PrivatePtr, S32, 32}, 1119 1120 {S32, ConstantPtr, S32, GlobalAlign32}, 1121 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1122 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1123 {S64, ConstantPtr, S64, GlobalAlign32}, 1124 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1125 Actions.legalIf( 1126 [=](const LegalityQuery &Query) -> bool { 1127 return isLoadStoreLegal(ST, Query); 1128 }); 1129 1130 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1131 // 64-bits. 1132 // 1133 // TODO: Should generalize bitcast action into coerce, which will also cover 1134 // inserting addrspacecasts. 1135 Actions.customIf(typeIs(1, Constant32Ptr)); 1136 1137 // Turn any illegal element vectors into something easier to deal 1138 // with. These will ultimately produce 32-bit scalar shifts to extract the 1139 // parts anyway. 1140 // 1141 // For odd 16-bit element vectors, prefer to split those into pieces with 1142 // 16-bit vector parts. 1143 Actions.bitcastIf( 1144 [=](const LegalityQuery &Query) -> bool { 1145 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1146 Query.MMODescrs[0].MemoryTy); 1147 }, bitcastToRegisterType(0)); 1148 1149 if (!IsStore) { 1150 // Widen suitably aligned loads by loading extra bytes. The standard 1151 // legalization actions can't properly express widening memory operands. 1152 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1153 return shouldWidenLoad(ST, Query, G_LOAD); 1154 }); 1155 } 1156 1157 // FIXME: load/store narrowing should be moved to lower action 1158 Actions 1159 .narrowScalarIf( 1160 [=](const LegalityQuery &Query) -> bool { 1161 return !Query.Types[0].isVector() && 1162 needToSplitMemOp(Query, Op == G_LOAD); 1163 }, 1164 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1165 const LLT DstTy = Query.Types[0]; 1166 const LLT PtrTy = Query.Types[1]; 1167 1168 const unsigned DstSize = DstTy.getSizeInBits(); 1169 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1170 1171 // Split extloads. 
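                 // Roughly: a G_LOAD producing s64 from an 8-bit access is
                 // narrowed to an s8 result here, and the 8-to-64-bit
                 // extension is re-legalized separately.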
1172 if (DstSize > MemSize) 1173 return std::make_pair(0, LLT::scalar(MemSize)); 1174 1175 unsigned MaxSize = maxSizeForAddrSpace(ST, 1176 PtrTy.getAddressSpace(), 1177 Op == G_LOAD); 1178 if (MemSize > MaxSize) 1179 return std::make_pair(0, LLT::scalar(MaxSize)); 1180 1181 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1182 return std::make_pair(0, LLT::scalar(Align)); 1183 }) 1184 .fewerElementsIf( 1185 [=](const LegalityQuery &Query) -> bool { 1186 return Query.Types[0].isVector() && 1187 needToSplitMemOp(Query, Op == G_LOAD); 1188 }, 1189 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1190 const LLT DstTy = Query.Types[0]; 1191 const LLT PtrTy = Query.Types[1]; 1192 1193 LLT EltTy = DstTy.getElementType(); 1194 unsigned MaxSize = maxSizeForAddrSpace(ST, 1195 PtrTy.getAddressSpace(), 1196 Op == G_LOAD); 1197 1198 // FIXME: Handle widened to power of 2 results better. This ends 1199 // up scalarizing. 1200 // FIXME: 3 element stores scalarized on SI 1201 1202 // Split if it's too large for the address space. 1203 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1204 if (MemSize > MaxSize) { 1205 unsigned NumElts = DstTy.getNumElements(); 1206 unsigned EltSize = EltTy.getSizeInBits(); 1207 1208 if (MaxSize % EltSize == 0) { 1209 return std::make_pair( 1210 0, LLT::scalarOrVector( 1211 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1212 } 1213 1214 unsigned NumPieces = MemSize / MaxSize; 1215 1216 // FIXME: Refine when odd breakdowns handled 1217 // The scalars will need to be re-legalized. 1218 if (NumPieces == 1 || NumPieces >= NumElts || 1219 NumElts % NumPieces != 0) 1220 return std::make_pair(0, EltTy); 1221 1222 return std::make_pair( 1223 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1224 } 1225 1226 // FIXME: We could probably handle weird extending loads better. 1227 if (DstTy.getSizeInBits() > MemSize) 1228 return std::make_pair(0, EltTy); 1229 1230 unsigned EltSize = EltTy.getSizeInBits(); 1231 unsigned DstSize = DstTy.getSizeInBits(); 1232 if (!isPowerOf2_32(DstSize)) { 1233 // We're probably decomposing an odd sized store. Try to split 1234 // to the widest type. TODO: Account for alignment. As-is it 1235 // should be OK, since the new parts will be further legalized. 1236 unsigned FloorSize = PowerOf2Floor(DstSize); 1237 return std::make_pair( 1238 0, LLT::scalarOrVector( 1239 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1240 } 1241 1242 // May need relegalization for the scalars. 1243 return std::make_pair(0, EltTy); 1244 }) 1245 .minScalar(0, S32) 1246 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1247 .widenScalarToNextPow2(0) 1248 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1249 .lower(); 1250 } 1251 1252 // FIXME: Unaligned accesses not lowered. 1253 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1254 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1255 {S32, GlobalPtr, S16, 2 * 8}, 1256 {S32, LocalPtr, S8, 8}, 1257 {S32, LocalPtr, S16, 16}, 1258 {S32, PrivatePtr, S8, 8}, 1259 {S32, PrivatePtr, S16, 16}, 1260 {S32, ConstantPtr, S8, 8}, 1261 {S32, ConstantPtr, S16, 2 * 8}}) 1262 .legalIf( 1263 [=](const LegalityQuery &Query) -> bool { 1264 return isLoadStoreLegal(ST, Query); 1265 }); 1266 1267 if (ST.hasFlatAddressSpace()) { 1268 ExtLoads.legalForTypesWithMemDesc( 1269 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1270 } 1271 1272 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1273 // 64-bits. 
1274 // 1275 // TODO: Should generalize bitcast action into coerce, which will also cover 1276 // inserting addrspacecasts. 1277 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1278 1279 ExtLoads.clampScalar(0, S32, S32) 1280 .widenScalarToNextPow2(0) 1281 .lower(); 1282 1283 auto &Atomics = getActionDefinitionsBuilder( 1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1287 G_ATOMICRMW_UMIN}) 1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1289 {S64, GlobalPtr}, {S64, LocalPtr}, 1290 {S32, RegionPtr}, {S64, RegionPtr}}); 1291 if (ST.hasFlatAddressSpace()) { 1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1293 } 1294 1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1296 if (ST.hasLDSFPAtomicAdd()) { 1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1298 if (ST.hasGFX90AInsts()) 1299 Atomic.legalFor({{S64, LocalPtr}}); 1300 if (ST.hasGFX940Insts()) 1301 Atomic.legalFor({{V2S16, LocalPtr}}); 1302 } 1303 if (ST.hasAtomicFaddInsts()) 1304 Atomic.legalFor({{S32, GlobalPtr}}); 1305 1306 if (ST.hasGFX90AInsts()) { 1307 // These are legal with some caveats, and should have undergone expansion in 1308 // the IR in most situations 1309 // TODO: Move atomic expansion into legalizer 1310 // TODO: Also supports <2 x f16> 1311 Atomic.legalFor({ 1312 {S32, GlobalPtr}, 1313 {S64, GlobalPtr}, 1314 {S64, FlatPtr} 1315 }); 1316 } 1317 1318 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1319 // demarshalling 1320 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1321 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1322 {S32, FlatPtr}, {S64, FlatPtr}}) 1323 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1324 {S32, RegionPtr}, {S64, RegionPtr}}); 1325 // TODO: Pointer types, any 32-bit or 64-bit vector 1326 1327 // Condition should be s32 for scalar, s1 for vector. 1328 getActionDefinitionsBuilder(G_SELECT) 1329 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1330 LocalPtr, FlatPtr, PrivatePtr, 1331 LLT::fixed_vector(2, LocalPtr), 1332 LLT::fixed_vector(2, PrivatePtr)}, 1333 {S1, S32}) 1334 .clampScalar(0, S16, S64) 1335 .scalarize(1) 1336 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1337 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1338 .clampMaxNumElements(0, S32, 2) 1339 .clampMaxNumElements(0, LocalPtr, 2) 1340 .clampMaxNumElements(0, PrivatePtr, 2) 1341 .scalarize(0) 1342 .widenScalarToNextPow2(0) 1343 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1344 1345 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1346 // be more flexible with the shift amount type. 1347 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1348 .legalFor({{S32, S32}, {S64, S32}}); 1349 if (ST.has16BitInsts()) { 1350 if (ST.hasVOP3PInsts()) { 1351 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1352 .clampMaxNumElements(0, S16, 2); 1353 } else 1354 Shifts.legalFor({{S16, S16}}); 1355 1356 // TODO: Support 16-bit shift amounts for all types 1357 Shifts.widenScalarIf( 1358 [=](const LegalityQuery &Query) { 1359 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1360 // 32-bit amount. 
1361 const LLT ValTy = Query.Types[0]; 1362 const LLT AmountTy = Query.Types[1]; 1363 return ValTy.getSizeInBits() <= 16 && 1364 AmountTy.getSizeInBits() < 16; 1365 }, changeTo(1, S16)); 1366 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1367 Shifts.clampScalar(1, S32, S32); 1368 Shifts.widenScalarToNextPow2(0, 16); 1369 Shifts.clampScalar(0, S16, S64); 1370 1371 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1372 .minScalar(0, S16) 1373 .scalarize(0) 1374 .lower(); 1375 } else { 1376 // Make sure we legalize the shift amount type first, as the general 1377 // expansion for the shifted type will produce much worse code if it hasn't 1378 // been truncated already. 1379 Shifts.clampScalar(1, S32, S32); 1380 Shifts.widenScalarToNextPow2(0, 32); 1381 Shifts.clampScalar(0, S32, S64); 1382 1383 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1384 .minScalar(0, S32) 1385 .scalarize(0) 1386 .lower(); 1387 } 1388 Shifts.scalarize(0); 1389 1390 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1391 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1392 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1393 unsigned IdxTypeIdx = 2; 1394 1395 getActionDefinitionsBuilder(Op) 1396 .customIf([=](const LegalityQuery &Query) { 1397 const LLT EltTy = Query.Types[EltTypeIdx]; 1398 const LLT VecTy = Query.Types[VecTypeIdx]; 1399 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1400 const unsigned EltSize = EltTy.getSizeInBits(); 1401 return (EltSize == 32 || EltSize == 64) && 1402 VecTy.getSizeInBits() % 32 == 0 && 1403 VecTy.getSizeInBits() <= MaxRegisterSize && 1404 IdxTy.getSizeInBits() == 32; 1405 }) 1406 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1407 bitcastToVectorElement32(VecTypeIdx)) 1408 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1409 .bitcastIf( 1410 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1411 [=](const LegalityQuery &Query) { 1412 // For > 64-bit element types, try to turn this into a 64-bit 1413 // element vector since we may be able to do better indexing 1414 // if this is scalar. If not, fall back to 32. 1415 const LLT EltTy = Query.Types[EltTypeIdx]; 1416 const LLT VecTy = Query.Types[VecTypeIdx]; 1417 const unsigned DstEltSize = EltTy.getSizeInBits(); 1418 const unsigned VecSize = VecTy.getSizeInBits(); 1419 1420 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1421 return std::make_pair( 1422 VecTypeIdx, 1423 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1424 }) 1425 .clampScalar(EltTypeIdx, S32, S64) 1426 .clampScalar(VecTypeIdx, S32, S64) 1427 .clampScalar(IdxTypeIdx, S32, S32) 1428 .clampMaxNumElements(VecTypeIdx, S32, 32) 1429 // TODO: Clamp elements for 64-bit vectors? 1430 // It should only be necessary with variable indexes. 1431 // As a last resort, lower to the stack 1432 .lower(); 1433 } 1434 1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1436 .unsupportedIf([=](const LegalityQuery &Query) { 1437 const LLT &EltTy = Query.Types[1].getElementType(); 1438 return Query.Types[0] != EltTy; 1439 }); 1440 1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1444 1445 // FIXME: Doesn't handle extract of illegal sizes. 1446 getActionDefinitionsBuilder(Op) 1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1448 .lowerIf([=](const LegalityQuery &Query) { 1449 // Sub-vector(or single element) insert and extract. 
1450 // TODO: verify immediate offset here since lower only works with 1451 // whole elements. 1452 const LLT BigTy = Query.Types[BigTyIdx]; 1453 return BigTy.isVector(); 1454 }) 1455 // FIXME: Multiples of 16 should not be legal. 1456 .legalIf([=](const LegalityQuery &Query) { 1457 const LLT BigTy = Query.Types[BigTyIdx]; 1458 const LLT LitTy = Query.Types[LitTyIdx]; 1459 return (BigTy.getSizeInBits() % 32 == 0) && 1460 (LitTy.getSizeInBits() % 16 == 0); 1461 }) 1462 .widenScalarIf( 1463 [=](const LegalityQuery &Query) { 1464 const LLT BigTy = Query.Types[BigTyIdx]; 1465 return (BigTy.getScalarSizeInBits() < 16); 1466 }, 1467 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1468 .widenScalarIf( 1469 [=](const LegalityQuery &Query) { 1470 const LLT LitTy = Query.Types[LitTyIdx]; 1471 return (LitTy.getScalarSizeInBits() < 16); 1472 }, 1473 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1474 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1475 .widenScalarToNextPow2(BigTyIdx, 32); 1476 1477 } 1478 1479 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1480 .legalForCartesianProduct(AllS32Vectors, {S32}) 1481 .legalForCartesianProduct(AllS64Vectors, {S64}) 1482 .clampNumElements(0, V16S32, V32S32) 1483 .clampNumElements(0, V2S64, V16S64) 1484 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1485 1486 if (ST.hasScalarPackInsts()) { 1487 BuildVector 1488 // FIXME: Should probably widen s1 vectors straight to s32 1489 .minScalarOrElt(0, S16) 1490 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1491 .minScalar(1, S32); 1492 1493 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1494 .legalFor({V2S16, S32}) 1495 .lower(); 1496 BuildVector.minScalarOrElt(0, S32); 1497 } else { 1498 BuildVector.customFor({V2S16, S16}); 1499 BuildVector.minScalarOrElt(0, S32); 1500 1501 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1502 .customFor({V2S16, S32}) 1503 .lower(); 1504 } 1505 1506 BuildVector.legalIf(isRegisterType(0)); 1507 1508 // FIXME: Clamp maximum size 1509 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1510 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1511 .clampMaxNumElements(0, S32, 32) 1512 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1513 .clampMaxNumElements(0, S16, 64); 1514 1515 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1516 // pre-legalize. 1517 if (ST.hasVOP3PInsts()) { 1518 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1519 .customFor({V2S16, V2S16}) 1520 .lower(); 1521 } else 1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1523 1524 // Merge/Unmerge 1525 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1526 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1527 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1528 1529 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1530 const LLT Ty = Query.Types[TypeIdx]; 1531 if (Ty.isVector()) { 1532 const LLT &EltTy = Ty.getElementType(); 1533 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1534 return true; 1535 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1536 return true; 1537 } 1538 return false; 1539 }; 1540 1541 auto &Builder = getActionDefinitionsBuilder(Op) 1542 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1543 .lowerFor({{S16, V2S16}}) 1544 .lowerIf([=](const LegalityQuery &Query) { 1545 const LLT BigTy = Query.Types[BigTyIdx]; 1546 return BigTy.getSizeInBits() == 32; 1547 }) 1548 // Try to widen to s16 first for small types. 
1549 // TODO: Only do this on targets with legal s16 shifts 1550 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1551 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1552 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1553 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1554 elementTypeIs(1, S16)), 1555 changeTo(1, V2S16)) 1556 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1557 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1558 // valid. 1559 .clampScalar(LitTyIdx, S32, S512) 1560 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1561 // Break up vectors with weird elements into scalars 1562 .fewerElementsIf( 1563 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1564 scalarize(0)) 1565 .fewerElementsIf( 1566 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1567 scalarize(1)) 1568 .clampScalar(BigTyIdx, S32, MaxScalar); 1569 1570 if (Op == G_MERGE_VALUES) { 1571 Builder.widenScalarIf( 1572 // TODO: Use 16-bit shifts if legal for 8-bit values? 1573 [=](const LegalityQuery &Query) { 1574 const LLT Ty = Query.Types[LitTyIdx]; 1575 return Ty.getSizeInBits() < 32; 1576 }, 1577 changeTo(LitTyIdx, S32)); 1578 } 1579 1580 Builder.widenScalarIf( 1581 [=](const LegalityQuery &Query) { 1582 const LLT Ty = Query.Types[BigTyIdx]; 1583 return !isPowerOf2_32(Ty.getSizeInBits()) && 1584 Ty.getSizeInBits() % 16 != 0; 1585 }, 1586 [=](const LegalityQuery &Query) { 1587 // Pick the next power of 2, or a multiple of 64 over 128. 1588 // Whichever is smaller. 1589 const LLT &Ty = Query.Types[BigTyIdx]; 1590 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1591 if (NewSizeInBits >= 256) { 1592 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1593 if (RoundedTo < NewSizeInBits) 1594 NewSizeInBits = RoundedTo; 1595 } 1596 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1597 }) 1598 // Any vectors left are the wrong size. Scalarize them. 1599 .scalarize(0) 1600 .scalarize(1); 1601 } 1602 1603 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1604 // RegBankSelect. 1605 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1606 .legalFor({{S32}, {S64}}); 1607 1608 if (ST.hasVOP3PInsts()) { 1609 SextInReg.lowerFor({{V2S16}}) 1610 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1611 // get more vector shift opportunities, since we'll get those when 1612 // expanded. 1613 .clampMaxNumElementsStrict(0, S16, 2); 1614 } else if (ST.has16BitInsts()) { 1615 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1616 } else { 1617 // Prefer to promote to s32 before lowering if we don't have 16-bit 1618 // shifts. This avoid a lot of intermediate truncate and extend operations. 1619 SextInReg.lowerFor({{S32}, {S64}}); 1620 } 1621 1622 SextInReg 1623 .scalarize(0) 1624 .clampScalar(0, S32, S64) 1625 .lower(); 1626 1627 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1628 .scalarize(0) 1629 .lower(); 1630 1631 // TODO: Only Try to form v2s16 with legal packed instructions. 
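  // (For reference: the legal {S32, S32} case below is expected to map to a
  // single 32-bit funnel-shift/alignbit-style instruction; other cases are
  // clamped to at most v2s16, scalarized, and lowered.)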
1632 getActionDefinitionsBuilder(G_FSHR) 1633 .legalFor({{S32, S32}}) 1634 .lowerFor({{V2S16, V2S16}}) 1635 .clampMaxNumElementsStrict(0, S16, 2) 1636 .scalarize(0) 1637 .lower(); 1638 1639 if (ST.hasVOP3PInsts()) { 1640 getActionDefinitionsBuilder(G_FSHL) 1641 .lowerFor({{V2S16, V2S16}}) 1642 .clampMaxNumElementsStrict(0, S16, 2) 1643 .scalarize(0) 1644 .lower(); 1645 } else { 1646 getActionDefinitionsBuilder(G_FSHL) 1647 .scalarize(0) 1648 .lower(); 1649 } 1650 1651 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1652 .legalFor({S64}); 1653 1654 getActionDefinitionsBuilder(G_FENCE) 1655 .alwaysLegal(); 1656 1657 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1658 .scalarize(0) 1659 .minScalar(0, S32) 1660 .lower(); 1661 1662 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1663 .legalFor({{S32, S32}, {S64, S32}}) 1664 .clampScalar(1, S32, S32) 1665 .clampScalar(0, S32, S64) 1666 .widenScalarToNextPow2(0) 1667 .scalarize(0); 1668 1669 getActionDefinitionsBuilder({ 1670 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1671 G_FCOPYSIGN, 1672 1673 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1674 G_ATOMICRMW_NAND, 1675 G_ATOMICRMW_FSUB, 1676 G_READ_REGISTER, 1677 G_WRITE_REGISTER, 1678 1679 G_SADDO, G_SSUBO, 1680 1681 // TODO: Implement 1682 G_FMINIMUM, G_FMAXIMUM}).lower(); 1683 1684 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1685 .lower(); 1686 1687 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1688 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1689 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1690 .unsupported(); 1691 1692 getLegacyLegalizerInfo().computeTables(); 1693 verify(*ST.getInstrInfo()); 1694 } 1695 1696 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1697 MachineInstr &MI) const { 1698 MachineIRBuilder &B = Helper.MIRBuilder; 1699 MachineRegisterInfo &MRI = *B.getMRI(); 1700 1701 switch (MI.getOpcode()) { 1702 case TargetOpcode::G_ADDRSPACE_CAST: 1703 return legalizeAddrSpaceCast(MI, MRI, B); 1704 case TargetOpcode::G_FRINT: 1705 return legalizeFrint(MI, MRI, B); 1706 case TargetOpcode::G_FCEIL: 1707 return legalizeFceil(MI, MRI, B); 1708 case TargetOpcode::G_FREM: 1709 return legalizeFrem(MI, MRI, B); 1710 case TargetOpcode::G_INTRINSIC_TRUNC: 1711 return legalizeIntrinsicTrunc(MI, MRI, B); 1712 case TargetOpcode::G_SITOFP: 1713 return legalizeITOFP(MI, MRI, B, true); 1714 case TargetOpcode::G_UITOFP: 1715 return legalizeITOFP(MI, MRI, B, false); 1716 case TargetOpcode::G_FPTOSI: 1717 return legalizeFPTOI(MI, MRI, B, true); 1718 case TargetOpcode::G_FPTOUI: 1719 return legalizeFPTOI(MI, MRI, B, false); 1720 case TargetOpcode::G_FMINNUM: 1721 case TargetOpcode::G_FMAXNUM: 1722 case TargetOpcode::G_FMINNUM_IEEE: 1723 case TargetOpcode::G_FMAXNUM_IEEE: 1724 return legalizeMinNumMaxNum(Helper, MI); 1725 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1726 return legalizeExtractVectorElt(MI, MRI, B); 1727 case TargetOpcode::G_INSERT_VECTOR_ELT: 1728 return legalizeInsertVectorElt(MI, MRI, B); 1729 case TargetOpcode::G_SHUFFLE_VECTOR: 1730 return legalizeShuffleVector(MI, MRI, B); 1731 case TargetOpcode::G_FSIN: 1732 case TargetOpcode::G_FCOS: 1733 return legalizeSinCos(MI, MRI, B); 1734 case TargetOpcode::G_GLOBAL_VALUE: 1735 return legalizeGlobalValue(MI, MRI, B); 1736 case TargetOpcode::G_LOAD: 1737 case TargetOpcode::G_SEXTLOAD: 1738 case TargetOpcode::G_ZEXTLOAD: 1739 return legalizeLoad(Helper, MI); 1740 case TargetOpcode::G_FMAD: 1741 return legalizeFMad(MI, MRI, B); 1742 case TargetOpcode::G_FDIV: 1743 return legalizeFDIV(MI, 
MRI, B); 1744 case TargetOpcode::G_UDIV: 1745 case TargetOpcode::G_UREM: 1746 case TargetOpcode::G_UDIVREM: 1747 return legalizeUnsignedDIV_REM(MI, MRI, B); 1748 case TargetOpcode::G_SDIV: 1749 case TargetOpcode::G_SREM: 1750 case TargetOpcode::G_SDIVREM: 1751 return legalizeSignedDIV_REM(MI, MRI, B); 1752 case TargetOpcode::G_ATOMIC_CMPXCHG: 1753 return legalizeAtomicCmpXChg(MI, MRI, B); 1754 case TargetOpcode::G_FLOG: 1755 return legalizeFlog(MI, B, numbers::ln2f); 1756 case TargetOpcode::G_FLOG10: 1757 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1758 case TargetOpcode::G_FEXP: 1759 return legalizeFExp(MI, B); 1760 case TargetOpcode::G_FPOW: 1761 return legalizeFPow(MI, B); 1762 case TargetOpcode::G_FFLOOR: 1763 return legalizeFFloor(MI, MRI, B); 1764 case TargetOpcode::G_BUILD_VECTOR: 1765 return legalizeBuildVector(MI, MRI, B); 1766 case TargetOpcode::G_CTLZ: 1767 case TargetOpcode::G_CTTZ: 1768 return legalizeCTLZ_CTTZ(MI, MRI, B); 1769 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1770 return legalizeFPTruncRound(MI, B); 1771 default: 1772 return false; 1773 } 1774 1775 llvm_unreachable("expected switch to return"); 1776 } 1777 1778 Register AMDGPULegalizerInfo::getSegmentAperture( 1779 unsigned AS, 1780 MachineRegisterInfo &MRI, 1781 MachineIRBuilder &B) const { 1782 MachineFunction &MF = B.getMF(); 1783 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1784 const LLT S32 = LLT::scalar(32); 1785 1786 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1787 1788 if (ST.hasApertureRegs()) { 1789 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1790 // getreg. 1791 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1792 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1793 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1794 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1795 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1796 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1797 unsigned Encoding = 1798 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1799 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1800 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1801 1802 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1803 1804 B.buildInstr(AMDGPU::S_GETREG_B32) 1805 .addDef(GetReg) 1806 .addImm(Encoding); 1807 MRI.setType(GetReg, S32); 1808 1809 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1810 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1811 } 1812 1813 // TODO: can we be smarter about machine pointer info? 1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1815 Register LoadAddr = MRI.createGenericVirtualRegister( 1816 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1817 // For code object version 5, private_base and shared_base are passed through 1818 // implicit kernargs. 1819 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1820 AMDGPUTargetLowering::ImplicitParameter Param = 1821 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1822 : AMDGPUTargetLowering::PRIVATE_BASE; 1823 uint64_t Offset = 1824 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1825 1826 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1827 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1828 1829 if (!loadInputValue(KernargPtrReg, B, 1830 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1831 return Register(); 1832 1833 MachineMemOperand *MMO = MF.getMachineMemOperand( 1834 PtrInfo, 1835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1836 MachineMemOperand::MOInvariant, 1837 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1838 1839 // Pointer address 1840 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1841 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1842 // Load address 1843 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1844 } 1845 1846 Register QueuePtr = MRI.createGenericVirtualRegister( 1847 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1848 1849 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1850 return Register(); 1851 1852 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1853 // private_segment_aperture_base_hi. 1854 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1855 1856 MachineMemOperand *MMO = MF.getMachineMemOperand( 1857 PtrInfo, 1858 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1859 MachineMemOperand::MOInvariant, 1860 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1861 1862 B.buildPtrAdd(LoadAddr, QueuePtr, 1863 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1864 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1865 } 1866 1867 /// Return true if the value is a known valid address, such that a null check is 1868 /// not necessary. 1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1870 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1871 MachineInstr *Def = MRI.getVRegDef(Val); 1872 switch (Def->getOpcode()) { 1873 case AMDGPU::G_FRAME_INDEX: 1874 case AMDGPU::G_GLOBAL_VALUE: 1875 case AMDGPU::G_BLOCK_ADDR: 1876 return true; 1877 case AMDGPU::G_CONSTANT: { 1878 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1879 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1880 } 1881 default: 1882 return false; 1883 } 1884 1885 return false; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 MachineFunction &MF = B.getMF(); 1892 1893 const LLT S32 = LLT::scalar(32); 1894 Register Dst = MI.getOperand(0).getReg(); 1895 Register Src = MI.getOperand(1).getReg(); 1896 1897 LLT DstTy = MRI.getType(Dst); 1898 LLT SrcTy = MRI.getType(Src); 1899 unsigned DestAS = DstTy.getAddressSpace(); 1900 unsigned SrcAS = SrcTy.getAddressSpace(); 1901 1902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1903 // vector element. 1904 assert(!DstTy.isVector()); 1905 1906 const AMDGPUTargetMachine &TM 1907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1908 1909 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1910 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1911 return true; 1912 } 1913 1914 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1915 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1916 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1917 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1918 // Extract low 32-bits of the pointer. 
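      // For LDS and scratch, a flat pointer is just the 32-bit segment offset
      // with the matching aperture base in the high half, so once the value is
      // known to be non-null the cast is simply dropping the high 32 bits.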
1919 B.buildExtract(Dst, Src, 0); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 unsigned NullVal = TM.getNullPointerValue(DestAS); 1925 1926 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1927 auto FlatNull = B.buildConstant(SrcTy, 0); 1928 1929 // Extract low 32-bits of the pointer. 1930 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1931 1932 auto CmpRes = 1933 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1934 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1935 1936 MI.eraseFromParent(); 1937 return true; 1938 } 1939 1940 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1941 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1942 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1943 if (!ST.hasFlatAddressSpace()) 1944 return false; 1945 1946 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1947 if (!ApertureReg.isValid()) 1948 return false; 1949 1950 // Coerce the type of the low half of the result so we can use merge_values. 1951 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1952 1953 // TODO: Should we allow mismatched types but matching sizes in merges to 1954 // avoid the ptrtoint? 1955 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1956 1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1958 B.buildCopy(Dst, BuildPtr); 1959 MI.eraseFromParent(); 1960 return true; 1961 } 1962 1963 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1964 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1965 1966 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1967 SegmentNull.getReg(0)); 1968 1969 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1970 1971 MI.eraseFromParent(); 1972 return true; 1973 } 1974 1975 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1976 SrcTy.getSizeInBits() == 64) { 1977 // Truncate. 1978 B.buildExtract(Dst, Src, 0); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1984 DstTy.getSizeInBits() == 64) { 1985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1986 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1987 1988 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1989 // another. Merge operands are required to be the same type, but creating an 1990 // extra ptrtoint would be kind of pointless. 1991 auto HighAddr = B.buildConstant( 1992 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1993 B.buildMerge(Dst, {Src, HighAddr}); 1994 MI.eraseFromParent(); 1995 return true; 1996 } 1997 1998 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1999 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2000 2001 LLVMContext &Ctx = MF.getFunction().getContext(); 2002 Ctx.diagnose(InvalidAddrSpaceCast); 2003 B.buildUndef(Dst); 2004 MI.eraseFromParent(); 2005 return true; 2006 } 2007 2008 bool AMDGPULegalizerInfo::legalizeFrint( 2009 MachineInstr &MI, MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B) const { 2011 Register Src = MI.getOperand(1).getReg(); 2012 LLT Ty = MRI.getType(Src); 2013 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2014 2015 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2016 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2017 2018 auto C1 = B.buildFConstant(Ty, C1Val); 2019 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2020 2021 // TODO: Should this propagate fast-math-flags? 
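  // Adding and then subtracting copysign(2^52, src) pushes the fraction bits
  // out of the f64 mantissa, leaving src rounded to the nearest integer.
  // Anything with |src| > 0x1.fffffffffffffp+51 is already integral, so the
  // final select passes those inputs through unchanged.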
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
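  // An exponent below zero means |src| < 1.0, so the truncated result is a
  // signed zero: only the sign bit of the input survives, widened here with a
  // zero low half. An exponent above 51 means src is already an integer and is
  // returned as-is; everything in between masks off the mantissa bits that sit
  // below the binary point.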
2126 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2127 2128 auto Shr = B.buildAShr(S64, FractMask, Exp); 2129 auto Not = B.buildNot(S64, Shr); 2130 auto Tmp0 = B.buildAnd(S64, Src, Not); 2131 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2132 2133 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2134 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2135 2136 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2137 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2138 MI.eraseFromParent(); 2139 return true; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeITOFP( 2143 MachineInstr &MI, MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B, bool Signed) const { 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src = MI.getOperand(1).getReg(); 2148 2149 const LLT S64 = LLT::scalar(64); 2150 const LLT S32 = LLT::scalar(32); 2151 2152 assert(MRI.getType(Src) == S64); 2153 2154 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2155 auto ThirtyTwo = B.buildConstant(S32, 32); 2156 2157 if (MRI.getType(Dst) == S64) { 2158 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2159 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2160 2161 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2162 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2163 .addUse(CvtHi.getReg(0)) 2164 .addUse(ThirtyTwo.getReg(0)); 2165 2166 // TODO: Should this propagate fast-math-flags? 2167 B.buildFAdd(Dst, LdExp, CvtLo); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 assert(MRI.getType(Dst) == S32); 2173 2174 auto One = B.buildConstant(S32, 1); 2175 2176 MachineInstrBuilder ShAmt; 2177 if (Signed) { 2178 auto ThirtyOne = B.buildConstant(S32, 31); 2179 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2180 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2181 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2182 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2183 /*HasSideEffects=*/false) 2184 .addUse(Unmerge.getReg(1)); 2185 auto LS2 = B.buildSub(S32, LS, One); 2186 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2187 } else 2188 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2189 auto Norm = B.buildShl(S64, Src, ShAmt); 2190 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2191 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2192 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2193 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2194 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2195 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2196 /*HasSideEffects=*/false) 2197 .addUse(FVal.getReg(0)) 2198 .addUse(Scale.getReg(0)); 2199 MI.eraseFromParent(); 2200 return true; 2201 } 2202 2203 // TODO: Copied from DAG implementation. Verify logic and document how this 2204 // actually works. 
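// Illustrative example for the unsigned f64 case: with val = 2^40 + 7 the
// truncation is exact, floor(val * 2^-32) = 256, and fma(256, -2^32, val) = 7,
// so the two halves convert to hi = 256 and lo = 7, which reassemble to the
// original value as (hi << 32) | lo.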
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //  hif := floor(tf * 2^-32);
  //  lof := tf - hif * 2^32; // lof is always positive due to floor.
  //   hi := fptoi(hif);
  //   lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.
2295 2296 // TODO: Promote dynamic indexing of s16 to s32 2297 2298 // FIXME: Artifact combiner probably should have replaced the truncated 2299 // constant before this, so we shouldn't need 2300 // getIConstantVRegValWithLookThrough. 2301 Optional<ValueAndVReg> MaybeIdxVal = 2302 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2303 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2304 return true; 2305 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2306 2307 Register Dst = MI.getOperand(0).getReg(); 2308 Register Vec = MI.getOperand(1).getReg(); 2309 2310 LLT VecTy = MRI.getType(Vec); 2311 LLT EltTy = VecTy.getElementType(); 2312 assert(EltTy == MRI.getType(Dst)); 2313 2314 if (IdxVal < VecTy.getNumElements()) { 2315 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2316 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2317 } else { 2318 B.buildUndef(Dst); 2319 } 2320 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 // TODO: Should move some of this into LegalizerHelper. 2329 2330 // TODO: Promote dynamic indexing of s16 to s32 2331 2332 // FIXME: Artifact combiner probably should have replaced the truncated 2333 // constant before this, so we shouldn't need 2334 // getIConstantVRegValWithLookThrough. 2335 Optional<ValueAndVReg> MaybeIdxVal = 2336 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2338 return true; 2339 2340 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Vec = MI.getOperand(1).getReg(); 2343 Register Ins = MI.getOperand(2).getReg(); 2344 2345 LLT VecTy = MRI.getType(Vec); 2346 LLT EltTy = VecTy.getElementType(); 2347 assert(EltTy == MRI.getType(Ins)); 2348 (void)Ins; 2349 2350 unsigned NumElts = VecTy.getNumElements(); 2351 if (IdxVal < NumElts) { 2352 SmallVector<Register, 8> SrcRegs; 2353 for (unsigned i = 0; i < NumElts; ++i) 2354 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2355 B.buildUnmerge(SrcRegs, Vec); 2356 2357 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2358 B.buildMerge(Dst, SrcRegs); 2359 } else { 2360 B.buildUndef(Dst); 2361 } 2362 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2368 MachineInstr &MI, MachineRegisterInfo &MRI, 2369 MachineIRBuilder &B) const { 2370 const LLT V2S16 = LLT::fixed_vector(2, 16); 2371 2372 Register Dst = MI.getOperand(0).getReg(); 2373 Register Src0 = MI.getOperand(1).getReg(); 2374 LLT DstTy = MRI.getType(Dst); 2375 LLT SrcTy = MRI.getType(Src0); 2376 2377 if (SrcTy == V2S16 && DstTy == V2S16 && 2378 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2379 return true; 2380 2381 MachineIRBuilder HelperBuilder(MI); 2382 GISelObserverWrapper DummyObserver; 2383 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2384 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizeSinCos( 2388 MachineInstr &MI, MachineRegisterInfo &MRI, 2389 MachineIRBuilder &B) const { 2390 2391 Register DstReg = MI.getOperand(0).getReg(); 2392 Register SrcReg = MI.getOperand(1).getReg(); 2393 LLT Ty = MRI.getType(DstReg); 2394 unsigned Flags = MI.getFlags(); 2395 2396 Register TrigVal; 2397 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2398 if (ST.hasTrigReducedRange()) { 2399 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2400 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2401 .addUse(MulVal.getReg(0)) 2402 .setMIFlags(Flags).getReg(0); 2403 } else 2404 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2405 2406 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2407 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2408 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2409 .addUse(TrigVal) 2410 .setMIFlags(Flags); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2416 MachineIRBuilder &B, 2417 const GlobalValue *GV, 2418 int64_t Offset, 2419 unsigned GAFlags) const { 2420 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2421 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2422 // to the following code sequence: 2423 // 2424 // For constant address space: 2425 // s_getpc_b64 s[0:1] 2426 // s_add_u32 s0, s0, $symbol 2427 // s_addc_u32 s1, s1, 0 2428 // 2429 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2430 // a fixup or relocation is emitted to replace $symbol with a literal 2431 // constant, which is a pc-relative offset from the encoding of the $symbol 2432 // operand to the global variable. 2433 // 2434 // For global address space: 2435 // s_getpc_b64 s[0:1] 2436 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2437 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2438 // 2439 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2440 // fixups or relocations are emitted to replace $symbol@*@lo and 2441 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2442 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2443 // operand to the global variable. 2444 // 2445 // What we want here is an offset from the value returned by s_getpc 2446 // (which is the address of the s_add_u32 instruction) to the global 2447 // variable, but since the encoding of $symbol starts 4 bytes after the start 2448 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2449 // small. This requires us to add 4 to the global variable offset in order to 2450 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2451 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2452 // instruction. 2453 2454 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2455 2456 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2457 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2458 2459 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2460 .addDef(PCReg); 2461 2462 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2463 if (GAFlags == SIInstrInfo::MO_NONE) 2464 MIB.addImm(0); 2465 else 2466 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2467 2468 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2469 2470 if (PtrTy.getSizeInBits() == 32) 2471 B.buildExtract(DstReg, PCReg, 0); 2472 return true; 2473 } 2474 2475 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2476 MachineInstr &MI, MachineRegisterInfo &MRI, 2477 MachineIRBuilder &B) const { 2478 Register DstReg = MI.getOperand(0).getReg(); 2479 LLT Ty = MRI.getType(DstReg); 2480 unsigned AS = Ty.getAddressSpace(); 2481 2482 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2483 MachineFunction &MF = B.getMF(); 2484 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2485 2486 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2487 if (!MFI->isModuleEntryFunction() && 2488 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2489 const Function &Fn = MF.getFunction(); 2490 DiagnosticInfoUnsupported BadLDSDecl( 2491 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2492 DS_Warning); 2493 Fn.getContext().diagnose(BadLDSDecl); 2494 2495 // We currently don't have a way to correctly allocate LDS objects that 2496 // aren't directly associated with a kernel. We do force inlining of 2497 // functions that use local objects. However, if these dead functions are 2498 // not eliminated, we don't want a compile time error. Just emit a warning 2499 // and a trap, since there should be no callable path here. 2500 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2501 B.buildUndef(DstReg); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 // TODO: We could emit code to handle the initialization somewhere. 2507 // We ignore the initializer for now and legalize it to allow selection. 2508 // The initializer will anyway get errored out during assembly emission. 2509 const SITargetLowering *TLI = ST.getTargetLowering(); 2510 if (!TLI->shouldUseLDSConstAddress(GV)) { 2511 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2512 return true; // Leave in place; 2513 } 2514 2515 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2516 Type *Ty = GV->getValueType(); 2517 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2518 // zero-sized type in other languages to declare the dynamic shared 2519 // memory which size is not known at the compile time. They will be 2520 // allocated by the runtime and placed directly after the static 2521 // allocated ones. They all share the same offset. 2522 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2523 // Adjust alignment for that dynamic shared memory array. 
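        // Dynamic shared memory is laid out directly after all of the static
        // LDS allocations, so the "address" materialized a few lines below is
        // just the total static LDS size reported by
        // llvm.amdgcn.groupstaticsize, reinterpreted as an LDS pointer.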
2524 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2525 LLT S32 = LLT::scalar(32); 2526 auto Sz = 2527 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2528 B.buildIntToPtr(DstReg, Sz); 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 } 2533 2534 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2535 *cast<GlobalVariable>(GV))); 2536 MI.eraseFromParent(); 2537 return true; 2538 } 2539 2540 const SITargetLowering *TLI = ST.getTargetLowering(); 2541 2542 if (TLI->shouldEmitFixup(GV)) { 2543 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 if (TLI->shouldEmitPCReloc(GV)) { 2549 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2555 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2556 2557 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2558 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2559 MachinePointerInfo::getGOT(MF), 2560 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2561 MachineMemOperand::MOInvariant, 2562 LoadTy, Align(8)); 2563 2564 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2565 2566 if (Ty.getSizeInBits() == 32) { 2567 // Truncate if this is a 32-bit constant address. 2568 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2569 B.buildExtract(DstReg, Load, 0); 2570 } else 2571 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 2577 static LLT widenToNextPowerOf2(LLT Ty) { 2578 if (Ty.isVector()) 2579 return Ty.changeElementCount( 2580 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2581 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2582 } 2583 2584 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2585 MachineInstr &MI) const { 2586 MachineIRBuilder &B = Helper.MIRBuilder; 2587 MachineRegisterInfo &MRI = *B.getMRI(); 2588 GISelChangeObserver &Observer = Helper.Observer; 2589 2590 Register PtrReg = MI.getOperand(1).getReg(); 2591 LLT PtrTy = MRI.getType(PtrReg); 2592 unsigned AddrSpace = PtrTy.getAddressSpace(); 2593 2594 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2595 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2597 Observer.changingInstr(MI); 2598 MI.getOperand(1).setReg(Cast.getReg(0)); 2599 Observer.changedInstr(MI); 2600 return true; 2601 } 2602 2603 if (MI.getOpcode() != AMDGPU::G_LOAD) 2604 return false; 2605 2606 Register ValReg = MI.getOperand(0).getReg(); 2607 LLT ValTy = MRI.getType(ValReg); 2608 2609 MachineMemOperand *MMO = *MI.memoperands_begin(); 2610 const unsigned ValSize = ValTy.getSizeInBits(); 2611 const LLT MemTy = MMO->getMemoryType(); 2612 const Align MemAlign = MMO->getAlign(); 2613 const unsigned MemSize = MemTy.getSizeInBits(); 2614 const uint64_t AlignInBits = 8 * MemAlign.value(); 2615 2616 // Widen non-power-of-2 loads to the alignment if needed 2617 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2618 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2619 2620 // This was already the correct extending load result type, so just adjust 2621 // the memory type. 
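    // E.g. an s32 G_LOAD backed by a 3-byte (s24) memory access only needs its
    // MMO widened to 4 bytes: the high bits of a G_LOAD that is wider than its
    // memory type are not defined, and shouldWidenLoad gates this path so the
    // wider access is only formed when it is safe for the address space and
    // alignment involved.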
2622 if (WideMemSize == ValSize) { 2623 MachineFunction &MF = B.getMF(); 2624 2625 MachineMemOperand *WideMMO = 2626 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2627 Observer.changingInstr(MI); 2628 MI.setMemRefs(MF, {WideMMO}); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 // Don't bother handling edge case that should probably never be produced. 2634 if (ValSize > WideMemSize) 2635 return false; 2636 2637 LLT WideTy = widenToNextPowerOf2(ValTy); 2638 2639 Register WideLoad; 2640 if (!WideTy.isVector()) { 2641 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2642 B.buildTrunc(ValReg, WideLoad).getReg(0); 2643 } else { 2644 // Extract the subvector. 2645 2646 if (isRegisterType(ValTy)) { 2647 // If this a case where G_EXTRACT is legal, use it. 2648 // (e.g. <3 x s32> -> <4 x s32>) 2649 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2650 B.buildExtract(ValReg, WideLoad, 0); 2651 } else { 2652 // For cases where the widened type isn't a nice register value, unmerge 2653 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2654 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2655 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2656 } 2657 } 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 return false; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFMad( 2667 MachineInstr &MI, MachineRegisterInfo &MRI, 2668 MachineIRBuilder &B) const { 2669 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2670 assert(Ty.isScalar()); 2671 2672 MachineFunction &MF = B.getMF(); 2673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2674 2675 // TODO: Always legal with future ftz flag. 2676 // FIXME: Do we need just output? 2677 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2678 return true; 2679 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2680 return true; 2681 2682 MachineIRBuilder HelperBuilder(MI); 2683 GISelObserverWrapper DummyObserver; 2684 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2685 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2686 } 2687 2688 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2690 Register DstReg = MI.getOperand(0).getReg(); 2691 Register PtrReg = MI.getOperand(1).getReg(); 2692 Register CmpVal = MI.getOperand(2).getReg(); 2693 Register NewVal = MI.getOperand(3).getReg(); 2694 2695 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2696 "this should not have been custom lowered"); 2697 2698 LLT ValTy = MRI.getType(CmpVal); 2699 LLT VecTy = LLT::fixed_vector(2, ValTy); 2700 2701 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2702 2703 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2704 .addDef(DstReg) 2705 .addUse(PtrReg) 2706 .addUse(PackedVal) 2707 .setMemRefs(MI.memoperands()); 2708 2709 MI.eraseFromParent(); 2710 return true; 2711 } 2712 2713 bool AMDGPULegalizerInfo::legalizeFlog( 2714 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2715 Register Dst = MI.getOperand(0).getReg(); 2716 Register Src = MI.getOperand(1).getReg(); 2717 LLT Ty = B.getMRI()->getType(Dst); 2718 unsigned Flags = MI.getFlags(); 2719 2720 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2721 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2722 2723 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2724 
MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2729 MachineIRBuilder &B) const { 2730 Register Dst = MI.getOperand(0).getReg(); 2731 Register Src = MI.getOperand(1).getReg(); 2732 unsigned Flags = MI.getFlags(); 2733 LLT Ty = B.getMRI()->getType(Dst); 2734 2735 auto K = B.buildFConstant(Ty, numbers::log2e); 2736 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2737 B.buildFExp2(Dst, Mul, Flags); 2738 MI.eraseFromParent(); 2739 return true; 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2743 MachineIRBuilder &B) const { 2744 Register Dst = MI.getOperand(0).getReg(); 2745 Register Src0 = MI.getOperand(1).getReg(); 2746 Register Src1 = MI.getOperand(2).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 LLT Ty = B.getMRI()->getType(Dst); 2749 const LLT S16 = LLT::scalar(16); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 if (Ty == S32) { 2753 auto Log = B.buildFLog2(S32, Src0, Flags); 2754 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2755 .addUse(Log.getReg(0)) 2756 .addUse(Src1) 2757 .setMIFlags(Flags); 2758 B.buildFExp2(Dst, Mul, Flags); 2759 } else if (Ty == S16) { 2760 // There's no f16 fmul_legacy, so we need to convert for it. 2761 auto Log = B.buildFLog2(S16, Src0, Flags); 2762 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2763 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2764 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2765 .addUse(Ext0.getReg(0)) 2766 .addUse(Ext1.getReg(0)) 2767 .setMIFlags(Flags); 2768 2769 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2770 } else 2771 return false; 2772 2773 MI.eraseFromParent(); 2774 return true; 2775 } 2776 2777 // Find a source register, ignoring any possible source modifiers. 2778 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2779 Register ModSrc = OrigSrc; 2780 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2781 ModSrc = SrcFNeg->getOperand(1).getReg(); 2782 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2783 ModSrc = SrcFAbs->getOperand(1).getReg(); 2784 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2785 ModSrc = SrcFAbs->getOperand(1).getReg(); 2786 return ModSrc; 2787 } 2788 2789 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2790 MachineRegisterInfo &MRI, 2791 MachineIRBuilder &B) const { 2792 2793 const LLT S1 = LLT::scalar(1); 2794 const LLT S64 = LLT::scalar(64); 2795 Register Dst = MI.getOperand(0).getReg(); 2796 Register OrigSrc = MI.getOperand(1).getReg(); 2797 unsigned Flags = MI.getFlags(); 2798 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2799 "this should not have been custom lowered"); 2800 2801 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2802 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2803 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2804 // V_FRACT bug is: 2805 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2806 // 2807 // Convert floor(x) to (x - fract(x)) 2808 2809 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2810 .addUse(OrigSrc) 2811 .setMIFlags(Flags); 2812 2813 // Give source modifier matching some assistance before obscuring a foldable 2814 // pattern. 2815 2816 // TODO: We can avoid the neg on the fract? The input sign to fract 2817 // shouldn't matter? 
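  // Note: 0x3fefffffffffffff built below is the largest f64 strictly less than
  // 1.0 (i.e. 1.0 - 2^-53); clamping the hardware fract result to it is what
  // implements the min() in the workaround formula above.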
2818 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2819 2820 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2821 2822 Register Min = MRI.createGenericVirtualRegister(S64); 2823 2824 // We don't need to concern ourselves with the snan handling difference, so 2825 // use the one which will directly select. 2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2827 if (MFI->getMode().IEEE) 2828 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2829 else 2830 B.buildFMinNum(Min, Fract, Const, Flags); 2831 2832 Register CorrectedFract = Min; 2833 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2834 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2835 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2836 } 2837 2838 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2839 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2840 2841 MI.eraseFromParent(); 2842 return true; 2843 } 2844 2845 // Turn an illegal packed v2s16 build vector into bit operations. 2846 // TODO: This should probably be a bitcast action in LegalizerHelper. 2847 bool AMDGPULegalizerInfo::legalizeBuildVector( 2848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2849 Register Dst = MI.getOperand(0).getReg(); 2850 const LLT S32 = LLT::scalar(32); 2851 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2852 2853 Register Src0 = MI.getOperand(1).getReg(); 2854 Register Src1 = MI.getOperand(2).getReg(); 2855 assert(MRI.getType(Src0) == LLT::scalar(16)); 2856 2857 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2858 B.buildBitcast(Dst, Merge); 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 } 2863 2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2866 // case with a single min instruction instead of a compare+select. 2867 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2868 MachineRegisterInfo &MRI, 2869 MachineIRBuilder &B) const { 2870 Register Dst = MI.getOperand(0).getReg(); 2871 Register Src = MI.getOperand(1).getReg(); 2872 LLT DstTy = MRI.getType(Dst); 2873 LLT SrcTy = MRI.getType(Src); 2874 2875 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2876 ? AMDGPU::G_AMDGPU_FFBH_U32 2877 : AMDGPU::G_AMDGPU_FFBL_B32; 2878 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2879 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2880 2881 MI.eraseFromParent(); 2882 return true; 2883 } 2884 2885 // Check that this is a G_XOR x, -1 2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2887 if (MI.getOpcode() != TargetOpcode::G_XOR) 2888 return false; 2889 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2890 return ConstVal && *ConstVal == -1; 2891 } 2892 2893 // Return the use branch instruction, otherwise null if the usage is invalid. 
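// On success this returns the G_BRCOND that consumes the intrinsic's condition
// output. Br is set to the trailing unconditional G_BR when one exists,
// UncondBrTarget to the block reached on the unconditional path (the G_BR
// target or the fall-through successor), and Negated records whether the
// condition was fed through a (G_XOR x, -1) that gets erased here.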
2894 static MachineInstr * 2895 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2896 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2897 Register CondDef = MI.getOperand(0).getReg(); 2898 if (!MRI.hasOneNonDBGUse(CondDef)) 2899 return nullptr; 2900 2901 MachineBasicBlock *Parent = MI.getParent(); 2902 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2903 2904 if (isNot(MRI, *UseMI)) { 2905 Register NegatedCond = UseMI->getOperand(0).getReg(); 2906 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2907 return nullptr; 2908 2909 // We're deleting the def of this value, so we need to remove it. 2910 eraseInstr(*UseMI, MRI); 2911 2912 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2913 Negated = true; 2914 } 2915 2916 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2917 return nullptr; 2918 2919 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2920 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2921 if (Next == Parent->end()) { 2922 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2923 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2924 return nullptr; 2925 UncondBrTarget = &*NextMBB; 2926 } else { 2927 if (Next->getOpcode() != AMDGPU::G_BR) 2928 return nullptr; 2929 Br = &*Next; 2930 UncondBrTarget = Br->getOperand(0).getMBB(); 2931 } 2932 2933 return UseMI; 2934 } 2935 2936 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2937 const ArgDescriptor *Arg, 2938 const TargetRegisterClass *ArgRC, 2939 LLT ArgTy) const { 2940 MCRegister SrcReg = Arg->getRegister(); 2941 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2942 assert(DstReg.isVirtual() && "Virtual register expected"); 2943 2944 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2945 *ArgRC, B.getDebugLoc(), ArgTy); 2946 if (Arg->isMasked()) { 2947 // TODO: Should we try to emit this once in the entry block? 2948 const LLT S32 = LLT::scalar(32); 2949 const unsigned Mask = Arg->getMask(); 2950 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2951 2952 Register AndMaskSrc = LiveIn; 2953 2954 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2955 // 0. 2956 if (Shift != 0) { 2957 auto ShiftAmt = B.buildConstant(S32, Shift); 2958 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2959 } 2960 2961 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2962 } else { 2963 B.buildCopy(DstReg, LiveIn); 2964 } 2965 2966 return true; 2967 } 2968 2969 bool AMDGPULegalizerInfo::loadInputValue( 2970 Register DstReg, MachineIRBuilder &B, 2971 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2972 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2973 const ArgDescriptor *Arg; 2974 const TargetRegisterClass *ArgRC; 2975 LLT ArgTy; 2976 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2977 2978 if (!Arg) { 2979 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2980 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2981 // case the pointer argument may be missing and we use null. 2982 B.buildConstant(DstReg, 0); 2983 return true; 2984 } 2985 2986 // It's undefined behavior if a function marked with the amdgpu-no-* 2987 // attributes uses the corresponding intrinsic. 
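    // An implicit_def is as good a value as any here; it keeps the MIR valid
    // without requiring the missing argument to actually be wired up.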
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                                     int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(KernArgReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  // TODO: Should get nuw
  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}

/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  const LLT S32 = LLT::scalar(32);
  Register DstReg = MI.getOperand(0).getReg();

  assert(B.getMRI()->getType(DstReg) == S32 &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
              MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
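  // Z now holds an approximation of 2^32 / Y refined by one Newton-Raphson
  // step, so the high half of X * Z may fall slightly short of the true
  // quotient; the two conditional refinement rounds below bump the quotient
  // (and shrink the remainder) until the remainder is smaller than Y.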
3087 auto Q = B.buildUMulH(S32, X, Z); 3088 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3089 3090 // First quotient/remainder refinement. 3091 auto One = B.buildConstant(S32, 1); 3092 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3093 if (DstDivReg) 3094 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3095 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3096 3097 // Second quotient/remainder refinement. 3098 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3099 if (DstDivReg) 3100 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3101 3102 if (DstRemReg) 3103 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3104 } 3105 3106 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3107 // 3108 // Return lo, hi of result 3109 // 3110 // %cvt.lo = G_UITOFP Val.lo 3111 // %cvt.hi = G_UITOFP Val.hi 3112 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3113 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3114 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3115 // %mul2 = G_FMUL %mul1, 2**(-32) 3116 // %trunc = G_INTRINSIC_TRUNC %mul2 3117 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3118 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3119 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3120 Register Val) { 3121 const LLT S32 = LLT::scalar(32); 3122 auto Unmerge = B.buildUnmerge(S32, Val); 3123 3124 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3125 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3126 3127 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3128 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3129 3130 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3131 auto Mul1 = 3132 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3133 3134 // 2**(-32) 3135 auto Mul2 = 3136 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3137 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3138 3139 // -(2**32) 3140 auto Mad2 = B.buildFMAD(S32, Trunc, 3141 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3142 3143 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3144 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3145 3146 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3147 } 3148 3149 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3150 Register DstDivReg, 3151 Register DstRemReg, 3152 Register Numer, 3153 Register Denom) const { 3154 const LLT S32 = LLT::scalar(32); 3155 const LLT S64 = LLT::scalar(64); 3156 const LLT S1 = LLT::scalar(1); 3157 Register RcpLo, RcpHi; 3158 3159 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3160 3161 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3162 3163 auto Zero64 = B.buildConstant(S64, 0); 3164 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3165 3166 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3167 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3168 3169 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3170 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3171 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3172 3173 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3174 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3175 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3176 3177 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3178 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3179 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3180 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3181 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3182 3183 auto Zero32 = B.buildConstant(S32, 0); 3184 auto 
Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3185 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3186 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3187 3188 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3189 Register NumerLo = UnmergeNumer.getReg(0); 3190 Register NumerHi = UnmergeNumer.getReg(1); 3191 3192 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3193 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3194 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3195 Register Mul3_Lo = UnmergeMul3.getReg(0); 3196 Register Mul3_Hi = UnmergeMul3.getReg(1); 3197 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3198 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3199 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3200 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3201 3202 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3203 Register DenomLo = UnmergeDenom.getReg(0); 3204 Register DenomHi = UnmergeDenom.getReg(1); 3205 3206 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3207 auto C1 = B.buildSExt(S32, CmpHi); 3208 3209 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3210 auto C2 = B.buildSExt(S32, CmpLo); 3211 3212 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3213 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3214 3215 // TODO: Here and below portions of the code can be enclosed into if/endif. 3216 // Currently control flow is unconditional and we have 4 selects after 3217 // potential endif to substitute PHIs. 3218 3219 // if C3 != 0 ... 3220 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3221 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3222 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3223 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3224 3225 auto One64 = B.buildConstant(S64, 1); 3226 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3227 3228 auto C4 = 3229 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3230 auto C5 = 3231 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3232 auto C6 = B.buildSelect( 3233 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3234 3235 // if (C6 != 0) 3236 auto Add4 = B.buildAdd(S64, Add3, One64); 3237 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3238 3239 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3240 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3241 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3242 3243 // endif C6 3244 // endif C3 3245 3246 if (DstDivReg) { 3247 auto Sel1 = B.buildSelect( 3248 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3249 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3250 Sel1, MulHi3); 3251 } 3252 3253 if (DstRemReg) { 3254 auto Sel2 = B.buildSelect( 3255 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3256 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3257 Sel2, Sub1); 3258 } 3259 } 3260 3261 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3262 MachineRegisterInfo &MRI, 3263 MachineIRBuilder &B) const { 3264 Register DstDivReg, DstRemReg; 3265 switch (MI.getOpcode()) { 3266 default: 3267 llvm_unreachable("Unexpected opcode!"); 3268 case AMDGPU::G_UDIV: { 3269 DstDivReg = MI.getOperand(0).getReg(); 3270 break; 3271 } 3272 case AMDGPU::G_UREM: { 3273 DstRemReg = MI.getOperand(0).getReg(); 
3274 break; 3275 } 3276 case AMDGPU::G_UDIVREM: { 3277 DstDivReg = MI.getOperand(0).getReg(); 3278 DstRemReg = MI.getOperand(1).getReg(); 3279 break; 3280 } 3281 } 3282 3283 const LLT S64 = LLT::scalar(64); 3284 const LLT S32 = LLT::scalar(32); 3285 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3286 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3287 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3288 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3289 3290 if (Ty == S32) 3291 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3292 else if (Ty == S64) 3293 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3294 else 3295 return false; 3296 3297 MI.eraseFromParent(); 3298 return true; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3302 MachineRegisterInfo &MRI, 3303 MachineIRBuilder &B) const { 3304 const LLT S64 = LLT::scalar(64); 3305 const LLT S32 = LLT::scalar(32); 3306 3307 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3308 if (Ty != S32 && Ty != S64) 3309 return false; 3310 3311 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3312 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3313 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3314 3315 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3316 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3317 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3318 3319 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3320 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3321 3322 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3323 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3324 3325 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3326 switch (MI.getOpcode()) { 3327 default: 3328 llvm_unreachable("Unexpected opcode!"); 3329 case AMDGPU::G_SDIV: { 3330 DstDivReg = MI.getOperand(0).getReg(); 3331 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3332 break; 3333 } 3334 case AMDGPU::G_SREM: { 3335 DstRemReg = MI.getOperand(0).getReg(); 3336 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3337 break; 3338 } 3339 case AMDGPU::G_SDIVREM: { 3340 DstDivReg = MI.getOperand(0).getReg(); 3341 DstRemReg = MI.getOperand(1).getReg(); 3342 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3343 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3344 break; 3345 } 3346 } 3347 3348 if (Ty == S32) 3349 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3350 else 3351 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3352 3353 if (DstDivReg) { 3354 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3355 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3356 B.buildSub(DstDivReg, SignXor, Sign); 3357 } 3358 3359 if (DstRemReg) { 3360 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3361 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3362 B.buildSub(DstRemReg, SignXor, Sign); 3363 } 3364 3365 MI.eraseFromParent(); 3366 return true; 3367 } 3368 3369 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3370 MachineRegisterInfo &MRI, 3371 MachineIRBuilder &B) const { 3372 Register Res = MI.getOperand(0).getReg(); 3373 Register LHS = MI.getOperand(1).getReg(); 3374 Register RHS = MI.getOperand(2).getReg(); 3375 uint16_t Flags = MI.getFlags(); 3376 LLT ResTy = MRI.getType(Res); 3377 3378 const MachineFunction &MF = B.getMF(); 3379 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3380 MI.getFlag(MachineInstr::FmAfn); 3381 3382 if 
(!AllowInaccurateRcp) 3383 return false; 3384 3385 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3386 // 1 / x -> RCP(x) 3387 if (CLHS->isExactlyValue(1.0)) { 3388 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3389 .addUse(RHS) 3390 .setMIFlags(Flags); 3391 3392 MI.eraseFromParent(); 3393 return true; 3394 } 3395 3396 // -1 / x -> RCP( FNEG(x) ) 3397 if (CLHS->isExactlyValue(-1.0)) { 3398 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3399 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3400 .addUse(FNeg.getReg(0)) 3401 .setMIFlags(Flags); 3402 3403 MI.eraseFromParent(); 3404 return true; 3405 } 3406 } 3407 3408 // x / y -> x * (1.0 / y) 3409 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3410 .addUse(RHS) 3411 .setMIFlags(Flags); 3412 B.buildFMul(Res, LHS, RCP, Flags); 3413 3414 MI.eraseFromParent(); 3415 return true; 3416 } 3417 3418 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3419 MachineRegisterInfo &MRI, 3420 MachineIRBuilder &B) const { 3421 Register Res = MI.getOperand(0).getReg(); 3422 Register X = MI.getOperand(1).getReg(); 3423 Register Y = MI.getOperand(2).getReg(); 3424 uint16_t Flags = MI.getFlags(); 3425 LLT ResTy = MRI.getType(Res); 3426 3427 const MachineFunction &MF = B.getMF(); 3428 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3429 MI.getFlag(MachineInstr::FmAfn); 3430 3431 if (!AllowInaccurateRcp) 3432 return false; 3433 3434 auto NegY = B.buildFNeg(ResTy, Y); 3435 auto One = B.buildFConstant(ResTy, 1.0); 3436 3437 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3438 .addUse(Y) 3439 .setMIFlags(Flags); 3440 3441 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3442 R = B.buildFMA(ResTy, Tmp0, R, R); 3443 3444 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3445 R = B.buildFMA(ResTy, Tmp1, R, R); 3446 3447 auto Ret = B.buildFMul(ResTy, X, R); 3448 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3449 3450 B.buildFMA(Res, Tmp2, R, Ret); 3451 MI.eraseFromParent(); 3452 return true; 3453 } 3454 3455 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3456 MachineRegisterInfo &MRI, 3457 MachineIRBuilder &B) const { 3458 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3459 return true; 3460 3461 Register Res = MI.getOperand(0).getReg(); 3462 Register LHS = MI.getOperand(1).getReg(); 3463 Register RHS = MI.getOperand(2).getReg(); 3464 3465 uint16_t Flags = MI.getFlags(); 3466 3467 LLT S16 = LLT::scalar(16); 3468 LLT S32 = LLT::scalar(32); 3469 3470 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3471 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3472 3473 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3474 .addUse(RHSExt.getReg(0)) 3475 .setMIFlags(Flags); 3476 3477 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3478 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3479 3480 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3481 .addUse(RDst.getReg(0)) 3482 .addUse(RHS) 3483 .addUse(LHS) 3484 .setMIFlags(Flags); 3485 3486 MI.eraseFromParent(); 3487 return true; 3488 } 3489 3490 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3491 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3492 static void toggleSPDenormMode(bool Enable, 3493 MachineIRBuilder &B, 3494 const GCNSubtarget &ST, 3495 AMDGPU::SIModeRegisterDefaults Mode) { 3496 // Set SP denorm mode to this value. 3497 unsigned SPDenormMode = 3498 Enable ? 
FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3499 3500 if (ST.hasDenormModeInst()) { 3501 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3502 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3503 3504 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3505 B.buildInstr(AMDGPU::S_DENORM_MODE) 3506 .addImm(NewDenormModeValue); 3507 3508 } else { 3509 // Select FP32 bit field in mode register. 3510 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3511 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3512 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3513 3514 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3515 .addImm(SPDenormMode) 3516 .addImm(SPDenormModeBitField); 3517 } 3518 } 3519 3520 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3521 MachineRegisterInfo &MRI, 3522 MachineIRBuilder &B) const { 3523 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3524 return true; 3525 3526 Register Res = MI.getOperand(0).getReg(); 3527 Register LHS = MI.getOperand(1).getReg(); 3528 Register RHS = MI.getOperand(2).getReg(); 3529 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3530 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3531 3532 uint16_t Flags = MI.getFlags(); 3533 3534 LLT S32 = LLT::scalar(32); 3535 LLT S1 = LLT::scalar(1); 3536 3537 auto One = B.buildFConstant(S32, 1.0f); 3538 3539 auto DenominatorScaled = 3540 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3541 .addUse(LHS) 3542 .addUse(RHS) 3543 .addImm(0) 3544 .setMIFlags(Flags); 3545 auto NumeratorScaled = 3546 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3547 .addUse(LHS) 3548 .addUse(RHS) 3549 .addImm(1) 3550 .setMIFlags(Flags); 3551 3552 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3553 .addUse(DenominatorScaled.getReg(0)) 3554 .setMIFlags(Flags); 3555 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3556 3557 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3558 // aren't modeled as reading it. 
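  // What follows is roughly a Newton-Raphson refinement of the approximate
  // reciprocal of the scaled denominator, then one refinement of the quotient,
  // where d/n are the scaled denominator/numerator and r the approximate rcp:
  //   e  = 1 - d * r                (Fma0)
  //   r1 = r + e * r                (Fma1)
  //   q  = n * r1                   (Mul)
  //   q1 = q + (n - d * q) * r1     (Fma2, Fma3)
  // The final residual n - d * q1 (Fma4) feeds div_fmas/div_fixup, which undo
  // the div_scale scaling and handle the special cases. FP32 denormals are
  // temporarily enabled around the FMA sequence (when not already on) so the
  // intermediate results are not flushed.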
3559 if (!Mode.allFP32Denormals()) 3560 toggleSPDenormMode(true, B, ST, Mode); 3561 3562 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3563 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3564 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3565 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3566 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3567 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3568 3569 if (!Mode.allFP32Denormals()) 3570 toggleSPDenormMode(false, B, ST, Mode); 3571 3572 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3573 .addUse(Fma4.getReg(0)) 3574 .addUse(Fma1.getReg(0)) 3575 .addUse(Fma3.getReg(0)) 3576 .addUse(NumeratorScaled.getReg(1)) 3577 .setMIFlags(Flags); 3578 3579 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3580 .addUse(Fmas.getReg(0)) 3581 .addUse(RHS) 3582 .addUse(LHS) 3583 .setMIFlags(Flags); 3584 3585 MI.eraseFromParent(); 3586 return true; 3587 } 3588 3589 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3590 MachineRegisterInfo &MRI, 3591 MachineIRBuilder &B) const { 3592 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3593 return true; 3594 3595 Register Res = MI.getOperand(0).getReg(); 3596 Register LHS = MI.getOperand(1).getReg(); 3597 Register RHS = MI.getOperand(2).getReg(); 3598 3599 uint16_t Flags = MI.getFlags(); 3600 3601 LLT S64 = LLT::scalar(64); 3602 LLT S1 = LLT::scalar(1); 3603 3604 auto One = B.buildFConstant(S64, 1.0); 3605 3606 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3607 .addUse(LHS) 3608 .addUse(RHS) 3609 .addImm(0) 3610 .setMIFlags(Flags); 3611 3612 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3613 3614 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3615 .addUse(DivScale0.getReg(0)) 3616 .setMIFlags(Flags); 3617 3618 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3619 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3620 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3621 3622 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3623 .addUse(LHS) 3624 .addUse(RHS) 3625 .addImm(1) 3626 .setMIFlags(Flags); 3627 3628 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3629 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3630 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3631 3632 Register Scale; 3633 if (!ST.hasUsableDivScaleConditionOutput()) { 3634 // Workaround a hardware bug on SI where the condition output from div_scale 3635 // is not usable. 
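    // The condition is reconstructed manually instead: compare the high dwords
    // of the original operands against the high dwords of the div_scale
    // results to infer whether scaling was applied, and xor the two
    // comparisons to form the flag div_fmas expects.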
3636 3637 LLT S32 = LLT::scalar(32); 3638 3639 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3640 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3641 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3642 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3643 3644 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3645 Scale1Unmerge.getReg(1)); 3646 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3647 Scale0Unmerge.getReg(1)); 3648 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3649 } else { 3650 Scale = DivScale1.getReg(1); 3651 } 3652 3653 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3654 .addUse(Fma4.getReg(0)) 3655 .addUse(Fma3.getReg(0)) 3656 .addUse(Mul.getReg(0)) 3657 .addUse(Scale) 3658 .setMIFlags(Flags); 3659 3660 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3661 .addUse(Fmas.getReg(0)) 3662 .addUse(RHS) 3663 .addUse(LHS) 3664 .setMIFlags(Flags); 3665 3666 MI.eraseFromParent(); 3667 return true; 3668 } 3669 3670 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3671 MachineRegisterInfo &MRI, 3672 MachineIRBuilder &B) const { 3673 Register Res = MI.getOperand(0).getReg(); 3674 Register LHS = MI.getOperand(2).getReg(); 3675 Register RHS = MI.getOperand(3).getReg(); 3676 uint16_t Flags = MI.getFlags(); 3677 3678 LLT S32 = LLT::scalar(32); 3679 LLT S1 = LLT::scalar(1); 3680 3681 auto Abs = B.buildFAbs(S32, RHS, Flags); 3682 const APFloat C0Val(1.0f); 3683 3684 auto C0 = B.buildConstant(S32, 0x6f800000); 3685 auto C1 = B.buildConstant(S32, 0x2f800000); 3686 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3687 3688 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3689 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3690 3691 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3692 3693 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3694 .addUse(Mul0.getReg(0)) 3695 .setMIFlags(Flags); 3696 3697 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3698 3699 B.buildFMul(Res, Sel, Mul1, Flags); 3700 3701 MI.eraseFromParent(); 3702 return true; 3703 } 3704 3705 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3706 // FIXME: Why do we handle this one but not other removed instructions? 3707 // 3708 // Reciprocal square root. The clamp prevents infinite results, clamping 3709 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3710 // +-max_float. 3711 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3712 MachineRegisterInfo &MRI, 3713 MachineIRBuilder &B) const { 3714 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3715 return true; 3716 3717 Register Dst = MI.getOperand(0).getReg(); 3718 Register Src = MI.getOperand(2).getReg(); 3719 auto Flags = MI.getFlags(); 3720 3721 LLT Ty = MRI.getType(Dst); 3722 3723 const fltSemantics *FltSemantics; 3724 if (Ty == LLT::scalar(32)) 3725 FltSemantics = &APFloat::IEEEsingle(); 3726 else if (Ty == LLT::scalar(64)) 3727 FltSemantics = &APFloat::IEEEdouble(); 3728 else 3729 return false; 3730 3731 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3732 .addUse(Src) 3733 .setMIFlags(Flags); 3734 3735 // We don't need to concern ourselves with the snan handling difference, since 3736 // the rsq quieted (or not) so use the one which will directly select. 
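  // Clamp the rsq result into [-max_float, +max_float]: min against
  // +max_float, then max against -max_float, using the IEEE variants of the
  // min/max opcodes when the function runs in IEEE mode so selection picks
  // instructions with matching NaN behavior.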
3737 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3738 const bool UseIEEE = MFI->getMode().IEEE; 3739 3740 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3741 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3742 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3743 3744 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3745 3746 if (UseIEEE) 3747 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3748 else 3749 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3750 MI.eraseFromParent(); 3751 return true; 3752 } 3753 3754 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3755 switch (IID) { 3756 case Intrinsic::amdgcn_ds_fadd: 3757 return AMDGPU::G_ATOMICRMW_FADD; 3758 case Intrinsic::amdgcn_ds_fmin: 3759 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3760 case Intrinsic::amdgcn_ds_fmax: 3761 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3762 default: 3763 llvm_unreachable("not a DS FP intrinsic"); 3764 } 3765 } 3766 3767 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3768 MachineInstr &MI, 3769 Intrinsic::ID IID) const { 3770 GISelChangeObserver &Observer = Helper.Observer; 3771 Observer.changingInstr(MI); 3772 3773 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3774 3775 // The remaining operands were used to set fields in the MemOperand on 3776 // construction. 3777 for (int I = 6; I > 3; --I) 3778 MI.removeOperand(I); 3779 3780 MI.removeOperand(1); // Remove the intrinsic ID. 3781 Observer.changedInstr(MI); 3782 return true; 3783 } 3784 3785 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3786 MachineRegisterInfo &MRI, 3787 MachineIRBuilder &B) const { 3788 uint64_t Offset = 3789 ST.getTargetLowering()->getImplicitParameterOffset( 3790 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3791 LLT DstTy = MRI.getType(DstReg); 3792 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3793 3794 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3795 if (!loadInputValue(KernargPtrReg, B, 3796 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3797 return false; 3798 3799 // FIXME: This should be nuw 3800 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3801 return true; 3802 } 3803 3804 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3805 MachineRegisterInfo &MRI, 3806 MachineIRBuilder &B) const { 3807 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3808 if (!MFI->isEntryFunction()) { 3809 return legalizePreloadedArgIntrin(MI, MRI, B, 3810 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3811 } 3812 3813 Register DstReg = MI.getOperand(0).getReg(); 3814 if (!getImplicitArgPtr(DstReg, MRI, B)) 3815 return false; 3816 3817 MI.eraseFromParent(); 3818 return true; 3819 } 3820 3821 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3822 MachineRegisterInfo &MRI, 3823 MachineIRBuilder &B, 3824 unsigned AddrSpace) const { 3825 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3826 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3827 Register Hi32 = Unmerge.getReg(1); 3828 3829 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3830 MI.eraseFromParent(); 3831 return true; 3832 } 3833 3834 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3835 // offset (the offset that is included in bounds checking and swizzling, to be 3836 // split between the instruction's voffset and immoffset fields) 
and soffset 3837 // (the offset that is excluded from bounds checking and swizzling, to go in 3838 // the instruction's soffset field). This function takes the first kind of 3839 // offset and figures out how to split it between voffset and immoffset. 3840 std::pair<Register, unsigned> 3841 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3842 Register OrigOffset) const { 3843 const unsigned MaxImm = 4095; 3844 Register BaseReg; 3845 unsigned ImmOffset; 3846 const LLT S32 = LLT::scalar(32); 3847 MachineRegisterInfo &MRI = *B.getMRI(); 3848 3849 std::tie(BaseReg, ImmOffset) = 3850 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3851 3852 // If BaseReg is a pointer, convert it to int. 3853 if (MRI.getType(BaseReg).isPointer()) 3854 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3855 3856 // If the immediate value is too big for the immoffset field, put the value 3857 // and -4096 into the immoffset field so that the value that is copied/added 3858 // for the voffset field is a multiple of 4096, and it stands more chance 3859 // of being CSEd with the copy/add for another similar load/store. 3860 // However, do not do that rounding down to a multiple of 4096 if that is a 3861 // negative number, as it appears to be illegal to have a negative offset 3862 // in the vgpr, even if adding the immediate offset makes it positive. 3863 unsigned Overflow = ImmOffset & ~MaxImm; 3864 ImmOffset -= Overflow; 3865 if ((int32_t)Overflow < 0) { 3866 Overflow += ImmOffset; 3867 ImmOffset = 0; 3868 } 3869 3870 if (Overflow != 0) { 3871 if (!BaseReg) { 3872 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3873 } else { 3874 auto OverflowVal = B.buildConstant(S32, Overflow); 3875 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3876 } 3877 } 3878 3879 if (!BaseReg) 3880 BaseReg = B.buildConstant(S32, 0).getReg(0); 3881 3882 return std::make_pair(BaseReg, ImmOffset); 3883 } 3884 3885 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3886 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3887 Register VOffset, Register SOffset, 3888 unsigned ImmOffset, Register VIndex, 3889 MachineRegisterInfo &MRI) const { 3890 Optional<ValueAndVReg> MaybeVOffsetVal = 3891 getIConstantVRegValWithLookThrough(VOffset, MRI); 3892 Optional<ValueAndVReg> MaybeSOffsetVal = 3893 getIConstantVRegValWithLookThrough(SOffset, MRI); 3894 Optional<ValueAndVReg> MaybeVIndexVal = 3895 getIConstantVRegValWithLookThrough(VIndex, MRI); 3896 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3897 // update the MMO with that offset. The stride is unknown so we can only do 3898 // this if VIndex is constant 0. 3899 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3900 MaybeVIndexVal->Value == 0) { 3901 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3902 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3903 MMO->setOffset(TotalOffset); 3904 } else { 3905 // We don't have a constant combined offset to use in the MMO. Give up. 3906 MMO->setValue((Value *)nullptr); 3907 } 3908 } 3909 3910 /// Handle register layout difference for f16 images for some subtargets. 
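///
/// On subtargets with unpacked D16 memory accesses each s16 element is
/// any-extended into its own s32 register. Subtargets with the image-store D16
/// bug instead pad the packed data with undef up to the register size the
/// instruction expects, and <3 x s16> is widened to <4 x s16> so the result is
/// a legal register type.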
3911 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3912 MachineRegisterInfo &MRI, 3913 Register Reg, 3914 bool ImageStore) const { 3915 const LLT S16 = LLT::scalar(16); 3916 const LLT S32 = LLT::scalar(32); 3917 LLT StoreVT = MRI.getType(Reg); 3918 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3919 3920 if (ST.hasUnpackedD16VMem()) { 3921 auto Unmerge = B.buildUnmerge(S16, Reg); 3922 3923 SmallVector<Register, 4> WideRegs; 3924 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3925 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3926 3927 int NumElts = StoreVT.getNumElements(); 3928 3929 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3930 .getReg(0); 3931 } 3932 3933 if (ImageStore && ST.hasImageStoreD16Bug()) { 3934 if (StoreVT.getNumElements() == 2) { 3935 SmallVector<Register, 4> PackedRegs; 3936 Reg = B.buildBitcast(S32, Reg).getReg(0); 3937 PackedRegs.push_back(Reg); 3938 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3939 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3940 .getReg(0); 3941 } 3942 3943 if (StoreVT.getNumElements() == 3) { 3944 SmallVector<Register, 4> PackedRegs; 3945 auto Unmerge = B.buildUnmerge(S16, Reg); 3946 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3947 PackedRegs.push_back(Unmerge.getReg(I)); 3948 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3949 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3950 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3951 } 3952 3953 if (StoreVT.getNumElements() == 4) { 3954 SmallVector<Register, 4> PackedRegs; 3955 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3956 auto Unmerge = B.buildUnmerge(S32, Reg); 3957 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3958 PackedRegs.push_back(Unmerge.getReg(I)); 3959 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3960 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3961 .getReg(0); 3962 } 3963 3964 llvm_unreachable("invalid data type"); 3965 } 3966 3967 if (StoreVT == LLT::fixed_vector(3, S16)) { 3968 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3969 .getReg(0); 3970 } 3971 return Reg; 3972 } 3973 3974 Register AMDGPULegalizerInfo::fixStoreSourceType( 3975 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3976 MachineRegisterInfo *MRI = B.getMRI(); 3977 LLT Ty = MRI->getType(VData); 3978 3979 const LLT S16 = LLT::scalar(16); 3980 3981 // Fixup illegal register types for i8 stores. 
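  // Sub-dword scalar sources are any-extended to s32 so the selected
  // buffer-store instructions see a full 32-bit VGPR value; short s16 vectors
  // only need repacking for the d16 (format) variants, via handleD16VData.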
3982 if (Ty == LLT::scalar(8) || Ty == S16) { 3983 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3984 return AnyExt; 3985 } 3986 3987 if (Ty.isVector()) { 3988 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3989 if (IsFormat) 3990 return handleD16VData(B, *MRI, VData); 3991 } 3992 } 3993 3994 return VData; 3995 } 3996 3997 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3998 MachineRegisterInfo &MRI, 3999 MachineIRBuilder &B, 4000 bool IsTyped, 4001 bool IsFormat) const { 4002 Register VData = MI.getOperand(1).getReg(); 4003 LLT Ty = MRI.getType(VData); 4004 LLT EltTy = Ty.getScalarType(); 4005 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4006 const LLT S32 = LLT::scalar(32); 4007 4008 VData = fixStoreSourceType(B, VData, IsFormat); 4009 Register RSrc = MI.getOperand(2).getReg(); 4010 4011 MachineMemOperand *MMO = *MI.memoperands_begin(); 4012 const int MemSize = MMO->getSize(); 4013 4014 unsigned ImmOffset; 4015 4016 // The typed intrinsics add an immediate after the registers. 4017 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4018 4019 // The struct intrinsic variants add one additional operand over raw. 4020 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4021 Register VIndex; 4022 int OpOffset = 0; 4023 if (HasVIndex) { 4024 VIndex = MI.getOperand(3).getReg(); 4025 OpOffset = 1; 4026 } else { 4027 VIndex = B.buildConstant(S32, 0).getReg(0); 4028 } 4029 4030 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4031 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4032 4033 unsigned Format = 0; 4034 if (IsTyped) { 4035 Format = MI.getOperand(5 + OpOffset).getImm(); 4036 ++OpOffset; 4037 } 4038 4039 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4040 4041 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4042 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4043 4044 unsigned Opc; 4045 if (IsTyped) { 4046 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4047 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4048 } else if (IsFormat) { 4049 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4050 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4051 } else { 4052 switch (MemSize) { 4053 case 1: 4054 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4055 break; 4056 case 2: 4057 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4058 break; 4059 default: 4060 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4061 break; 4062 } 4063 } 4064 4065 auto MIB = B.buildInstr(Opc) 4066 .addUse(VData) // vdata 4067 .addUse(RSrc) // rsrc 4068 .addUse(VIndex) // vindex 4069 .addUse(VOffset) // voffset 4070 .addUse(SOffset) // soffset 4071 .addImm(ImmOffset); // offset(imm) 4072 4073 if (IsTyped) 4074 MIB.addImm(Format); 4075 4076 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4077 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4078 .addMemOperand(MMO); 4079 4080 MI.eraseFromParent(); 4081 return true; 4082 } 4083 4084 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4085 MachineRegisterInfo &MRI, 4086 MachineIRBuilder &B, 4087 bool IsFormat, 4088 bool IsTyped) const { 4089 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4090 MachineMemOperand *MMO = *MI.memoperands_begin(); 4091 const LLT MemTy = MMO->getMemoryType(); 4092 const LLT S32 = LLT::scalar(32); 4093 4094 Register Dst = MI.getOperand(0).getReg(); 4095 Register RSrc = MI.getOperand(2).getReg(); 4096 4097 // The typed intrinsics add an immediate after the registers. 
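  // As in the store case, the raw and struct variants are distinguished only
  // by operand count: the struct forms carry an extra vindex operand and the
  // typed forms an extra format immediate.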
4098 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4099 4100 // The struct intrinsic variants add one additional operand over raw. 4101 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4102 Register VIndex; 4103 int OpOffset = 0; 4104 if (HasVIndex) { 4105 VIndex = MI.getOperand(3).getReg(); 4106 OpOffset = 1; 4107 } else { 4108 VIndex = B.buildConstant(S32, 0).getReg(0); 4109 } 4110 4111 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4112 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4113 4114 unsigned Format = 0; 4115 if (IsTyped) { 4116 Format = MI.getOperand(5 + OpOffset).getImm(); 4117 ++OpOffset; 4118 } 4119 4120 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4121 unsigned ImmOffset; 4122 4123 LLT Ty = MRI.getType(Dst); 4124 LLT EltTy = Ty.getScalarType(); 4125 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4126 const bool Unpacked = ST.hasUnpackedD16VMem(); 4127 4128 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4129 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4130 4131 unsigned Opc; 4132 4133 if (IsTyped) { 4134 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4135 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4136 } else if (IsFormat) { 4137 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4138 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4139 } else { 4140 switch (MemTy.getSizeInBits()) { 4141 case 8: 4142 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4143 break; 4144 case 16: 4145 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4146 break; 4147 default: 4148 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4149 break; 4150 } 4151 } 4152 4153 Register LoadDstReg; 4154 4155 bool IsExtLoad = 4156 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4157 LLT UnpackedTy = Ty.changeElementSize(32); 4158 4159 if (IsExtLoad) 4160 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4161 else if (Unpacked && IsD16 && Ty.isVector()) 4162 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4163 else 4164 LoadDstReg = Dst; 4165 4166 auto MIB = B.buildInstr(Opc) 4167 .addDef(LoadDstReg) // vdata 4168 .addUse(RSrc) // rsrc 4169 .addUse(VIndex) // vindex 4170 .addUse(VOffset) // voffset 4171 .addUse(SOffset) // soffset 4172 .addImm(ImmOffset); // offset(imm) 4173 4174 if (IsTyped) 4175 MIB.addImm(Format); 4176 4177 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4178 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4179 .addMemOperand(MMO); 4180 4181 if (LoadDstReg != Dst) { 4182 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4183 4184 // Widen result for extending loads was widened. 4185 if (IsExtLoad) 4186 B.buildTrunc(Dst, LoadDstReg); 4187 else { 4188 // Repack to original 16-bit vector result 4189 // FIXME: G_TRUNC should work, but legalization currently fails 4190 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4191 SmallVector<Register, 4> Repack; 4192 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4193 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4194 B.buildMerge(Dst, Repack); 4195 } 4196 } 4197 4198 MI.eraseFromParent(); 4199 return true; 4200 } 4201 4202 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4203 MachineIRBuilder &B, 4204 bool IsInc) const { 4205 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4206 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4207 B.buildInstr(Opc) 4208 .addDef(MI.getOperand(0).getReg()) 4209 .addUse(MI.getOperand(2).getReg()) 4210 .addUse(MI.getOperand(3).getReg()) 4211 .cloneMemRefs(MI); 4212 MI.eraseFromParent(); 4213 return true; 4214 } 4215 4216 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4217 switch (IntrID) { 4218 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4219 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4220 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4221 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4222 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4223 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4224 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4225 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4226 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4227 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4228 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4230 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4231 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4232 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4233 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4234 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4235 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4236 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4237 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4238 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4239 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4240 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4241 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4242 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4243 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4245 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4246 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4247 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4248 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4249 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4250 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4251 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4252 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4253 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4254 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4255 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4256 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4257 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4258 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4259 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4260 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4261 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4262 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4263 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4264 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4265 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4266 default: 4267 llvm_unreachable("unhandled atomic opcode"); 4268 } 4269 } 4270 4271 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4272 MachineIRBuilder &B, 4273 Intrinsic::ID IID) const { 4274 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4275 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4276 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4277 4278 Register Dst; 4279 4280 int OpOffset = 0; 4281 if (HasReturn) { 4282 // A few FP atomics do not support return values. 
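    // The operand indices used below assume a result def is present; the void
    // forms compensate with OpOffset = -1.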
4283 Dst = MI.getOperand(0).getReg(); 4284 } else { 4285 OpOffset = -1; 4286 } 4287 4288 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4289 Register CmpVal; 4290 4291 if (IsCmpSwap) { 4292 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4293 ++OpOffset; 4294 } 4295 4296 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4297 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4298 4299 // The struct intrinsic variants add one additional operand over raw. 4300 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4301 Register VIndex; 4302 if (HasVIndex) { 4303 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4304 ++OpOffset; 4305 } else { 4306 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4307 } 4308 4309 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4310 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4311 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4312 4313 MachineMemOperand *MMO = *MI.memoperands_begin(); 4314 4315 unsigned ImmOffset; 4316 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4317 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4318 4319 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4320 4321 if (HasReturn) 4322 MIB.addDef(Dst); 4323 4324 MIB.addUse(VData); // vdata 4325 4326 if (IsCmpSwap) 4327 MIB.addReg(CmpVal); 4328 4329 MIB.addUse(RSrc) // rsrc 4330 .addUse(VIndex) // vindex 4331 .addUse(VOffset) // voffset 4332 .addUse(SOffset) // soffset 4333 .addImm(ImmOffset) // offset(imm) 4334 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4335 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4336 .addMemOperand(MMO); 4337 4338 MI.eraseFromParent(); 4339 return true; 4340 } 4341 4342 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4343 /// vector with s16 typed elements. 4344 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4345 SmallVectorImpl<Register> &PackedAddrs, 4346 unsigned ArgOffset, 4347 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4348 bool IsA16, bool IsG16) { 4349 const LLT S16 = LLT::scalar(16); 4350 const LLT V2S16 = LLT::fixed_vector(2, 16); 4351 auto EndIdx = Intr->VAddrEnd; 4352 4353 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4354 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4355 if (!SrcOp.isReg()) 4356 continue; // _L to _LZ may have eliminated this. 4357 4358 Register AddrReg = SrcOp.getReg(); 4359 4360 if ((I < Intr->GradientStart) || 4361 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4362 (I >= Intr->CoordStart && !IsA16)) { 4363 if ((I < Intr->GradientStart) && IsA16 && 4364 (B.getMRI()->getType(AddrReg) == S16)) { 4365 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4366 // Special handling of bias when A16 is on. Bias is of type half but 4367 // occupies full 32-bit. 4368 PackedAddrs.push_back( 4369 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4370 .getReg(0)); 4371 } else { 4372 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4373 "Bias needs to be converted to 16 bit in A16 mode"); 4374 // Handle any gradient or coordinate operands that should not be packed 4375 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4376 PackedAddrs.push_back(AddrReg); 4377 } 4378 } else { 4379 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4380 // derivatives dx/dh and dx/dv are packed with undef. 
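      // Pairs of consecutive 16-bit components are packed into a single v2s16
      // dword; a trailing odd component, or one whose partner operand was
      // dropped, is padded with undef instead.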
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
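  // A second explicit def means the TFE (texture fail enable) flag is in use:
  // the error dword is returned appended to the loaded data and is split back
  // out near the end of this function.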
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
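  // 16-bit addresses (A16) and gradients (G16) are packed two per dword. If
  // the NSA encoding is unavailable or unprofitable, the packed dwords are
  // then concatenated into a single wide vector address register.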
4541 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4542 // 16 bit gradients are supported, but are tied to the A16 control 4543 // so both gradients and addresses must be 16 bit 4544 return false; 4545 } 4546 4547 if (IsA16 && !ST.hasA16()) { 4548 // A16 not supported 4549 return false; 4550 } 4551 4552 if (IsA16 || IsG16) { 4553 if (Intr->NumVAddrs > 1) { 4554 SmallVector<Register, 4> PackedRegs; 4555 4556 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4557 IsG16); 4558 4559 // See also below in the non-a16 branch 4560 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4561 PackedRegs.size() <= ST.getNSAMaxSize(); 4562 4563 if (!UseNSA && PackedRegs.size() > 1) { 4564 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4565 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4566 PackedRegs[0] = Concat.getReg(0); 4567 PackedRegs.resize(1); 4568 } 4569 4570 const unsigned NumPacked = PackedRegs.size(); 4571 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4572 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4573 if (!SrcOp.isReg()) { 4574 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4575 continue; 4576 } 4577 4578 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4579 4580 if (I - Intr->VAddrStart < NumPacked) 4581 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4582 else 4583 SrcOp.setReg(AMDGPU::NoRegister); 4584 } 4585 } 4586 } else { 4587 // If the register allocator cannot place the address registers contiguously 4588 // without introducing moves, then using the non-sequential address encoding 4589 // is always preferable, since it saves VALU instructions and is usually a 4590 // wash in terms of code size or even better. 4591 // 4592 // However, we currently have no way of hinting to the register allocator 4593 // that MIMG addresses should be placed contiguously when it is possible to 4594 // do so, so force non-NSA for the common 2-address case as a heuristic. 4595 // 4596 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4597 // allocation when possible. 4598 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4599 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4600 4601 if (!UseNSA && Intr->NumVAddrs > 1) 4602 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4603 Intr->NumVAddrs); 4604 } 4605 4606 int Flags = 0; 4607 if (IsA16) 4608 Flags |= 1; 4609 if (IsG16) 4610 Flags |= 2; 4611 MI.addOperand(MachineOperand::CreateImm(Flags)); 4612 4613 if (BaseOpcode->Store) { // No TFE for stores? 4614 // TODO: Handle dmask trim 4615 if (!Ty.isVector() || !IsD16) 4616 return true; 4617 4618 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4619 if (RepackedReg != VData) { 4620 MI.getOperand(1).setReg(RepackedReg); 4621 } 4622 4623 return true; 4624 } 4625 4626 Register DstReg = MI.getOperand(0).getReg(); 4627 const LLT EltTy = Ty.getScalarType(); 4628 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4629 4630 // Confirm that the return type is large enough for the dmask specified 4631 if (NumElts < DMaskLanes) 4632 return false; 4633 4634 if (NumElts > 4 || DMaskLanes > 4) 4635 return false; 4636 4637 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4638 const LLT AdjustedTy = 4639 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4640 4641 // The raw dword aligned data component of the load. 
The only legal cases 4642 // where this matters should be when using the packed D16 format, for 4643 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4644 LLT RoundedTy; 4645 4646 // S32 vector to to cover all data, plus TFE result element. 4647 LLT TFETy; 4648 4649 // Register type to use for each loaded component. Will be S32 or V2S16. 4650 LLT RegTy; 4651 4652 if (IsD16 && ST.hasUnpackedD16VMem()) { 4653 RoundedTy = 4654 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4655 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4656 RegTy = S32; 4657 } else { 4658 unsigned EltSize = EltTy.getSizeInBits(); 4659 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4660 unsigned RoundedSize = 32 * RoundedElts; 4661 RoundedTy = LLT::scalarOrVector( 4662 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4663 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4664 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4665 } 4666 4667 // The return type does not need adjustment. 4668 // TODO: Should we change s16 case to s32 or <2 x s16>? 4669 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4670 return true; 4671 4672 Register Dst1Reg; 4673 4674 // Insert after the instruction. 4675 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4676 4677 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4678 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4679 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4680 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4681 4682 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4683 4684 MI.getOperand(0).setReg(NewResultReg); 4685 4686 // In the IR, TFE is supposed to be used with a 2 element struct return 4687 // type. The instruction really returns these two values in one contiguous 4688 // register, with one additional dword beyond the loaded data. Rewrite the 4689 // return type to use a single register result. 4690 4691 if (IsTFE) { 4692 Dst1Reg = MI.getOperand(1).getReg(); 4693 if (MRI->getType(Dst1Reg) != S32) 4694 return false; 4695 4696 // TODO: Make sure the TFE operand bit is set. 4697 MI.removeOperand(1); 4698 4699 // Handle the easy case that requires no repack instructions. 4700 if (Ty == S32) { 4701 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4702 return true; 4703 } 4704 } 4705 4706 // Now figure out how to copy the new result register back into the old 4707 // result. 4708 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4709 4710 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4711 4712 if (ResultNumRegs == 1) { 4713 assert(!IsTFE); 4714 ResultRegs[0] = NewResultReg; 4715 } else { 4716 // We have to repack into a new vector of some kind. 4717 for (int I = 0; I != NumDataRegs; ++I) 4718 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4719 B.buildUnmerge(ResultRegs, NewResultReg); 4720 4721 // Drop the final TFE element to get the data part. The TFE result is 4722 // directly written to the right place already. 4723 if (IsTFE) 4724 ResultRegs.resize(NumDataRegs); 4725 } 4726 4727 // For an s16 scalar result, we form an s32 result with a truncate regardless 4728 // of packed vs. unpacked. 4729 if (IsD16 && !Ty.isVector()) { 4730 B.buildTrunc(DstReg, ResultRegs[0]); 4731 return true; 4732 } 4733 4734 // Avoid a build/concat_vector of 1 entry. 
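  // A single packed v2s16 data register can simply be bitcast to the
  // destination type.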
4735 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4736 B.buildBitcast(DstReg, ResultRegs[0]); 4737 return true; 4738 } 4739 4740 assert(Ty.isVector()); 4741 4742 if (IsD16) { 4743 // For packed D16 results with TFE enabled, all the data components are 4744 // S32. Cast back to the expected type. 4745 // 4746 // TODO: We don't really need to use load s32 elements. We would only need one 4747 // cast for the TFE result if a multiple of v2s16 was used. 4748 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4749 for (Register &Reg : ResultRegs) 4750 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4751 } else if (ST.hasUnpackedD16VMem()) { 4752 for (Register &Reg : ResultRegs) 4753 Reg = B.buildTrunc(S16, Reg).getReg(0); 4754 } 4755 } 4756 4757 auto padWithUndef = [&](LLT Ty, int NumElts) { 4758 if (NumElts == 0) 4759 return; 4760 Register Undef = B.buildUndef(Ty).getReg(0); 4761 for (int I = 0; I != NumElts; ++I) 4762 ResultRegs.push_back(Undef); 4763 }; 4764 4765 // Pad out any elements eliminated due to the dmask. 4766 LLT ResTy = MRI->getType(ResultRegs[0]); 4767 if (!ResTy.isVector()) { 4768 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4769 B.buildBuildVector(DstReg, ResultRegs); 4770 return true; 4771 } 4772 4773 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4774 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4775 4776 // Deal with the one annoying legal case. 4777 const LLT V3S16 = LLT::fixed_vector(3, 16); 4778 if (Ty == V3S16) { 4779 if (IsTFE) { 4780 if (ResultRegs.size() == 1) { 4781 NewResultReg = ResultRegs[0]; 4782 } else if (ResultRegs.size() == 2) { 4783 LLT V4S16 = LLT::fixed_vector(4, 16); 4784 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4785 } else { 4786 return false; 4787 } 4788 } 4789 4790 if (MRI->getType(DstReg).getNumElements() < 4791 MRI->getType(NewResultReg).getNumElements()) { 4792 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4793 } else { 4794 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4795 } 4796 return true; 4797 } 4798 4799 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4800 B.buildConcatVectors(DstReg, ResultRegs); 4801 return true; 4802 } 4803 4804 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4805 LegalizerHelper &Helper, MachineInstr &MI) const { 4806 MachineIRBuilder &B = Helper.MIRBuilder; 4807 GISelChangeObserver &Observer = Helper.Observer; 4808 4809 Register Dst = MI.getOperand(0).getReg(); 4810 LLT Ty = B.getMRI()->getType(Dst); 4811 unsigned Size = Ty.getSizeInBits(); 4812 MachineFunction &MF = B.getMF(); 4813 4814 Observer.changingInstr(MI); 4815 4816 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4817 Ty = getBitcastRegisterType(Ty); 4818 Helper.bitcastDst(MI, Ty, 0); 4819 Dst = MI.getOperand(0).getReg(); 4820 B.setInsertPt(B.getMBB(), MI); 4821 } 4822 4823 // FIXME: We don't really need this intermediate instruction. The intrinsic 4824 // should be fixed to have a memory operand. Since it's readnone, we're not 4825 // allowed to add one. 4826 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4827 MI.removeOperand(1); // Remove intrinsic ID 4828 4829 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4830 // TODO: Should this use datalayout alignment? 
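  // Synthesize a conservative MMO: the load is invariant and dereferenceable,
  // sized to the byte-rounded result width with a 4-byte alignment.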
4831 const unsigned MemSize = (Size + 7) / 8; 4832 const Align MemAlign(4); 4833 MachineMemOperand *MMO = MF.getMachineMemOperand( 4834 MachinePointerInfo(), 4835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4836 MachineMemOperand::MOInvariant, 4837 MemSize, MemAlign); 4838 MI.addMemOperand(MF, MMO); 4839 4840 // There are no 96-bit result scalar loads, but widening to 128-bit should 4841 // always be legal. We may need to restore this to a 96-bit result if it turns 4842 // out this needs to be converted to a vector load during RegBankSelect. 4843 if (!isPowerOf2_32(Size)) { 4844 if (Ty.isVector()) 4845 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4846 else 4847 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4848 } 4849 4850 Observer.changedInstr(MI); 4851 return true; 4852 } 4853 4854 // TODO: Move to selection 4855 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4856 MachineRegisterInfo &MRI, 4857 MachineIRBuilder &B) const { 4858 if (!ST.isTrapHandlerEnabled() || 4859 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4860 return legalizeTrapEndpgm(MI, MRI, B); 4861 4862 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4863 switch (*HsaAbiVer) { 4864 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4865 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4866 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4867 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4868 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4869 return ST.supportsGetDoorbellID() ? 4870 legalizeTrapHsa(MI, MRI, B) : 4871 legalizeTrapHsaQueuePtr(MI, MRI, B); 4872 } 4873 } 4874 4875 llvm_unreachable("Unknown trap handler"); 4876 } 4877 4878 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4879 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4880 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4881 MI.eraseFromParent(); 4882 return true; 4883 } 4884 4885 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4886 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4887 MachineFunction &MF = B.getMF(); 4888 const LLT S64 = LLT::scalar(64); 4889 4890 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4891 // For code object version 5, queue_ptr is passed through implicit kernarg. 4892 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 4893 AMDGPUTargetLowering::ImplicitParameter Param = 4894 AMDGPUTargetLowering::QUEUE_PTR; 4895 uint64_t Offset = 4896 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 4897 4898 Register KernargPtrReg = MRI.createGenericVirtualRegister( 4899 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4900 4901 if (!loadInputValue(KernargPtrReg, B, 4902 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4903 return false; 4904 4905 // TODO: can we be smarter about machine pointer info? 
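    // Address the pointer as kernarg segment base plus the implicit parameter
    // offset, load the 64-bit queue pointer from there, and hand it to the
    // trap handler in SGPR0_SGPR1 as the trap-handler ABI expects.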
4906 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4907 MachineMemOperand *MMO = MF.getMachineMemOperand( 4908 PtrInfo, 4909 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4910 MachineMemOperand::MOInvariant, 4911 LLT::scalar(64), commonAlignment(Align(64), Offset)); 4912 4913 // Pointer address 4914 Register LoadAddr = MRI.createGenericVirtualRegister( 4915 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4916 B.buildPtrAdd(LoadAddr, KernargPtrReg, 4917 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 4918 // Load address 4919 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 4920 B.buildCopy(SGPR01, Temp); 4921 B.buildInstr(AMDGPU::S_TRAP) 4922 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4923 .addReg(SGPR01, RegState::Implicit); 4924 MI.eraseFromParent(); 4925 return true; 4926 } 4927 4928 // Pass queue pointer to trap handler as input, and insert trap instruction 4929 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4930 Register LiveIn = 4931 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4932 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4933 return false; 4934 4935 B.buildCopy(SGPR01, LiveIn); 4936 B.buildInstr(AMDGPU::S_TRAP) 4937 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4938 .addReg(SGPR01, RegState::Implicit); 4939 4940 MI.eraseFromParent(); 4941 return true; 4942 } 4943 4944 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4945 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4946 B.buildInstr(AMDGPU::S_TRAP) 4947 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4948 MI.eraseFromParent(); 4949 return true; 4950 } 4951 4952 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4953 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4954 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4955 // accordingly 4956 if (!ST.isTrapHandlerEnabled() || 4957 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4958 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4959 "debugtrap handler not supported", 4960 MI.getDebugLoc(), DS_Warning); 4961 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4962 Ctx.diagnose(NoTrap); 4963 } else { 4964 // Insert debug-trap instruction 4965 B.buildInstr(AMDGPU::S_TRAP) 4966 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4967 } 4968 4969 MI.eraseFromParent(); 4970 return true; 4971 } 4972 4973 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4974 MachineIRBuilder &B) const { 4975 MachineRegisterInfo &MRI = *B.getMRI(); 4976 const LLT S16 = LLT::scalar(16); 4977 const LLT S32 = LLT::scalar(32); 4978 4979 Register DstReg = MI.getOperand(0).getReg(); 4980 Register NodePtr = MI.getOperand(2).getReg(); 4981 Register RayExtent = MI.getOperand(3).getReg(); 4982 Register RayOrigin = MI.getOperand(4).getReg(); 4983 Register RayDir = MI.getOperand(5).getReg(); 4984 Register RayInvDir = MI.getOperand(6).getReg(); 4985 Register TDescr = MI.getOperand(7).getReg(); 4986 4987 if (!ST.hasGFX10_AEncoding()) { 4988 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4989 "intrinsic not supported on subtarget", 4990 MI.getDebugLoc()); 4991 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4992 return false; 4993 } 4994 4995 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4996 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4997 const unsigned NumVDataDwords = 4; 4998 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4999 const bool UseNSA = 5000 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 5001 const unsigned BaseOpcodes[2][2] = { 5002 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 5003 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 5004 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 5005 int Opcode; 5006 if (UseNSA) { 5007 Opcode = 5008 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 5009 NumVDataDwords, NumVAddrDwords); 5010 } else { 5011 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 5012 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 5013 PowerOf2Ceil(NumVAddrDwords)); 5014 } 5015 assert(Opcode != -1); 5016 5017 SmallVector<Register, 12> Ops; 5018 if (Is64) { 5019 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 5020 Ops.push_back(Unmerge.getReg(0)); 5021 Ops.push_back(Unmerge.getReg(1)); 5022 } else { 5023 Ops.push_back(NodePtr); 5024 } 5025 Ops.push_back(RayExtent); 5026 5027 auto packLanes = [&Ops, &S32, &B](Register Src) { 5028 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5029 Ops.push_back(Unmerge.getReg(0)); 5030 Ops.push_back(Unmerge.getReg(1)); 5031 Ops.push_back(Unmerge.getReg(2)); 5032 }; 5033 5034 packLanes(RayOrigin); 5035 if (IsA16) { 5036 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5037 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5038 Register R1 = MRI.createGenericVirtualRegister(S32); 5039 Register R2 = MRI.createGenericVirtualRegister(S32); 5040 Register R3 = MRI.createGenericVirtualRegister(S32); 5041 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5042 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5043 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5044 Ops.push_back(R1); 5045 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

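// Top-level dispatch for AMDGPU intrinsics that need custom legalization:
// each case either rewrites the instruction in place or forwards to one of
// the dedicated helpers.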
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND user with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
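  // The legacy r600 dispatch queries below are serviced from the kernel
  // argument area at the fixed offsets in SI::KernelInputOffsets.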
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
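  // All of these buffer atomics go through a common helper; the intrinsic ID
  // is passed through so the helper can tell the variants apart.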
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}