//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to get stuck in
    // an infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
      .customFor({S16, S32})
      .scalarize(0)
      .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);


  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
            // Sub-vector (or single element) insert and extract.
            // TODO: verify immediate offset here since lower only works with
            // whole elements.
            const LLT BigTy = Query.Types[BigTyIdx];
            return BigTy.isVector();
          })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
            const LLT BigTy = Query.Types[BigTyIdx];
            const LLT LitTy = Query.Types[LitTyIdx];
            return (BigTy.getSizeInBits() % 32 == 0) &&
                   (LitTy.getSizeInBits() % 16 == 0);
          })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
1660 getActionDefinitionsBuilder(G_FSHR) 1661 .legalFor({{S32, S32}}) 1662 .lowerFor({{V2S16, V2S16}}) 1663 .clampMaxNumElementsStrict(0, S16, 2) 1664 .scalarize(0) 1665 .lower(); 1666 1667 if (ST.hasVOP3PInsts()) { 1668 getActionDefinitionsBuilder(G_FSHL) 1669 .lowerFor({{V2S16, V2S16}}) 1670 .clampMaxNumElementsStrict(0, S16, 2) 1671 .scalarize(0) 1672 .lower(); 1673 } else { 1674 getActionDefinitionsBuilder(G_FSHL) 1675 .scalarize(0) 1676 .lower(); 1677 } 1678 1679 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1680 .legalFor({S64}); 1681 1682 getActionDefinitionsBuilder(G_FENCE) 1683 .alwaysLegal(); 1684 1685 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1686 .scalarize(0) 1687 .minScalar(0, S32) 1688 .lower(); 1689 1690 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1691 .legalFor({{S32, S32}, {S64, S32}}) 1692 .clampScalar(1, S32, S32) 1693 .clampScalar(0, S32, S64) 1694 .widenScalarToNextPow2(0) 1695 .scalarize(0); 1696 1697 getActionDefinitionsBuilder({ 1698 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1699 G_FCOPYSIGN, 1700 1701 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1702 G_ATOMICRMW_NAND, 1703 G_ATOMICRMW_FSUB, 1704 G_READ_REGISTER, 1705 G_WRITE_REGISTER, 1706 1707 G_SADDO, G_SSUBO, 1708 1709 // TODO: Implement 1710 G_FMINIMUM, G_FMAXIMUM}).lower(); 1711 1712 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1713 .lower(); 1714 1715 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1716 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1717 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1718 .unsupported(); 1719 1720 getLegacyLegalizerInfo().computeTables(); 1721 verify(*ST.getInstrInfo()); 1722 } 1723 1724 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1725 MachineInstr &MI) const { 1726 MachineIRBuilder &B = Helper.MIRBuilder; 1727 MachineRegisterInfo &MRI = *B.getMRI(); 1728 1729 switch (MI.getOpcode()) { 1730 case TargetOpcode::G_ADDRSPACE_CAST: 1731 return legalizeAddrSpaceCast(MI, MRI, B); 1732 case TargetOpcode::G_FRINT: 1733 return legalizeFrint(MI, MRI, B); 1734 case TargetOpcode::G_FCEIL: 1735 return legalizeFceil(MI, MRI, B); 1736 case TargetOpcode::G_FREM: 1737 return legalizeFrem(MI, MRI, B); 1738 case TargetOpcode::G_INTRINSIC_TRUNC: 1739 return legalizeIntrinsicTrunc(MI, MRI, B); 1740 case TargetOpcode::G_SITOFP: 1741 return legalizeITOFP(MI, MRI, B, true); 1742 case TargetOpcode::G_UITOFP: 1743 return legalizeITOFP(MI, MRI, B, false); 1744 case TargetOpcode::G_FPTOSI: 1745 return legalizeFPTOI(MI, MRI, B, true); 1746 case TargetOpcode::G_FPTOUI: 1747 return legalizeFPTOI(MI, MRI, B, false); 1748 case TargetOpcode::G_FMINNUM: 1749 case TargetOpcode::G_FMAXNUM: 1750 case TargetOpcode::G_FMINNUM_IEEE: 1751 case TargetOpcode::G_FMAXNUM_IEEE: 1752 return legalizeMinNumMaxNum(Helper, MI); 1753 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1754 return legalizeExtractVectorElt(MI, MRI, B); 1755 case TargetOpcode::G_INSERT_VECTOR_ELT: 1756 return legalizeInsertVectorElt(MI, MRI, B); 1757 case TargetOpcode::G_SHUFFLE_VECTOR: 1758 return legalizeShuffleVector(MI, MRI, B); 1759 case TargetOpcode::G_FSIN: 1760 case TargetOpcode::G_FCOS: 1761 return legalizeSinCos(MI, MRI, B); 1762 case TargetOpcode::G_GLOBAL_VALUE: 1763 return legalizeGlobalValue(MI, MRI, B); 1764 case TargetOpcode::G_LOAD: 1765 case TargetOpcode::G_SEXTLOAD: 1766 case TargetOpcode::G_ZEXTLOAD: 1767 return legalizeLoad(Helper, MI); 1768 case TargetOpcode::G_FMAD: 1769 return legalizeFMad(MI, MRI, B); 1770 case TargetOpcode::G_FDIV: 1771 return legalizeFDIV(MI, 
MRI, B); 1772 case TargetOpcode::G_UDIV: 1773 case TargetOpcode::G_UREM: 1774 case TargetOpcode::G_UDIVREM: 1775 return legalizeUnsignedDIV_REM(MI, MRI, B); 1776 case TargetOpcode::G_SDIV: 1777 case TargetOpcode::G_SREM: 1778 case TargetOpcode::G_SDIVREM: 1779 return legalizeSignedDIV_REM(MI, MRI, B); 1780 case TargetOpcode::G_ATOMIC_CMPXCHG: 1781 return legalizeAtomicCmpXChg(MI, MRI, B); 1782 case TargetOpcode::G_FLOG: 1783 return legalizeFlog(MI, B, numbers::ln2f); 1784 case TargetOpcode::G_FLOG10: 1785 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1786 case TargetOpcode::G_FEXP: 1787 return legalizeFExp(MI, B); 1788 case TargetOpcode::G_FPOW: 1789 return legalizeFPow(MI, B); 1790 case TargetOpcode::G_FFLOOR: 1791 return legalizeFFloor(MI, MRI, B); 1792 case TargetOpcode::G_BUILD_VECTOR: 1793 return legalizeBuildVector(MI, MRI, B); 1794 case TargetOpcode::G_MUL: 1795 return legalizeMul(Helper, MI); 1796 case TargetOpcode::G_CTLZ: 1797 case TargetOpcode::G_CTTZ: 1798 return legalizeCTLZ_CTTZ(MI, MRI, B); 1799 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1800 return legalizeFPTruncRound(MI, B); 1801 default: 1802 return false; 1803 } 1804 1805 llvm_unreachable("expected switch to return"); 1806 } 1807 1808 Register AMDGPULegalizerInfo::getSegmentAperture( 1809 unsigned AS, 1810 MachineRegisterInfo &MRI, 1811 MachineIRBuilder &B) const { 1812 MachineFunction &MF = B.getMF(); 1813 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1814 const LLT S32 = LLT::scalar(32); 1815 1816 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1817 1818 if (ST.hasApertureRegs()) { 1819 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1820 // getreg. 1821 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1822 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1823 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1824 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1825 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1826 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1827 unsigned Encoding = 1828 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1829 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1830 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1831 1832 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1833 1834 B.buildInstr(AMDGPU::S_GETREG_B32) 1835 .addDef(GetReg) 1836 .addImm(Encoding); 1837 MRI.setType(GetReg, S32); 1838 1839 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1840 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1841 } 1842 1843 // TODO: can we be smarter about machine pointer info? 1844 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1845 Register LoadAddr = MRI.createGenericVirtualRegister( 1846 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1847 // For code object version 5, private_base and shared_base are passed through 1848 // implicit kernargs. 1849 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1850 AMDGPUTargetLowering::ImplicitParameter Param = 1851 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1852 : AMDGPUTargetLowering::PRIVATE_BASE; 1853 uint64_t Offset = 1854 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1855 1856 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1857 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1858 1859 if (!loadInputValue(KernargPtrReg, B, 1860 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1861 return Register(); 1862 1863 MachineMemOperand *MMO = MF.getMachineMemOperand( 1864 PtrInfo, 1865 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1866 MachineMemOperand::MOInvariant, 1867 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1868 1869 // Pointer address 1870 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1871 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1872 // Load address 1873 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1874 } 1875 1876 Register QueuePtr = MRI.createGenericVirtualRegister( 1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1878 1879 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1880 return Register(); 1881 1882 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1883 // private_segment_aperture_base_hi. 1884 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1885 1886 MachineMemOperand *MMO = MF.getMachineMemOperand( 1887 PtrInfo, 1888 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1889 MachineMemOperand::MOInvariant, 1890 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1891 1892 B.buildPtrAdd(LoadAddr, QueuePtr, 1893 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1894 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1895 } 1896 1897 /// Return true if the value is a known valid address, such that a null check is 1898 /// not necessary. 1899 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1900 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1901 MachineInstr *Def = MRI.getVRegDef(Val); 1902 switch (Def->getOpcode()) { 1903 case AMDGPU::G_FRAME_INDEX: 1904 case AMDGPU::G_GLOBAL_VALUE: 1905 case AMDGPU::G_BLOCK_ADDR: 1906 return true; 1907 case AMDGPU::G_CONSTANT: { 1908 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1909 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1910 } 1911 default: 1912 return false; 1913 } 1914 1915 return false; 1916 } 1917 1918 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1919 MachineInstr &MI, MachineRegisterInfo &MRI, 1920 MachineIRBuilder &B) const { 1921 MachineFunction &MF = B.getMF(); 1922 1923 const LLT S32 = LLT::scalar(32); 1924 Register Dst = MI.getOperand(0).getReg(); 1925 Register Src = MI.getOperand(1).getReg(); 1926 1927 LLT DstTy = MRI.getType(Dst); 1928 LLT SrcTy = MRI.getType(Src); 1929 unsigned DestAS = DstTy.getAddressSpace(); 1930 unsigned SrcAS = SrcTy.getAddressSpace(); 1931 1932 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1933 // vector element. 1934 assert(!DstTy.isVector()); 1935 1936 const AMDGPUTargetMachine &TM 1937 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1938 1939 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1940 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1941 return true; 1942 } 1943 1944 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1945 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1946 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1947 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1948 // Extract low 32-bits of the pointer. 
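      // (A flat pointer into LDS or scratch carries the segment aperture base
      // in its high 32 bits and the in-segment offset in its low 32 bits, so a
      // known-nonnull flat -> local/private cast only needs the low half.)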
1949 B.buildExtract(Dst, Src, 0); 1950 MI.eraseFromParent(); 1951 return true; 1952 } 1953 1954 unsigned NullVal = TM.getNullPointerValue(DestAS); 1955 1956 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1957 auto FlatNull = B.buildConstant(SrcTy, 0); 1958 1959 // Extract low 32-bits of the pointer. 1960 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1961 1962 auto CmpRes = 1963 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1964 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1965 1966 MI.eraseFromParent(); 1967 return true; 1968 } 1969 1970 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1971 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1972 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1973 if (!ST.hasFlatAddressSpace()) 1974 return false; 1975 1976 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1977 if (!ApertureReg.isValid()) 1978 return false; 1979 1980 // Coerce the type of the low half of the result so we can use merge_values. 1981 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1982 1983 // TODO: Should we allow mismatched types but matching sizes in merges to 1984 // avoid the ptrtoint? 1985 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1986 1987 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1988 B.buildCopy(Dst, BuildPtr); 1989 MI.eraseFromParent(); 1990 return true; 1991 } 1992 1993 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1994 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1995 1996 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1997 SegmentNull.getReg(0)); 1998 1999 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2000 2001 MI.eraseFromParent(); 2002 return true; 2003 } 2004 2005 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2006 SrcTy.getSizeInBits() == 64) { 2007 // Truncate. 2008 B.buildExtract(Dst, Src, 0); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2014 DstTy.getSizeInBits() == 64) { 2015 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2016 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2017 2018 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 2019 // another. Merge operands are required to be the same type, but creating an 2020 // extra ptrtoint would be kind of pointless. 2021 auto HighAddr = B.buildConstant( 2022 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 2023 B.buildMerge(Dst, {Src, HighAddr}); 2024 MI.eraseFromParent(); 2025 return true; 2026 } 2027 2028 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2029 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2030 2031 LLVMContext &Ctx = MF.getFunction().getContext(); 2032 Ctx.diagnose(InvalidAddrSpaceCast); 2033 B.buildUndef(Dst); 2034 MI.eraseFromParent(); 2035 return true; 2036 } 2037 2038 bool AMDGPULegalizerInfo::legalizeFrint( 2039 MachineInstr &MI, MachineRegisterInfo &MRI, 2040 MachineIRBuilder &B) const { 2041 Register Src = MI.getOperand(1).getReg(); 2042 LLT Ty = MRI.getType(Src); 2043 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2044 2045 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2046 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2047 2048 auto C1 = B.buildFConstant(Ty, C1Val); 2049 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2050 2051 // TODO: Should this propagate fast-math-flags? 
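  // Worked example of the add/sub trick below (round-to-nearest-even assumed):
  // frint(2.5): 2.5 + 0x1.0p+52 = 4503599627370498.5, which rounds to the even
  // neighbor 4503599627370498.0; subtracting 0x1.0p+52 leaves 2.0. Inputs with
  // |x| > 0x1.fffffffffffffp+51 are already integral and are returned
  // unchanged by the final select.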
2052 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2053 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2054 2055 auto C2 = B.buildFConstant(Ty, C2Val); 2056 auto Fabs = B.buildFAbs(Ty, Src); 2057 2058 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2059 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2060 MI.eraseFromParent(); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeFceil( 2065 MachineInstr &MI, MachineRegisterInfo &MRI, 2066 MachineIRBuilder &B) const { 2067 2068 const LLT S1 = LLT::scalar(1); 2069 const LLT S64 = LLT::scalar(64); 2070 2071 Register Src = MI.getOperand(1).getReg(); 2072 assert(MRI.getType(Src) == S64); 2073 2074 // result = trunc(src) 2075 // if (src > 0.0 && src != result) 2076 // result += 1.0 2077 2078 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2079 2080 const auto Zero = B.buildFConstant(S64, 0.0); 2081 const auto One = B.buildFConstant(S64, 1.0); 2082 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2083 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2084 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2085 auto Add = B.buildSelect(S64, And, One, Zero); 2086 2087 // TODO: Should this propagate fast-math-flags? 2088 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2089 MI.eraseFromParent(); 2090 return true; 2091 } 2092 2093 bool AMDGPULegalizerInfo::legalizeFrem( 2094 MachineInstr &MI, MachineRegisterInfo &MRI, 2095 MachineIRBuilder &B) const { 2096 Register DstReg = MI.getOperand(0).getReg(); 2097 Register Src0Reg = MI.getOperand(1).getReg(); 2098 Register Src1Reg = MI.getOperand(2).getReg(); 2099 auto Flags = MI.getFlags(); 2100 LLT Ty = MRI.getType(DstReg); 2101 2102 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2103 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2104 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2105 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 static MachineInstrBuilder extractF64Exponent(Register Hi, 2111 MachineIRBuilder &B) { 2112 const unsigned FractBits = 52; 2113 const unsigned ExpBits = 11; 2114 LLT S32 = LLT::scalar(32); 2115 2116 auto Const0 = B.buildConstant(S32, FractBits - 32); 2117 auto Const1 = B.buildConstant(S32, ExpBits); 2118 2119 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2120 .addUse(Hi) 2121 .addUse(Const0.getReg(0)) 2122 .addUse(Const1.getReg(0)); 2123 2124 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2125 } 2126 2127 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2128 MachineInstr &MI, MachineRegisterInfo &MRI, 2129 MachineIRBuilder &B) const { 2130 const LLT S1 = LLT::scalar(1); 2131 const LLT S32 = LLT::scalar(32); 2132 const LLT S64 = LLT::scalar(64); 2133 2134 Register Src = MI.getOperand(1).getReg(); 2135 assert(MRI.getType(Src) == S64); 2136 2137 // TODO: Should this use extract since the low half is unused? 2138 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2139 Register Hi = Unmerge.getReg(1); 2140 2141 // Extract the upper half, since this is where we will find the sign and 2142 // exponent. 2143 auto Exp = extractF64Exponent(Hi, B); 2144 2145 const unsigned FractBits = 52; 2146 2147 // Extract the sign bit. 2148 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2149 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2150 2151 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2152 2153 const auto Zero32 = B.buildConstant(S32, 0); 2154 2155 // Extend back to 64-bits. 
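  // (buildMerge takes parts from low to high, so the 32-bit sign mask lands in
  // the high word and SignBit64 is the bit pattern of +0.0 or -0.0.)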
2156 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2157 2158 auto Shr = B.buildAShr(S64, FractMask, Exp); 2159 auto Not = B.buildNot(S64, Shr); 2160 auto Tmp0 = B.buildAnd(S64, Src, Not); 2161 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2162 2163 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2164 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2165 2166 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2167 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 bool AMDGPULegalizerInfo::legalizeITOFP( 2173 MachineInstr &MI, MachineRegisterInfo &MRI, 2174 MachineIRBuilder &B, bool Signed) const { 2175 2176 Register Dst = MI.getOperand(0).getReg(); 2177 Register Src = MI.getOperand(1).getReg(); 2178 2179 const LLT S64 = LLT::scalar(64); 2180 const LLT S32 = LLT::scalar(32); 2181 2182 assert(MRI.getType(Src) == S64); 2183 2184 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2185 auto ThirtyTwo = B.buildConstant(S32, 32); 2186 2187 if (MRI.getType(Dst) == S64) { 2188 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2189 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2190 2191 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2192 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2193 .addUse(CvtHi.getReg(0)) 2194 .addUse(ThirtyTwo.getReg(0)); 2195 2196 // TODO: Should this propagate fast-math-flags? 2197 B.buildFAdd(Dst, LdExp, CvtLo); 2198 MI.eraseFromParent(); 2199 return true; 2200 } 2201 2202 assert(MRI.getType(Dst) == S32); 2203 2204 auto One = B.buildConstant(S32, 1); 2205 2206 MachineInstrBuilder ShAmt; 2207 if (Signed) { 2208 auto ThirtyOne = B.buildConstant(S32, 31); 2209 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2210 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2211 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2212 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2213 /*HasSideEffects=*/false) 2214 .addUse(Unmerge.getReg(1)); 2215 auto LS2 = B.buildSub(S32, LS, One); 2216 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2217 } else 2218 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2219 auto Norm = B.buildShl(S64, Src, ShAmt); 2220 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2221 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2222 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2223 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2224 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2225 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2226 /*HasSideEffects=*/false) 2227 .addUse(FVal.getReg(0)) 2228 .addUse(Scale.getReg(0)); 2229 MI.eraseFromParent(); 2230 return true; 2231 } 2232 2233 // TODO: Copied from DAG implementation. Verify logic and document how this 2234 // actually works. 
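// Worked example of the conversion below, with a value chosen purely for
// illustration: for val = 2^40 + 7, trunc(val) = 2^40 + 7,
// floor(trunc * 2^-32) = 256 becomes the high word, and
// trunc - 256 * 2^32 = 7 becomes the low word; merging {lo, hi} reproduces
// 2^40 + 7 as a 64-bit integer.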
2235 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2236 MachineRegisterInfo &MRI, 2237 MachineIRBuilder &B, 2238 bool Signed) const { 2239 2240 Register Dst = MI.getOperand(0).getReg(); 2241 Register Src = MI.getOperand(1).getReg(); 2242 2243 const LLT S64 = LLT::scalar(64); 2244 const LLT S32 = LLT::scalar(32); 2245 2246 const LLT SrcLT = MRI.getType(Src); 2247 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2248 2249 unsigned Flags = MI.getFlags(); 2250 2251 // The basic idea of converting a floating point number into a pair of 32-bit 2252 // integers is illustrated as follows: 2253 // 2254 // tf := trunc(val); 2255 // hif := floor(tf * 2^-32); 2256 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2257 // hi := fptoi(hif); 2258 // lo := fptoi(lof); 2259 // 2260 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2261 MachineInstrBuilder Sign; 2262 if (Signed && SrcLT == S32) { 2263 // However, a 32-bit floating point number has only 23 bits mantissa and 2264 // it's not enough to hold all the significant bits of `lof` if val is 2265 // negative. To avoid the loss of precision, We need to take the absolute 2266 // value after truncating and flip the result back based on the original 2267 // signedness. 2268 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2269 Trunc = B.buildFAbs(S32, Trunc, Flags); 2270 } 2271 MachineInstrBuilder K0, K1; 2272 if (SrcLT == S64) { 2273 K0 = B.buildFConstant(S64, 2274 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2275 K1 = B.buildFConstant(S64, 2276 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2277 } else { 2278 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2279 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2280 } 2281 2282 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2283 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2284 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2285 2286 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2287 : B.buildFPTOUI(S32, FloorMul); 2288 auto Lo = B.buildFPTOUI(S32, Fma); 2289 2290 if (Signed && SrcLT == S32) { 2291 // Flip the result based on the signedness, which is either all 0s or 1s. 2292 Sign = B.buildMerge(S64, {Sign, Sign}); 2293 // r := xor({lo, hi}, sign) - sign; 2294 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2295 } else 2296 B.buildMerge(Dst, {Lo, Hi}); 2297 MI.eraseFromParent(); 2298 2299 return true; 2300 } 2301 2302 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2303 MachineInstr &MI) const { 2304 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2305 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2306 2307 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2308 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2309 2310 // With ieee_mode disabled, the instructions have the correct behavior 2311 // already for G_FMINNUM/G_FMAXNUM 2312 if (!MFI->getMode().IEEE) 2313 return !IsIEEEOp; 2314 2315 if (IsIEEEOp) 2316 return true; 2317 2318 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2319 } 2320 2321 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2322 MachineInstr &MI, MachineRegisterInfo &MRI, 2323 MachineIRBuilder &B) const { 2324 // TODO: Should move some of this into LegalizerHelper. 
2325 2326 // TODO: Promote dynamic indexing of s16 to s32 2327 2328 // FIXME: Artifact combiner probably should have replaced the truncated 2329 // constant before this, so we shouldn't need 2330 // getIConstantVRegValWithLookThrough. 2331 Optional<ValueAndVReg> MaybeIdxVal = 2332 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2333 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2334 return true; 2335 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2336 2337 Register Dst = MI.getOperand(0).getReg(); 2338 Register Vec = MI.getOperand(1).getReg(); 2339 2340 LLT VecTy = MRI.getType(Vec); 2341 LLT EltTy = VecTy.getElementType(); 2342 assert(EltTy == MRI.getType(Dst)); 2343 2344 if (IdxVal < VecTy.getNumElements()) { 2345 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2346 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2347 } else { 2348 B.buildUndef(Dst); 2349 } 2350 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2356 MachineInstr &MI, MachineRegisterInfo &MRI, 2357 MachineIRBuilder &B) const { 2358 // TODO: Should move some of this into LegalizerHelper. 2359 2360 // TODO: Promote dynamic indexing of s16 to s32 2361 2362 // FIXME: Artifact combiner probably should have replaced the truncated 2363 // constant before this, so we shouldn't need 2364 // getIConstantVRegValWithLookThrough. 2365 Optional<ValueAndVReg> MaybeIdxVal = 2366 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2367 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2368 return true; 2369 2370 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2371 Register Dst = MI.getOperand(0).getReg(); 2372 Register Vec = MI.getOperand(1).getReg(); 2373 Register Ins = MI.getOperand(2).getReg(); 2374 2375 LLT VecTy = MRI.getType(Vec); 2376 LLT EltTy = VecTy.getElementType(); 2377 assert(EltTy == MRI.getType(Ins)); 2378 (void)Ins; 2379 2380 unsigned NumElts = VecTy.getNumElements(); 2381 if (IdxVal < NumElts) { 2382 SmallVector<Register, 8> SrcRegs; 2383 for (unsigned i = 0; i < NumElts; ++i) 2384 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2385 B.buildUnmerge(SrcRegs, Vec); 2386 2387 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2388 B.buildMerge(Dst, SrcRegs); 2389 } else { 2390 B.buildUndef(Dst); 2391 } 2392 2393 MI.eraseFromParent(); 2394 return true; 2395 } 2396 2397 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2398 MachineInstr &MI, MachineRegisterInfo &MRI, 2399 MachineIRBuilder &B) const { 2400 const LLT V2S16 = LLT::fixed_vector(2, 16); 2401 2402 Register Dst = MI.getOperand(0).getReg(); 2403 Register Src0 = MI.getOperand(1).getReg(); 2404 LLT DstTy = MRI.getType(Dst); 2405 LLT SrcTy = MRI.getType(Src0); 2406 2407 if (SrcTy == V2S16 && DstTy == V2S16 && 2408 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2409 return true; 2410 2411 MachineIRBuilder HelperBuilder(MI); 2412 GISelObserverWrapper DummyObserver; 2413 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2414 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2415 } 2416 2417 bool AMDGPULegalizerInfo::legalizeSinCos( 2418 MachineInstr &MI, MachineRegisterInfo &MRI, 2419 MachineIRBuilder &B) const { 2420 2421 Register DstReg = MI.getOperand(0).getReg(); 2422 Register SrcReg = MI.getOperand(1).getReg(); 2423 LLT Ty = MRI.getType(DstReg); 2424 unsigned Flags = MI.getFlags(); 2425 2426 Register TrigVal; 2427 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2428 if (ST.hasTrigReducedRange()) { 2429 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2430 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2431 .addUse(MulVal.getReg(0)) 2432 .setMIFlags(Flags).getReg(0); 2433 } else 2434 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2435 2436 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2437 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2438 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2439 .addUse(TrigVal) 2440 .setMIFlags(Flags); 2441 MI.eraseFromParent(); 2442 return true; 2443 } 2444 2445 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2446 MachineIRBuilder &B, 2447 const GlobalValue *GV, 2448 int64_t Offset, 2449 unsigned GAFlags) const { 2450 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2451 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2452 // to the following code sequence: 2453 // 2454 // For constant address space: 2455 // s_getpc_b64 s[0:1] 2456 // s_add_u32 s0, s0, $symbol 2457 // s_addc_u32 s1, s1, 0 2458 // 2459 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2460 // a fixup or relocation is emitted to replace $symbol with a literal 2461 // constant, which is a pc-relative offset from the encoding of the $symbol 2462 // operand to the global variable. 2463 // 2464 // For global address space: 2465 // s_getpc_b64 s[0:1] 2466 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2467 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2468 // 2469 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2470 // fixups or relocations are emitted to replace $symbol@*@lo and 2471 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2472 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2473 // operand to the global variable. 2474 // 2475 // What we want here is an offset from the value returned by s_getpc 2476 // (which is the address of the s_add_u32 instruction) to the global 2477 // variable, but since the encoding of $symbol starts 4 bytes after the start 2478 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2479 // small. This requires us to add 4 to the global variable offset in order to 2480 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2481 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2482 // instruction. 2483 2484 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2485 2486 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
    DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be reported as an error during assembly emission
    // anyway.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated by
      // the runtime and placed directly after the statically allocated ones.
      // They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
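        // (amdgcn.groupstaticsize is the total statically allocated LDS size,
        // so using it as the pointer value places every dynamic array at the
        // first byte after the static allocations, as described above.)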
2554 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2555 LLT S32 = LLT::scalar(32); 2556 auto Sz = 2557 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2558 B.buildIntToPtr(DstReg, Sz); 2559 MI.eraseFromParent(); 2560 return true; 2561 } 2562 } 2563 2564 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2565 *cast<GlobalVariable>(GV))); 2566 MI.eraseFromParent(); 2567 return true; 2568 } 2569 2570 const SITargetLowering *TLI = ST.getTargetLowering(); 2571 2572 if (TLI->shouldEmitFixup(GV)) { 2573 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 if (TLI->shouldEmitPCReloc(GV)) { 2579 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2580 MI.eraseFromParent(); 2581 return true; 2582 } 2583 2584 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2585 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2586 2587 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2588 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2589 MachinePointerInfo::getGOT(MF), 2590 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2591 MachineMemOperand::MOInvariant, 2592 LoadTy, Align(8)); 2593 2594 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2595 2596 if (Ty.getSizeInBits() == 32) { 2597 // Truncate if this is a 32-bit constant address. 2598 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2599 B.buildExtract(DstReg, Load, 0); 2600 } else 2601 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2602 2603 MI.eraseFromParent(); 2604 return true; 2605 } 2606 2607 static LLT widenToNextPowerOf2(LLT Ty) { 2608 if (Ty.isVector()) 2609 return Ty.changeElementCount( 2610 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2611 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2612 } 2613 2614 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2615 MachineInstr &MI) const { 2616 MachineIRBuilder &B = Helper.MIRBuilder; 2617 MachineRegisterInfo &MRI = *B.getMRI(); 2618 GISelChangeObserver &Observer = Helper.Observer; 2619 2620 Register PtrReg = MI.getOperand(1).getReg(); 2621 LLT PtrTy = MRI.getType(PtrReg); 2622 unsigned AddrSpace = PtrTy.getAddressSpace(); 2623 2624 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2625 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2626 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2627 Observer.changingInstr(MI); 2628 MI.getOperand(1).setReg(Cast.getReg(0)); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 if (MI.getOpcode() != AMDGPU::G_LOAD) 2634 return false; 2635 2636 Register ValReg = MI.getOperand(0).getReg(); 2637 LLT ValTy = MRI.getType(ValReg); 2638 2639 MachineMemOperand *MMO = *MI.memoperands_begin(); 2640 const unsigned ValSize = ValTy.getSizeInBits(); 2641 const LLT MemTy = MMO->getMemoryType(); 2642 const Align MemAlign = MMO->getAlign(); 2643 const unsigned MemSize = MemTy.getSizeInBits(); 2644 const uint64_t AlignInBits = 8 * MemAlign.value(); 2645 2646 // Widen non-power-of-2 loads to the alignment if needed 2647 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2648 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2649 2650 // This was already the correct extending load result type, so just adjust 2651 // the memory type. 
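    // (For instance, if shouldWidenLoad accepts an s24 memory type feeding an
    // s32 result: PowerOf2Ceil(24) == 32 already matches the value size, so
    // only the MMO is widened to 4 bytes and the instruction is left alone.)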
2652 if (WideMemSize == ValSize) { 2653 MachineFunction &MF = B.getMF(); 2654 2655 MachineMemOperand *WideMMO = 2656 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2657 Observer.changingInstr(MI); 2658 MI.setMemRefs(MF, {WideMMO}); 2659 Observer.changedInstr(MI); 2660 return true; 2661 } 2662 2663 // Don't bother handling edge case that should probably never be produced. 2664 if (ValSize > WideMemSize) 2665 return false; 2666 2667 LLT WideTy = widenToNextPowerOf2(ValTy); 2668 2669 Register WideLoad; 2670 if (!WideTy.isVector()) { 2671 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2672 B.buildTrunc(ValReg, WideLoad).getReg(0); 2673 } else { 2674 // Extract the subvector. 2675 2676 if (isRegisterType(ValTy)) { 2677 // If this a case where G_EXTRACT is legal, use it. 2678 // (e.g. <3 x s32> -> <4 x s32>) 2679 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2680 B.buildExtract(ValReg, WideLoad, 0); 2681 } else { 2682 // For cases where the widened type isn't a nice register value, unmerge 2683 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2684 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2685 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2686 } 2687 } 2688 2689 MI.eraseFromParent(); 2690 return true; 2691 } 2692 2693 return false; 2694 } 2695 2696 bool AMDGPULegalizerInfo::legalizeFMad( 2697 MachineInstr &MI, MachineRegisterInfo &MRI, 2698 MachineIRBuilder &B) const { 2699 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2700 assert(Ty.isScalar()); 2701 2702 MachineFunction &MF = B.getMF(); 2703 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2704 2705 // TODO: Always legal with future ftz flag. 2706 // FIXME: Do we need just output? 2707 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2708 return true; 2709 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2710 return true; 2711 2712 MachineIRBuilder HelperBuilder(MI); 2713 GISelObserverWrapper DummyObserver; 2714 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2715 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2716 } 2717 2718 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2719 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2720 Register DstReg = MI.getOperand(0).getReg(); 2721 Register PtrReg = MI.getOperand(1).getReg(); 2722 Register CmpVal = MI.getOperand(2).getReg(); 2723 Register NewVal = MI.getOperand(3).getReg(); 2724 2725 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2726 "this should not have been custom lowered"); 2727 2728 LLT ValTy = MRI.getType(CmpVal); 2729 LLT VecTy = LLT::fixed_vector(2, ValTy); 2730 2731 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2732 2733 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2734 .addDef(DstReg) 2735 .addUse(PtrReg) 2736 .addUse(PackedVal) 2737 .setMemRefs(MI.memoperands()); 2738 2739 MI.eraseFromParent(); 2740 return true; 2741 } 2742 2743 bool AMDGPULegalizerInfo::legalizeFlog( 2744 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2745 Register Dst = MI.getOperand(0).getReg(); 2746 Register Src = MI.getOperand(1).getReg(); 2747 LLT Ty = B.getMRI()->getType(Dst); 2748 unsigned Flags = MI.getFlags(); 2749 2750 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2751 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2752 2753 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2754 
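  // (Relies on log_b(x) == log2(x) * (1 / log2(b)); the callers pass ln(2) for
  // G_FLOG and ln(2)/ln(10) == log10(2) for G_FLOG10 as Log2BaseInverted.)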
MI.eraseFromParent(); 2755 return true; 2756 } 2757 2758 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2759 MachineIRBuilder &B) const { 2760 Register Dst = MI.getOperand(0).getReg(); 2761 Register Src = MI.getOperand(1).getReg(); 2762 unsigned Flags = MI.getFlags(); 2763 LLT Ty = B.getMRI()->getType(Dst); 2764 2765 auto K = B.buildFConstant(Ty, numbers::log2e); 2766 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2767 B.buildFExp2(Dst, Mul, Flags); 2768 MI.eraseFromParent(); 2769 return true; 2770 } 2771 2772 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2773 MachineIRBuilder &B) const { 2774 Register Dst = MI.getOperand(0).getReg(); 2775 Register Src0 = MI.getOperand(1).getReg(); 2776 Register Src1 = MI.getOperand(2).getReg(); 2777 unsigned Flags = MI.getFlags(); 2778 LLT Ty = B.getMRI()->getType(Dst); 2779 const LLT S16 = LLT::scalar(16); 2780 const LLT S32 = LLT::scalar(32); 2781 2782 if (Ty == S32) { 2783 auto Log = B.buildFLog2(S32, Src0, Flags); 2784 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2785 .addUse(Log.getReg(0)) 2786 .addUse(Src1) 2787 .setMIFlags(Flags); 2788 B.buildFExp2(Dst, Mul, Flags); 2789 } else if (Ty == S16) { 2790 // There's no f16 fmul_legacy, so we need to convert for it. 2791 auto Log = B.buildFLog2(S16, Src0, Flags); 2792 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2793 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2794 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2795 .addUse(Ext0.getReg(0)) 2796 .addUse(Ext1.getReg(0)) 2797 .setMIFlags(Flags); 2798 2799 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2800 } else 2801 return false; 2802 2803 MI.eraseFromParent(); 2804 return true; 2805 } 2806 2807 // Find a source register, ignoring any possible source modifiers. 2808 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2809 Register ModSrc = OrigSrc; 2810 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2811 ModSrc = SrcFNeg->getOperand(1).getReg(); 2812 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2813 ModSrc = SrcFAbs->getOperand(1).getReg(); 2814 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2815 ModSrc = SrcFAbs->getOperand(1).getReg(); 2816 return ModSrc; 2817 } 2818 2819 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2820 MachineRegisterInfo &MRI, 2821 MachineIRBuilder &B) const { 2822 2823 const LLT S1 = LLT::scalar(1); 2824 const LLT S64 = LLT::scalar(64); 2825 Register Dst = MI.getOperand(0).getReg(); 2826 Register OrigSrc = MI.getOperand(1).getReg(); 2827 unsigned Flags = MI.getFlags(); 2828 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2829 "this should not have been custom lowered"); 2830 2831 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2832 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2833 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2834 // V_FRACT bug is: 2835 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2836 // 2837 // Convert floor(x) to (x - fract(x)) 2838 2839 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2840 .addUse(OrigSrc) 2841 .setMIFlags(Flags); 2842 2843 // Give source modifier matching some assistance before obscuring a foldable 2844 // pattern. 2845 2846 // TODO: We can avoid the neg on the fract? The input sign to fract 2847 // shouldn't matter? 
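  // (fneg/fabs never change whether a value is a NaN, so the NaN check below
  // can safely look through them and compare the unmodified source.)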
2848 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2849 2850 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2851 2852 Register Min = MRI.createGenericVirtualRegister(S64); 2853 2854 // We don't need to concern ourselves with the snan handling difference, so 2855 // use the one which will directly select. 2856 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2857 if (MFI->getMode().IEEE) 2858 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2859 else 2860 B.buildFMinNum(Min, Fract, Const, Flags); 2861 2862 Register CorrectedFract = Min; 2863 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2864 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2865 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2866 } 2867 2868 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2869 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2870 2871 MI.eraseFromParent(); 2872 return true; 2873 } 2874 2875 // Turn an illegal packed v2s16 build vector into bit operations. 2876 // TODO: This should probably be a bitcast action in LegalizerHelper. 2877 bool AMDGPULegalizerInfo::legalizeBuildVector( 2878 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2879 Register Dst = MI.getOperand(0).getReg(); 2880 const LLT S32 = LLT::scalar(32); 2881 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2882 2883 Register Src0 = MI.getOperand(1).getReg(); 2884 Register Src1 = MI.getOperand(2).getReg(); 2885 assert(MRI.getType(Src0) == LLT::scalar(16)); 2886 2887 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2888 B.buildBitcast(Dst, Merge); 2889 2890 MI.eraseFromParent(); 2891 return true; 2892 } 2893 2894 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 2895 // 2896 // Source and accumulation registers must all be 32-bits. 2897 // 2898 // TODO: When the multiply is uniform, we should produce a code sequence 2899 // that is better suited to instruction selection on the SALU. Instead of 2900 // the outer loop going over parts of the result, the outer loop should go 2901 // over parts of one of the factors. This should result in instruction 2902 // selection that makes full use of S_ADDC_U32 instructions. 2903 void AMDGPULegalizerInfo::buildMultiply( 2904 LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 2905 ArrayRef<Register> Src0, ArrayRef<Register> Src1, 2906 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 2907 // Use (possibly empty) vectors of S1 registers to represent the set of 2908 // carries from one pair of positions to the next. 2909 using Carry = SmallVector<Register, 2>; 2910 2911 MachineIRBuilder &B = Helper.MIRBuilder; 2912 2913 const LLT S1 = LLT::scalar(1); 2914 const LLT S32 = LLT::scalar(32); 2915 const LLT S64 = LLT::scalar(64); 2916 2917 Register Zero32; 2918 Register Zero64; 2919 2920 auto getZero32 = [&]() -> Register { 2921 if (!Zero32) 2922 Zero32 = B.buildConstant(S32, 0).getReg(0); 2923 return Zero32; 2924 }; 2925 auto getZero64 = [&]() -> Register { 2926 if (!Zero64) 2927 Zero64 = B.buildConstant(S64, 0).getReg(0); 2928 return Zero64; 2929 }; 2930 2931 // Merge the given carries into the 32-bit LocalAccum, which is modified 2932 // in-place. 2933 // 2934 // Returns the carry-out, which is a single S1 register or null. 
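  // (Roughly: one carry may be zero-extended to start a 32-bit accumulator,
  // the rest enter through the carry-in operand of G_UADDE against a zero
  // addend, and the final G_UADDE folds LocalAccum back in.)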
2935 auto mergeCarry = 2936 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 2937 if (CarryIn.empty()) 2938 return Register(); 2939 2940 bool HaveCarryOut = true; 2941 Register CarryAccum; 2942 if (CarryIn.size() == 1) { 2943 if (!LocalAccum) { 2944 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2945 return Register(); 2946 } 2947 2948 CarryAccum = getZero32(); 2949 } else { 2950 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2951 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 2952 CarryAccum = 2953 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 2954 .getReg(0); 2955 } 2956 2957 if (!LocalAccum) { 2958 LocalAccum = getZero32(); 2959 HaveCarryOut = false; 2960 } 2961 } 2962 2963 auto Add = 2964 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 2965 LocalAccum = Add.getReg(0); 2966 return HaveCarryOut ? Add.getReg(1) : Register(); 2967 }; 2968 2969 // Build a multiply-add chain to compute 2970 // 2971 // LocalAccum + (partial products at DstIndex) 2972 // + (opportunistic subset of CarryIn) 2973 // 2974 // LocalAccum is an array of one or two 32-bit registers that are updated 2975 // in-place. The incoming registers may be null. 2976 // 2977 // In some edge cases, carry-ins can be consumed "for free". In that case, 2978 // the consumed carry bits are removed from CarryIn in-place. 2979 auto buildMadChain = 2980 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 2981 -> Carry { 2982 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 2983 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 2984 2985 Register Tmp; 2986 Carry CarryOut; 2987 unsigned j0 = 0; 2988 2989 // Use plain 32-bit multiplication for the most significant part of the 2990 // result by default. 2991 if (LocalAccum.size() == 1 && 2992 (!UsePartialMad64_32 || !CarryIn.empty())) { 2993 do { 2994 unsigned j1 = DstIndex - j0; 2995 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 2996 if (!LocalAccum[0]) { 2997 LocalAccum[0] = Mul.getReg(0); 2998 } else { 2999 if (CarryIn.empty()) { 3000 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3001 } else { 3002 LocalAccum[0] = 3003 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3004 .getReg(0); 3005 CarryIn.pop_back(); 3006 } 3007 } 3008 ++j0; 3009 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3010 } 3011 3012 // Build full 64-bit multiplies. 
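    // (G_AMDGPU_MAD_U64_U32 here computes Src0[j0] * Src1[j1] + Tmp as a full
    // 64-bit value plus a 1-bit carry out of the 64-bit add; threading Tmp
    // through the chain accumulates one column of the long multiplication.)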
3013 if (j0 <= DstIndex) { 3014 bool HaveSmallAccum = false; 3015 Register Tmp; 3016 3017 if (LocalAccum[0]) { 3018 if (LocalAccum.size() == 1) { 3019 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3020 HaveSmallAccum = true; 3021 } else if (LocalAccum[1]) { 3022 Tmp = B.buildMerge(S64, LocalAccum).getReg(0); 3023 HaveSmallAccum = false; 3024 } else { 3025 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3026 HaveSmallAccum = true; 3027 } 3028 } else { 3029 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3030 Tmp = getZero64(); 3031 HaveSmallAccum = true; 3032 } 3033 3034 do { 3035 unsigned j1 = DstIndex - j0; 3036 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3037 {Src0[j0], Src1[j1], Tmp}); 3038 Tmp = Mad.getReg(0); 3039 if (!HaveSmallAccum) 3040 CarryOut.push_back(Mad.getReg(1)); 3041 HaveSmallAccum = false; 3042 ++j0; 3043 } while (j0 <= DstIndex); 3044 3045 auto Unmerge = B.buildUnmerge(S32, Tmp); 3046 LocalAccum[0] = Unmerge.getReg(0); 3047 if (LocalAccum.size() > 1) 3048 LocalAccum[1] = Unmerge.getReg(1); 3049 } 3050 3051 return CarryOut; 3052 }; 3053 3054 // Outer multiply loop, iterating over destination parts from least 3055 // significant to most significant parts. 3056 // 3057 // The columns of the following diagram correspond to the destination parts 3058 // affected by one iteration of the outer loop (ignoring boundary 3059 // conditions). 3060 // 3061 // Dest index relative to 2 * i: 1 0 -1 3062 // ------ 3063 // Carries from previous iteration: e o 3064 // Even-aligned partial product sum: E E . 3065 // Odd-aligned partial product sum: O O 3066 // 3067 // 'o' is OddCarry, 'e' is EvenCarry. 3068 // EE and OO are computed from partial products via buildMadChain and use 3069 // accumulation where possible and appropriate. 3070 // 3071 Register SeparateOddCarry; 3072 Carry EvenCarry; 3073 Carry OddCarry; 3074 3075 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3076 Carry OddCarryIn = std::move(OddCarry); 3077 Carry EvenCarryIn = std::move(EvenCarry); 3078 OddCarry.clear(); 3079 EvenCarry.clear(); 3080 3081 // Partial products at offset 2 * i. 3082 if (2 * i < Accum.size()) { 3083 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3084 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3085 } 3086 3087 // Partial products at offset 2 * i - 1. 3088 if (i > 0) { 3089 if (!SeparateOddAlignedProducts) { 3090 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3091 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3092 } else { 3093 bool IsHighest = 2 * i >= Accum.size(); 3094 Register SeparateOddOut[2]; 3095 auto LocalAccum = makeMutableArrayRef(SeparateOddOut) 3096 .take_front(IsHighest ? 
1 : 2); 3097 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3098 3099 MachineInstr *Lo; 3100 3101 if (i == 1) { 3102 if (!IsHighest) 3103 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3104 else 3105 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3106 } else { 3107 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3108 SeparateOddCarry); 3109 } 3110 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3111 3112 if (!IsHighest) { 3113 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3114 Lo->getOperand(1).getReg()); 3115 Accum[2 * i] = Hi.getReg(0); 3116 SeparateOddCarry = Hi.getReg(1); 3117 } 3118 } 3119 } 3120 3121 // Add in the carries from the previous iteration 3122 if (i > 0) { 3123 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3124 EvenCarryIn.push_back(CarryOut); 3125 3126 if (2 * i < Accum.size()) { 3127 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3128 OddCarry.push_back(CarryOut); 3129 } 3130 } 3131 } 3132 } 3133 3134 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3135 // 3136 // TODO: If the multiply is followed by an addition, we should attempt to 3137 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3138 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3139 MachineInstr &MI) const { 3140 assert(ST.hasMad64_32()); 3141 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3142 3143 MachineIRBuilder &B = Helper.MIRBuilder; 3144 MachineRegisterInfo &MRI = *B.getMRI(); 3145 3146 Register DstReg = MI.getOperand(0).getReg(); 3147 Register Src0 = MI.getOperand(1).getReg(); 3148 Register Src1 = MI.getOperand(2).getReg(); 3149 3150 LLT Ty = MRI.getType(DstReg); 3151 assert(Ty.isScalar()); 3152 3153 unsigned Size = Ty.getSizeInBits(); 3154 unsigned NumParts = Size / 32; 3155 assert((Size % 32) == 0); 3156 assert(NumParts >= 2); 3157 3158 // Whether to use MAD_64_32 for partial products whose high half is 3159 // discarded. This avoids some ADD instructions but risks false dependency 3160 // stalls on some subtargets in some cases. 3161 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3162 3163 // Whether to compute odd-aligned partial products separately. This is 3164 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3165 // in an even-aligned VGPR. 3166 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3167 3168 LLT S32 = LLT::scalar(32); 3169 SmallVector<Register, 2> Src0Parts, Src1Parts; 3170 for (unsigned i = 0; i < NumParts; ++i) { 3171 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3172 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3173 } 3174 B.buildUnmerge(Src0Parts, Src0); 3175 B.buildUnmerge(Src1Parts, Src1); 3176 3177 SmallVector<Register, 2> AccumRegs(NumParts); 3178 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3179 SeparateOddAlignedProducts); 3180 3181 B.buildMerge(DstReg, AccumRegs); 3182 MI.eraseFromParent(); 3183 return true; 3184 3185 } 3186 3187 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3188 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3189 // case with a single min instruction instead of a compare+select. 
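// For example, for G_CTLZ of an s32 zero: G_AMDGPU_FFBH_U32 yields 0xffffffff,
// and umin(0xffffffff, 32) produces the 32 required by G_CTLZ semantics, while
// nonzero inputs give results <= 31 and pass through the min unchanged.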
3190 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3191 MachineRegisterInfo &MRI, 3192 MachineIRBuilder &B) const { 3193 Register Dst = MI.getOperand(0).getReg(); 3194 Register Src = MI.getOperand(1).getReg(); 3195 LLT DstTy = MRI.getType(Dst); 3196 LLT SrcTy = MRI.getType(Src); 3197 3198 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3199 ? AMDGPU::G_AMDGPU_FFBH_U32 3200 : AMDGPU::G_AMDGPU_FFBL_B32; 3201 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3202 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3203 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 // Check that this is a G_XOR x, -1 3209 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3210 if (MI.getOpcode() != TargetOpcode::G_XOR) 3211 return false; 3212 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3213 return ConstVal && *ConstVal == -1; 3214 } 3215 3216 // Return the use branch instruction, otherwise null if the usage is invalid. 3217 static MachineInstr * 3218 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3219 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3220 Register CondDef = MI.getOperand(0).getReg(); 3221 if (!MRI.hasOneNonDBGUse(CondDef)) 3222 return nullptr; 3223 3224 MachineBasicBlock *Parent = MI.getParent(); 3225 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3226 3227 if (isNot(MRI, *UseMI)) { 3228 Register NegatedCond = UseMI->getOperand(0).getReg(); 3229 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3230 return nullptr; 3231 3232 // We're deleting the def of this value, so we need to remove it. 3233 eraseInstr(*UseMI, MRI); 3234 3235 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3236 Negated = true; 3237 } 3238 3239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3240 return nullptr; 3241 3242 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3243 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3244 if (Next == Parent->end()) { 3245 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3247 return nullptr; 3248 UncondBrTarget = &*NextMBB; 3249 } else { 3250 if (Next->getOpcode() != AMDGPU::G_BR) 3251 return nullptr; 3252 Br = &*Next; 3253 UncondBrTarget = Br->getOperand(0).getMBB(); 3254 } 3255 3256 return UseMI; 3257 } 3258 3259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3260 const ArgDescriptor *Arg, 3261 const TargetRegisterClass *ArgRC, 3262 LLT ArgTy) const { 3263 MCRegister SrcReg = Arg->getRegister(); 3264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3265 assert(DstReg.isVirtual() && "Virtual register expected"); 3266 3267 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3268 *ArgRC, B.getDebugLoc(), ArgTy); 3269 if (Arg->isMasked()) { 3270 // TODO: Should we try to emit this once in the entry block? 3271 const LLT S32 = LLT::scalar(32); 3272 const unsigned Mask = Arg->getMask(); 3273 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 3274 3275 Register AndMaskSrc = LiveIn; 3276 3277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3278 // 0. 
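    // (When the workitem IDs are packed into a single register, the usual
    // layout is x in bits [9:0], y in [19:10] and z in [29:20], so the
    // shift-and-mask below recovers one component.)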
3279 if (Shift != 0) { 3280 auto ShiftAmt = B.buildConstant(S32, Shift); 3281 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3282 } 3283 3284 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3285 } else { 3286 B.buildCopy(DstReg, LiveIn); 3287 } 3288 3289 return true; 3290 } 3291 3292 bool AMDGPULegalizerInfo::loadInputValue( 3293 Register DstReg, MachineIRBuilder &B, 3294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3296 const ArgDescriptor *Arg; 3297 const TargetRegisterClass *ArgRC; 3298 LLT ArgTy; 3299 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3300 3301 if (!Arg) { 3302 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 3303 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 3304 // case the pointer argument may be missing and we use null. 3305 B.buildConstant(DstReg, 0); 3306 return true; 3307 } 3308 3309 // It's undefined behavior if a function marked with the amdgpu-no-* 3310 // attributes uses the corresponding intrinsic. 3311 B.buildUndef(DstReg); 3312 return true; 3313 } 3314 3315 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 3316 return false; // TODO: Handle these 3317 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 3318 } 3319 3320 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 3321 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3322 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3323 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3324 return false; 3325 3326 MI.eraseFromParent(); 3327 return true; 3328 } 3329 3330 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3331 int64_t C) { 3332 B.buildConstant(MI.getOperand(0).getReg(), C); 3333 MI.eraseFromParent(); 3334 return true; 3335 } 3336 3337 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3338 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3339 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3340 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3341 if (MaxID == 0) 3342 return replaceWithConstant(B, MI, 0); 3343 3344 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3345 const ArgDescriptor *Arg; 3346 const TargetRegisterClass *ArgRC; 3347 LLT ArgTy; 3348 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3349 3350 Register DstReg = MI.getOperand(0).getReg(); 3351 if (!Arg) { 3352 // It's undefined behavior if a function marked with the amdgpu-no-* 3353 // attributes uses the corresponding intrinsic. 3354 B.buildUndef(DstReg); 3355 MI.eraseFromParent(); 3356 return true; 3357 } 3358 3359 if (Arg->isMasked()) { 3360 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3361 // masking operations anyway. 3362 // 3363 // TODO: We could assert the top bit is 0 for the source copy. 
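    // (In the unmasked else branch below, the AssertZExt width is the number
    // of bits needed for MaxID; e.g. an assumed MaxID of 1023 gives
    // 32 - countLeadingZeros(1023) = 10, i.e. the value is known to fit in
    // 10 bits.)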
3364 if (!loadInputValue(DstReg, B, ArgType)) 3365 return false; 3366 } else { 3367 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3368 if (!loadInputValue(TmpReg, B, ArgType)) 3369 return false; 3370 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); 3371 } 3372 3373 MI.eraseFromParent(); 3374 return true; 3375 } 3376 3377 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3378 int64_t Offset) const { 3379 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3380 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3381 3382 // TODO: If we passed in the base kernel offset we could have a better 3383 // alignment than 4, but we don't really need it. 3384 if (!loadInputValue(KernArgReg, B, 3385 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3386 llvm_unreachable("failed to find kernarg segment ptr"); 3387 3388 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3389 // TODO: Should get nuw 3390 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3391 } 3392 3393 /// Legalize a value that's loaded from kernel arguments. This is only used by 3394 /// legacy intrinsics. 3395 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3396 MachineIRBuilder &B, 3397 uint64_t Offset, 3398 Align Alignment) const { 3399 Register DstReg = MI.getOperand(0).getReg(); 3400 3401 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3402 "unexpected kernarg parameter type"); 3403 3404 Register Ptr = getKernargParameterPtr(B, Offset); 3405 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3406 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3407 MachineMemOperand::MODereferenceable | 3408 MachineMemOperand::MOInvariant); 3409 MI.eraseFromParent(); 3410 return true; 3411 } 3412 3413 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3414 MachineRegisterInfo &MRI, 3415 MachineIRBuilder &B) const { 3416 Register Dst = MI.getOperand(0).getReg(); 3417 LLT DstTy = MRI.getType(Dst); 3418 LLT S16 = LLT::scalar(16); 3419 LLT S32 = LLT::scalar(32); 3420 LLT S64 = LLT::scalar(64); 3421 3422 if (DstTy == S16) 3423 return legalizeFDIV16(MI, MRI, B); 3424 if (DstTy == S32) 3425 return legalizeFDIV32(MI, MRI, B); 3426 if (DstTy == S64) 3427 return legalizeFDIV64(MI, MRI, B); 3428 3429 return false; 3430 } 3431 3432 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3433 Register DstDivReg, 3434 Register DstRemReg, 3435 Register X, 3436 Register Y) const { 3437 const LLT S1 = LLT::scalar(1); 3438 const LLT S32 = LLT::scalar(32); 3439 3440 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3441 // algorithm used here. 3442 3443 // Initial estimate of inv(y). 3444 auto FloatY = B.buildUITOFP(S32, Y); 3445 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3446 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3447 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3448 auto Z = B.buildFPTOUI(S32, ScaledY); 3449 3450 // One round of UNR. 3451 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3452 auto NegYZ = B.buildMul(S32, NegY, Z); 3453 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3454 3455 // Quotient/remainder estimate. 3456 auto Q = B.buildUMulH(S32, X, Z); 3457 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3458 3459 // First quotient/remainder refinement. 
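  // Both refinement steps below follow the usual correction for this scheme
  // (a sketch, not emitted literally):
  //   if (R >= Y) { Q = Q + 1; R = R - Y; }
  // implemented branchlessly with G_ICMP + G_SELECT, and applied twice.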
3460 auto One = B.buildConstant(S32, 1); 3461 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3462 if (DstDivReg) 3463 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3464 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3465 3466 // Second quotient/remainder refinement. 3467 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3468 if (DstDivReg) 3469 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3470 3471 if (DstRemReg) 3472 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3473 } 3474 3475 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3476 // 3477 // Return lo, hi of result 3478 // 3479 // %cvt.lo = G_UITOFP Val.lo 3480 // %cvt.hi = G_UITOFP Val.hi 3481 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3482 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3483 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3484 // %mul2 = G_FMUL %mul1, 2**(-32) 3485 // %trunc = G_INTRINSIC_TRUNC %mul2 3486 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3487 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3488 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3489 Register Val) { 3490 const LLT S32 = LLT::scalar(32); 3491 auto Unmerge = B.buildUnmerge(S32, Val); 3492 3493 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3494 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3495 3496 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3497 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3498 3499 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3500 auto Mul1 = 3501 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3502 3503 // 2**(-32) 3504 auto Mul2 = 3505 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3506 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3507 3508 // -(2**32) 3509 auto Mad2 = B.buildFMAD(S32, Trunc, 3510 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3511 3512 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3513 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3514 3515 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3516 } 3517 3518 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3519 Register DstDivReg, 3520 Register DstRemReg, 3521 Register Numer, 3522 Register Denom) const { 3523 const LLT S32 = LLT::scalar(32); 3524 const LLT S64 = LLT::scalar(64); 3525 const LLT S1 = LLT::scalar(1); 3526 Register RcpLo, RcpHi; 3527 3528 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3529 3530 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3531 3532 auto Zero64 = B.buildConstant(S64, 0); 3533 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3534 3535 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3536 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3537 3538 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3539 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3540 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3541 3542 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3543 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3544 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3545 3546 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3547 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3548 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3549 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3550 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3551 3552 auto Zero32 = B.buildConstant(S32, 0); 3553 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3554 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3555 auto 
Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3556 3557 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3558 Register NumerLo = UnmergeNumer.getReg(0); 3559 Register NumerHi = UnmergeNumer.getReg(1); 3560 3561 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3562 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3563 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3564 Register Mul3_Lo = UnmergeMul3.getReg(0); 3565 Register Mul3_Hi = UnmergeMul3.getReg(1); 3566 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3567 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3568 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3569 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3570 3571 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3572 Register DenomLo = UnmergeDenom.getReg(0); 3573 Register DenomHi = UnmergeDenom.getReg(1); 3574 3575 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3576 auto C1 = B.buildSExt(S32, CmpHi); 3577 3578 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3579 auto C2 = B.buildSExt(S32, CmpLo); 3580 3581 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3582 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3583 3584 // TODO: Here and below portions of the code can be enclosed into if/endif. 3585 // Currently control flow is unconditional and we have 4 selects after 3586 // potential endif to substitute PHIs. 3587 3588 // if C3 != 0 ... 3589 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3590 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3591 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3592 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3593 3594 auto One64 = B.buildConstant(S64, 1); 3595 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3596 3597 auto C4 = 3598 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3599 auto C5 = 3600 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3601 auto C6 = B.buildSelect( 3602 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3603 3604 // if (C6 != 0) 3605 auto Add4 = B.buildAdd(S64, Add3, One64); 3606 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3607 3608 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3609 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3610 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3611 3612 // endif C6 3613 // endif C3 3614 3615 if (DstDivReg) { 3616 auto Sel1 = B.buildSelect( 3617 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3618 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3619 Sel1, MulHi3); 3620 } 3621 3622 if (DstRemReg) { 3623 auto Sel2 = B.buildSelect( 3624 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3625 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3626 Sel2, Sub1); 3627 } 3628 } 3629 3630 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3631 MachineRegisterInfo &MRI, 3632 MachineIRBuilder &B) const { 3633 Register DstDivReg, DstRemReg; 3634 switch (MI.getOpcode()) { 3635 default: 3636 llvm_unreachable("Unexpected opcode!"); 3637 case AMDGPU::G_UDIV: { 3638 DstDivReg = MI.getOperand(0).getReg(); 3639 break; 3640 } 3641 case AMDGPU::G_UREM: { 3642 DstRemReg = MI.getOperand(0).getReg(); 3643 break; 3644 } 3645 case AMDGPU::G_UDIVREM: { 3646 DstDivReg = MI.getOperand(0).getReg(); 3647 DstRemReg = MI.getOperand(1).getReg(); 3648 
break; 3649 } 3650 } 3651 3652 const LLT S64 = LLT::scalar(64); 3653 const LLT S32 = LLT::scalar(32); 3654 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3655 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3656 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3657 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3658 3659 if (Ty == S32) 3660 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3661 else if (Ty == S64) 3662 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3663 else 3664 return false; 3665 3666 MI.eraseFromParent(); 3667 return true; 3668 } 3669 3670 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3671 MachineRegisterInfo &MRI, 3672 MachineIRBuilder &B) const { 3673 const LLT S64 = LLT::scalar(64); 3674 const LLT S32 = LLT::scalar(32); 3675 3676 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3677 if (Ty != S32 && Ty != S64) 3678 return false; 3679 3680 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3681 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3682 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3683 3684 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3685 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3686 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3687 3688 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3689 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3690 3691 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3692 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3693 3694 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3695 switch (MI.getOpcode()) { 3696 default: 3697 llvm_unreachable("Unexpected opcode!"); 3698 case AMDGPU::G_SDIV: { 3699 DstDivReg = MI.getOperand(0).getReg(); 3700 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3701 break; 3702 } 3703 case AMDGPU::G_SREM: { 3704 DstRemReg = MI.getOperand(0).getReg(); 3705 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3706 break; 3707 } 3708 case AMDGPU::G_SDIVREM: { 3709 DstDivReg = MI.getOperand(0).getReg(); 3710 DstRemReg = MI.getOperand(1).getReg(); 3711 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3712 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3713 break; 3714 } 3715 } 3716 3717 if (Ty == S32) 3718 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3719 else 3720 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3721 3722 if (DstDivReg) { 3723 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3724 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3725 B.buildSub(DstDivReg, SignXor, Sign); 3726 } 3727 3728 if (DstRemReg) { 3729 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3730 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3731 B.buildSub(DstRemReg, SignXor, Sign); 3732 } 3733 3734 MI.eraseFromParent(); 3735 return true; 3736 } 3737 3738 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3739 MachineRegisterInfo &MRI, 3740 MachineIRBuilder &B) const { 3741 Register Res = MI.getOperand(0).getReg(); 3742 Register LHS = MI.getOperand(1).getReg(); 3743 Register RHS = MI.getOperand(2).getReg(); 3744 uint16_t Flags = MI.getFlags(); 3745 LLT ResTy = MRI.getType(Res); 3746 3747 const MachineFunction &MF = B.getMF(); 3748 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3749 MI.getFlag(MachineInstr::FmAfn); 3750 3751 if (!AllowInaccurateRcp) 3752 return false; 3753 3754 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3755 // 1 / x -> RCP(x) 3756 if 
(CLHS->isExactlyValue(1.0)) { 3757 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3758 .addUse(RHS) 3759 .setMIFlags(Flags); 3760 3761 MI.eraseFromParent(); 3762 return true; 3763 } 3764 3765 // -1 / x -> RCP( FNEG(x) ) 3766 if (CLHS->isExactlyValue(-1.0)) { 3767 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3768 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3769 .addUse(FNeg.getReg(0)) 3770 .setMIFlags(Flags); 3771 3772 MI.eraseFromParent(); 3773 return true; 3774 } 3775 } 3776 3777 // x / y -> x * (1.0 / y) 3778 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3779 .addUse(RHS) 3780 .setMIFlags(Flags); 3781 B.buildFMul(Res, LHS, RCP, Flags); 3782 3783 MI.eraseFromParent(); 3784 return true; 3785 } 3786 3787 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3788 MachineRegisterInfo &MRI, 3789 MachineIRBuilder &B) const { 3790 Register Res = MI.getOperand(0).getReg(); 3791 Register X = MI.getOperand(1).getReg(); 3792 Register Y = MI.getOperand(2).getReg(); 3793 uint16_t Flags = MI.getFlags(); 3794 LLT ResTy = MRI.getType(Res); 3795 3796 const MachineFunction &MF = B.getMF(); 3797 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3798 MI.getFlag(MachineInstr::FmAfn); 3799 3800 if (!AllowInaccurateRcp) 3801 return false; 3802 3803 auto NegY = B.buildFNeg(ResTy, Y); 3804 auto One = B.buildFConstant(ResTy, 1.0); 3805 3806 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3807 .addUse(Y) 3808 .setMIFlags(Flags); 3809 3810 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3811 R = B.buildFMA(ResTy, Tmp0, R, R); 3812 3813 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3814 R = B.buildFMA(ResTy, Tmp1, R, R); 3815 3816 auto Ret = B.buildFMul(ResTy, X, R); 3817 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3818 3819 B.buildFMA(Res, Tmp2, R, Ret); 3820 MI.eraseFromParent(); 3821 return true; 3822 } 3823 3824 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3825 MachineRegisterInfo &MRI, 3826 MachineIRBuilder &B) const { 3827 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3828 return true; 3829 3830 Register Res = MI.getOperand(0).getReg(); 3831 Register LHS = MI.getOperand(1).getReg(); 3832 Register RHS = MI.getOperand(2).getReg(); 3833 3834 uint16_t Flags = MI.getFlags(); 3835 3836 LLT S16 = LLT::scalar(16); 3837 LLT S32 = LLT::scalar(32); 3838 3839 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3840 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3841 3842 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3843 .addUse(RHSExt.getReg(0)) 3844 .setMIFlags(Flags); 3845 3846 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3847 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3848 3849 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3850 .addUse(RDst.getReg(0)) 3851 .addUse(RHS) 3852 .addUse(LHS) 3853 .setMIFlags(Flags); 3854 3855 MI.eraseFromParent(); 3856 return true; 3857 } 3858 3859 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3860 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3861 static void toggleSPDenormMode(bool Enable, 3862 MachineIRBuilder &B, 3863 const GCNSubtarget &ST, 3864 AMDGPU::SIModeRegisterDefaults Mode) { 3865 // Set SP denorm mode to this value. 3866 unsigned SPDenormMode = 3867 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3868 3869 if (ST.hasDenormModeInst()) { 3870 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
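    // The immediate built below packs the FP32 denorm field in bits [1:0] and
    // the preserved FP64/FP16 field in bits [3:2] of the S_DENORM_MODE
    // operand (a sketch of the encoding as this code uses it).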
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
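  // Overall shape of the f32 path below (a summary sketch, not authoritative):
  // scale the operands with amdgcn.div.scale, refine an approximate
  // reciprocal with a chain of FMAs (with FP32 denormals temporarily enabled
  // so intermediates are not flushed), then combine the pieces with
  // amdgcn.div.fmas and amdgcn.div.fixup to form the final quotient.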
3928 if (!Mode.allFP32Denormals()) 3929 toggleSPDenormMode(true, B, ST, Mode); 3930 3931 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3932 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3933 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3934 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3935 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3936 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3937 3938 if (!Mode.allFP32Denormals()) 3939 toggleSPDenormMode(false, B, ST, Mode); 3940 3941 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3942 .addUse(Fma4.getReg(0)) 3943 .addUse(Fma1.getReg(0)) 3944 .addUse(Fma3.getReg(0)) 3945 .addUse(NumeratorScaled.getReg(1)) 3946 .setMIFlags(Flags); 3947 3948 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3949 .addUse(Fmas.getReg(0)) 3950 .addUse(RHS) 3951 .addUse(LHS) 3952 .setMIFlags(Flags); 3953 3954 MI.eraseFromParent(); 3955 return true; 3956 } 3957 3958 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3959 MachineRegisterInfo &MRI, 3960 MachineIRBuilder &B) const { 3961 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3962 return true; 3963 3964 Register Res = MI.getOperand(0).getReg(); 3965 Register LHS = MI.getOperand(1).getReg(); 3966 Register RHS = MI.getOperand(2).getReg(); 3967 3968 uint16_t Flags = MI.getFlags(); 3969 3970 LLT S64 = LLT::scalar(64); 3971 LLT S1 = LLT::scalar(1); 3972 3973 auto One = B.buildFConstant(S64, 1.0); 3974 3975 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3976 .addUse(LHS) 3977 .addUse(RHS) 3978 .addImm(0) 3979 .setMIFlags(Flags); 3980 3981 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3982 3983 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3984 .addUse(DivScale0.getReg(0)) 3985 .setMIFlags(Flags); 3986 3987 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3988 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3989 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3990 3991 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3992 .addUse(LHS) 3993 .addUse(RHS) 3994 .addImm(1) 3995 .setMIFlags(Flags); 3996 3997 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3998 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3999 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4000 4001 Register Scale; 4002 if (!ST.hasUsableDivScaleConditionOutput()) { 4003 // Workaround a hardware bug on SI where the condition output from div_scale 4004 // is not usable. 
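    // Sketch of the workaround below: compare the high halves of the inputs
    // with the high halves of the div_scale results to recover which operand
    // was scaled, and xor the two compares to form the condition that
    // div_fmas expects.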
4005 4006 LLT S32 = LLT::scalar(32); 4007 4008 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4009 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4010 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4011 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4012 4013 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4014 Scale1Unmerge.getReg(1)); 4015 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4016 Scale0Unmerge.getReg(1)); 4017 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4018 } else { 4019 Scale = DivScale1.getReg(1); 4020 } 4021 4022 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4023 .addUse(Fma4.getReg(0)) 4024 .addUse(Fma3.getReg(0)) 4025 .addUse(Mul.getReg(0)) 4026 .addUse(Scale) 4027 .setMIFlags(Flags); 4028 4029 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 4030 .addUse(Fmas.getReg(0)) 4031 .addUse(RHS) 4032 .addUse(LHS) 4033 .setMIFlags(Flags); 4034 4035 MI.eraseFromParent(); 4036 return true; 4037 } 4038 4039 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4040 MachineRegisterInfo &MRI, 4041 MachineIRBuilder &B) const { 4042 Register Res = MI.getOperand(0).getReg(); 4043 Register LHS = MI.getOperand(2).getReg(); 4044 Register RHS = MI.getOperand(3).getReg(); 4045 uint16_t Flags = MI.getFlags(); 4046 4047 LLT S32 = LLT::scalar(32); 4048 LLT S1 = LLT::scalar(1); 4049 4050 auto Abs = B.buildFAbs(S32, RHS, Flags); 4051 const APFloat C0Val(1.0f); 4052 4053 auto C0 = B.buildConstant(S32, 0x6f800000); 4054 auto C1 = B.buildConstant(S32, 0x2f800000); 4055 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 4056 4057 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4058 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4059 4060 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4061 4062 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4063 .addUse(Mul0.getReg(0)) 4064 .setMIFlags(Flags); 4065 4066 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4067 4068 B.buildFMul(Res, Sel, Mul1, Flags); 4069 4070 MI.eraseFromParent(); 4071 return true; 4072 } 4073 4074 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4075 // FIXME: Why do we handle this one but not other removed instructions? 4076 // 4077 // Reciprocal square root. The clamp prevents infinite results, clamping 4078 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4079 // +-max_float. 4080 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4081 MachineRegisterInfo &MRI, 4082 MachineIRBuilder &B) const { 4083 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4084 return true; 4085 4086 Register Dst = MI.getOperand(0).getReg(); 4087 Register Src = MI.getOperand(2).getReg(); 4088 auto Flags = MI.getFlags(); 4089 4090 LLT Ty = MRI.getType(Dst); 4091 4092 const fltSemantics *FltSemantics; 4093 if (Ty == LLT::scalar(32)) 4094 FltSemantics = &APFloat::IEEEsingle(); 4095 else if (Ty == LLT::scalar(64)) 4096 FltSemantics = &APFloat::IEEEdouble(); 4097 else 4098 return false; 4099 4100 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4101 .addUse(Src) 4102 .setMIFlags(Flags); 4103 4104 // We don't need to concern ourselves with the snan handling difference, since 4105 // the rsq quieted (or not) so use the one which will directly select. 
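  // Roughly what is built below:
  //   %rsq = amdgcn.rsq(%src)
  //   %dst = fmaxnum(fminnum(%rsq, +max_float), -max_float)
  // using the IEEE or non-IEEE min/max variants depending on the function's
  // FP mode.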
4106 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4107 const bool UseIEEE = MFI->getMode().IEEE; 4108 4109 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4110 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4111 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4112 4113 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4114 4115 if (UseIEEE) 4116 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4117 else 4118 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4119 MI.eraseFromParent(); 4120 return true; 4121 } 4122 4123 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4124 switch (IID) { 4125 case Intrinsic::amdgcn_ds_fadd: 4126 return AMDGPU::G_ATOMICRMW_FADD; 4127 case Intrinsic::amdgcn_ds_fmin: 4128 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4129 case Intrinsic::amdgcn_ds_fmax: 4130 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4131 default: 4132 llvm_unreachable("not a DS FP intrinsic"); 4133 } 4134 } 4135 4136 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4137 MachineInstr &MI, 4138 Intrinsic::ID IID) const { 4139 GISelChangeObserver &Observer = Helper.Observer; 4140 Observer.changingInstr(MI); 4141 4142 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4143 4144 // The remaining operands were used to set fields in the MemOperand on 4145 // construction. 4146 for (int I = 6; I > 3; --I) 4147 MI.removeOperand(I); 4148 4149 MI.removeOperand(1); // Remove the intrinsic ID. 4150 Observer.changedInstr(MI); 4151 return true; 4152 } 4153 4154 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4155 MachineRegisterInfo &MRI, 4156 MachineIRBuilder &B) const { 4157 uint64_t Offset = 4158 ST.getTargetLowering()->getImplicitParameterOffset( 4159 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4160 LLT DstTy = MRI.getType(DstReg); 4161 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4162 4163 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4164 if (!loadInputValue(KernargPtrReg, B, 4165 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4166 return false; 4167 4168 // FIXME: This should be nuw 4169 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4170 return true; 4171 } 4172 4173 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 4174 MachineRegisterInfo &MRI, 4175 MachineIRBuilder &B) const { 4176 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4177 if (!MFI->isEntryFunction()) { 4178 return legalizePreloadedArgIntrin(MI, MRI, B, 4179 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4180 } 4181 4182 Register DstReg = MI.getOperand(0).getReg(); 4183 if (!getImplicitArgPtr(DstReg, MRI, B)) 4184 return false; 4185 4186 MI.eraseFromParent(); 4187 return true; 4188 } 4189 4190 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 4191 MachineRegisterInfo &MRI, 4192 MachineIRBuilder &B, 4193 unsigned AddrSpace) const { 4194 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4195 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4196 Register Hi32 = Unmerge.getReg(1); 4197 4198 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 4199 MI.eraseFromParent(); 4200 return true; 4201 } 4202 4203 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 4204 // offset (the offset that is included in bounds checking and swizzling, to be 4205 // split between the instruction's voffset and immoffset fields) 
and soffset 4206 // (the offset that is excluded from bounds checking and swizzling, to go in 4207 // the instruction's soffset field). This function takes the first kind of 4208 // offset and figures out how to split it between voffset and immoffset. 4209 std::pair<Register, unsigned> 4210 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 4211 Register OrigOffset) const { 4212 const unsigned MaxImm = 4095; 4213 Register BaseReg; 4214 unsigned ImmOffset; 4215 const LLT S32 = LLT::scalar(32); 4216 MachineRegisterInfo &MRI = *B.getMRI(); 4217 4218 std::tie(BaseReg, ImmOffset) = 4219 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 4220 4221 // If BaseReg is a pointer, convert it to int. 4222 if (MRI.getType(BaseReg).isPointer()) 4223 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 4224 4225 // If the immediate value is too big for the immoffset field, put the value 4226 // and -4096 into the immoffset field so that the value that is copied/added 4227 // for the voffset field is a multiple of 4096, and it stands more chance 4228 // of being CSEd with the copy/add for another similar load/store. 4229 // However, do not do that rounding down to a multiple of 4096 if that is a 4230 // negative number, as it appears to be illegal to have a negative offset 4231 // in the vgpr, even if adding the immediate offset makes it positive. 4232 unsigned Overflow = ImmOffset & ~MaxImm; 4233 ImmOffset -= Overflow; 4234 if ((int32_t)Overflow < 0) { 4235 Overflow += ImmOffset; 4236 ImmOffset = 0; 4237 } 4238 4239 if (Overflow != 0) { 4240 if (!BaseReg) { 4241 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 4242 } else { 4243 auto OverflowVal = B.buildConstant(S32, Overflow); 4244 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 4245 } 4246 } 4247 4248 if (!BaseReg) 4249 BaseReg = B.buildConstant(S32, 0).getReg(0); 4250 4251 return std::make_pair(BaseReg, ImmOffset); 4252 } 4253 4254 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 4255 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4256 Register VOffset, Register SOffset, 4257 unsigned ImmOffset, Register VIndex, 4258 MachineRegisterInfo &MRI) const { 4259 Optional<ValueAndVReg> MaybeVOffsetVal = 4260 getIConstantVRegValWithLookThrough(VOffset, MRI); 4261 Optional<ValueAndVReg> MaybeSOffsetVal = 4262 getIConstantVRegValWithLookThrough(SOffset, MRI); 4263 Optional<ValueAndVReg> MaybeVIndexVal = 4264 getIConstantVRegValWithLookThrough(VIndex, MRI); 4265 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4266 // update the MMO with that offset. The stride is unknown so we can only do 4267 // this if VIndex is constant 0. 4268 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4269 MaybeVIndexVal->Value == 0) { 4270 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4271 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4272 MMO->setOffset(TotalOffset); 4273 } else { 4274 // We don't have a constant combined offset to use in the MMO. Give up. 4275 MMO->setValue((Value *)nullptr); 4276 } 4277 } 4278 4279 /// Handle register layout difference for f16 images for some subtargets. 
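///
/// A rough illustration of the unpacked case handled below: with unpacked D16
/// VMEM, a <4 x s16> store value is widened so each element occupies the low
/// half of its own 32-bit register, i.e. it becomes a <4 x s32> build_vector
/// of anyext'd elements.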
4280 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 4281 MachineRegisterInfo &MRI, 4282 Register Reg, 4283 bool ImageStore) const { 4284 const LLT S16 = LLT::scalar(16); 4285 const LLT S32 = LLT::scalar(32); 4286 LLT StoreVT = MRI.getType(Reg); 4287 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 4288 4289 if (ST.hasUnpackedD16VMem()) { 4290 auto Unmerge = B.buildUnmerge(S16, Reg); 4291 4292 SmallVector<Register, 4> WideRegs; 4293 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4294 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 4295 4296 int NumElts = StoreVT.getNumElements(); 4297 4298 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4299 .getReg(0); 4300 } 4301 4302 if (ImageStore && ST.hasImageStoreD16Bug()) { 4303 if (StoreVT.getNumElements() == 2) { 4304 SmallVector<Register, 4> PackedRegs; 4305 Reg = B.buildBitcast(S32, Reg).getReg(0); 4306 PackedRegs.push_back(Reg); 4307 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4308 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4309 .getReg(0); 4310 } 4311 4312 if (StoreVT.getNumElements() == 3) { 4313 SmallVector<Register, 4> PackedRegs; 4314 auto Unmerge = B.buildUnmerge(S16, Reg); 4315 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4316 PackedRegs.push_back(Unmerge.getReg(I)); 4317 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4318 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4319 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4320 } 4321 4322 if (StoreVT.getNumElements() == 4) { 4323 SmallVector<Register, 4> PackedRegs; 4324 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4325 auto Unmerge = B.buildUnmerge(S32, Reg); 4326 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4327 PackedRegs.push_back(Unmerge.getReg(I)); 4328 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4329 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4330 .getReg(0); 4331 } 4332 4333 llvm_unreachable("invalid data type"); 4334 } 4335 4336 if (StoreVT == LLT::fixed_vector(3, S16)) { 4337 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4338 .getReg(0); 4339 } 4340 return Reg; 4341 } 4342 4343 Register AMDGPULegalizerInfo::fixStoreSourceType( 4344 MachineIRBuilder &B, Register VData, bool IsFormat) const { 4345 MachineRegisterInfo *MRI = B.getMRI(); 4346 LLT Ty = MRI->getType(VData); 4347 4348 const LLT S16 = LLT::scalar(16); 4349 4350 // Fixup illegal register types for i8 stores. 
4351 if (Ty == LLT::scalar(8) || Ty == S16) { 4352 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4353 return AnyExt; 4354 } 4355 4356 if (Ty.isVector()) { 4357 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4358 if (IsFormat) 4359 return handleD16VData(B, *MRI, VData); 4360 } 4361 } 4362 4363 return VData; 4364 } 4365 4366 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4367 MachineRegisterInfo &MRI, 4368 MachineIRBuilder &B, 4369 bool IsTyped, 4370 bool IsFormat) const { 4371 Register VData = MI.getOperand(1).getReg(); 4372 LLT Ty = MRI.getType(VData); 4373 LLT EltTy = Ty.getScalarType(); 4374 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4375 const LLT S32 = LLT::scalar(32); 4376 4377 VData = fixStoreSourceType(B, VData, IsFormat); 4378 Register RSrc = MI.getOperand(2).getReg(); 4379 4380 MachineMemOperand *MMO = *MI.memoperands_begin(); 4381 const int MemSize = MMO->getSize(); 4382 4383 unsigned ImmOffset; 4384 4385 // The typed intrinsics add an immediate after the registers. 4386 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4387 4388 // The struct intrinsic variants add one additional operand over raw. 4389 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4390 Register VIndex; 4391 int OpOffset = 0; 4392 if (HasVIndex) { 4393 VIndex = MI.getOperand(3).getReg(); 4394 OpOffset = 1; 4395 } else { 4396 VIndex = B.buildConstant(S32, 0).getReg(0); 4397 } 4398 4399 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4400 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4401 4402 unsigned Format = 0; 4403 if (IsTyped) { 4404 Format = MI.getOperand(5 + OpOffset).getImm(); 4405 ++OpOffset; 4406 } 4407 4408 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4409 4410 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4411 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4412 4413 unsigned Opc; 4414 if (IsTyped) { 4415 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4416 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4417 } else if (IsFormat) { 4418 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4419 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4420 } else { 4421 switch (MemSize) { 4422 case 1: 4423 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4424 break; 4425 case 2: 4426 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4427 break; 4428 default: 4429 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4430 break; 4431 } 4432 } 4433 4434 auto MIB = B.buildInstr(Opc) 4435 .addUse(VData) // vdata 4436 .addUse(RSrc) // rsrc 4437 .addUse(VIndex) // vindex 4438 .addUse(VOffset) // voffset 4439 .addUse(SOffset) // soffset 4440 .addImm(ImmOffset); // offset(imm) 4441 4442 if (IsTyped) 4443 MIB.addImm(Format); 4444 4445 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4446 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4447 .addMemOperand(MMO); 4448 4449 MI.eraseFromParent(); 4450 return true; 4451 } 4452 4453 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4454 MachineRegisterInfo &MRI, 4455 MachineIRBuilder &B, 4456 bool IsFormat, 4457 bool IsTyped) const { 4458 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4459 MachineMemOperand *MMO = *MI.memoperands_begin(); 4460 const LLT MemTy = MMO->getMemoryType(); 4461 const LLT S32 = LLT::scalar(32); 4462 4463 Register Dst = MI.getOperand(0).getReg(); 4464 Register RSrc = MI.getOperand(2).getReg(); 4465 4466 // The typed intrinsics add an immediate after the registers. 
4467 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4468 4469 // The struct intrinsic variants add one additional operand over raw. 4470 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4471 Register VIndex; 4472 int OpOffset = 0; 4473 if (HasVIndex) { 4474 VIndex = MI.getOperand(3).getReg(); 4475 OpOffset = 1; 4476 } else { 4477 VIndex = B.buildConstant(S32, 0).getReg(0); 4478 } 4479 4480 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4481 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4482 4483 unsigned Format = 0; 4484 if (IsTyped) { 4485 Format = MI.getOperand(5 + OpOffset).getImm(); 4486 ++OpOffset; 4487 } 4488 4489 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4490 unsigned ImmOffset; 4491 4492 LLT Ty = MRI.getType(Dst); 4493 LLT EltTy = Ty.getScalarType(); 4494 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4495 const bool Unpacked = ST.hasUnpackedD16VMem(); 4496 4497 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4498 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4499 4500 unsigned Opc; 4501 4502 if (IsTyped) { 4503 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4504 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4505 } else if (IsFormat) { 4506 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4507 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4508 } else { 4509 switch (MemTy.getSizeInBits()) { 4510 case 8: 4511 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4512 break; 4513 case 16: 4514 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4515 break; 4516 default: 4517 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4518 break; 4519 } 4520 } 4521 4522 Register LoadDstReg; 4523 4524 bool IsExtLoad = 4525 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4526 LLT UnpackedTy = Ty.changeElementSize(32); 4527 4528 if (IsExtLoad) 4529 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4530 else if (Unpacked && IsD16 && Ty.isVector()) 4531 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4532 else 4533 LoadDstReg = Dst; 4534 4535 auto MIB = B.buildInstr(Opc) 4536 .addDef(LoadDstReg) // vdata 4537 .addUse(RSrc) // rsrc 4538 .addUse(VIndex) // vindex 4539 .addUse(VOffset) // voffset 4540 .addUse(SOffset) // soffset 4541 .addImm(ImmOffset); // offset(imm) 4542 4543 if (IsTyped) 4544 MIB.addImm(Format); 4545 4546 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4547 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4548 .addMemOperand(MMO); 4549 4550 if (LoadDstReg != Dst) { 4551 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4552 4553 // Widen result for extending loads was widened. 4554 if (IsExtLoad) 4555 B.buildTrunc(Dst, LoadDstReg); 4556 else { 4557 // Repack to original 16-bit vector result 4558 // FIXME: G_TRUNC should work, but legalization currently fails 4559 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4560 SmallVector<Register, 4> Repack; 4561 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4562 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4563 B.buildMerge(Dst, Repack); 4564 } 4565 } 4566 4567 MI.eraseFromParent(); 4568 return true; 4569 } 4570 4571 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4572 MachineIRBuilder &B, 4573 bool IsInc) const { 4574 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4575 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4576 B.buildInstr(Opc) 4577 .addDef(MI.getOperand(0).getReg()) 4578 .addUse(MI.getOperand(2).getReg()) 4579 .addUse(MI.getOperand(3).getReg()) 4580 .cloneMemRefs(MI); 4581 MI.eraseFromParent(); 4582 return true; 4583 } 4584 4585 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4586 switch (IntrID) { 4587 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4588 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4589 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4590 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4591 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4592 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4593 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4594 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4596 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4597 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4598 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4599 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4600 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4601 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4602 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4603 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4604 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4605 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4606 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4607 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4608 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4609 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4611 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4612 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4613 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4614 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4615 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4616 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4617 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4618 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4619 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4620 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4621 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4622 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4623 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4624 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4625 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4626 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4627 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4628 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4629 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4630 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4631 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4632 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4633 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4634 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4635 default: 4636 llvm_unreachable("unhandled atomic opcode"); 4637 } 4638 } 4639 4640 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4641 MachineIRBuilder &B, 4642 Intrinsic::ID IID) const { 4643 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4644 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4645 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4646 4647 Register Dst; 4648 4649 int OpOffset = 0; 4650 if (HasReturn) { 4651 // A few FP atomics do not support return values. 
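    // When the intrinsic has no return value the dst def is absent and every
    // source operand index shifts down by one; OpOffset below accounts for
    // that (e.g. vdata is operand 2 with a return value, operand 1 without).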
4652 Dst = MI.getOperand(0).getReg(); 4653 } else { 4654 OpOffset = -1; 4655 } 4656 4657 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4658 Register CmpVal; 4659 4660 if (IsCmpSwap) { 4661 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4662 ++OpOffset; 4663 } 4664 4665 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4666 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4667 4668 // The struct intrinsic variants add one additional operand over raw. 4669 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4670 Register VIndex; 4671 if (HasVIndex) { 4672 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4673 ++OpOffset; 4674 } else { 4675 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4676 } 4677 4678 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4679 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4680 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4681 4682 MachineMemOperand *MMO = *MI.memoperands_begin(); 4683 4684 unsigned ImmOffset; 4685 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4686 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4687 4688 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4689 4690 if (HasReturn) 4691 MIB.addDef(Dst); 4692 4693 MIB.addUse(VData); // vdata 4694 4695 if (IsCmpSwap) 4696 MIB.addReg(CmpVal); 4697 4698 MIB.addUse(RSrc) // rsrc 4699 .addUse(VIndex) // vindex 4700 .addUse(VOffset) // voffset 4701 .addUse(SOffset) // soffset 4702 .addImm(ImmOffset) // offset(imm) 4703 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4704 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4705 .addMemOperand(MMO); 4706 4707 MI.eraseFromParent(); 4708 return true; 4709 } 4710 4711 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4712 /// vector with s16 typed elements. 4713 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4714 SmallVectorImpl<Register> &PackedAddrs, 4715 unsigned ArgOffset, 4716 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4717 bool IsA16, bool IsG16) { 4718 const LLT S16 = LLT::scalar(16); 4719 const LLT V2S16 = LLT::fixed_vector(2, 16); 4720 auto EndIdx = Intr->VAddrEnd; 4721 4722 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4723 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4724 if (!SrcOp.isReg()) 4725 continue; // _L to _LZ may have eliminated this. 4726 4727 Register AddrReg = SrcOp.getReg(); 4728 4729 if ((I < Intr->GradientStart) || 4730 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4731 (I >= Intr->CoordStart && !IsA16)) { 4732 if ((I < Intr->GradientStart) && IsA16 && 4733 (B.getMRI()->getType(AddrReg) == S16)) { 4734 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4735 // Special handling of bias when A16 is on. Bias is of type half but 4736 // occupies full 32-bit. 4737 PackedAddrs.push_back( 4738 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4739 .getReg(0)); 4740 } else { 4741 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4742 "Bias needs to be converted to 16 bit in A16 mode"); 4743 // Handle any gradient or coordinate operands that should not be packed 4744 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4745 PackedAddrs.push_back(AddrReg); 4746 } 4747 } else { 4748 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4749 // derivatives dx/dh and dx/dv are packed with undef. 
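      // For example (an assumed case, for illustration): a 2D A16 sample packs
      // its coordinates pairwise, so (u, v) ends up in one V2S16 register; an
      // odd trailing value is paired with an undef half instead.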
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores of 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4833 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4834 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4835 4836 MachineRegisterInfo *MRI = B.getMRI(); 4837 const LLT S32 = LLT::scalar(32); 4838 const LLT S16 = LLT::scalar(16); 4839 const LLT V2S16 = LLT::fixed_vector(2, 16); 4840 4841 unsigned DMask = 0; 4842 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4843 LLT Ty = MRI->getType(VData); 4844 4845 // Check for 16 bit addresses and pack if true. 4846 LLT GradTy = 4847 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4848 LLT AddrTy = 4849 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4850 const bool IsG16 = GradTy == S16; 4851 const bool IsA16 = AddrTy == S16; 4852 const bool IsD16 = Ty.getScalarType() == S16; 4853 4854 int DMaskLanes = 0; 4855 if (!BaseOpcode->Atomic) { 4856 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4857 if (BaseOpcode->Gather4) { 4858 DMaskLanes = 4; 4859 } else if (DMask != 0) { 4860 DMaskLanes = countPopulation(DMask); 4861 } else if (!IsTFE && !BaseOpcode->Store) { 4862 // If dmask is 0, this is a no-op load. This can be eliminated. 4863 B.buildUndef(MI.getOperand(0)); 4864 MI.eraseFromParent(); 4865 return true; 4866 } 4867 } 4868 4869 Observer.changingInstr(MI); 4870 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4871 4872 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4873 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4874 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4875 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4876 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4877 4878 // Track that we legalized this 4879 MI.setDesc(B.getTII().get(NewOpcode)); 4880 4881 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4882 // dmask to be at least 1 otherwise the instruction will fail 4883 if (IsTFE && DMask == 0) { 4884 DMask = 0x1; 4885 DMaskLanes = 1; 4886 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4887 } 4888 4889 if (BaseOpcode->Atomic) { 4890 Register VData0 = MI.getOperand(2).getReg(); 4891 LLT Ty = MRI->getType(VData0); 4892 4893 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4894 if (Ty.isVector()) 4895 return false; 4896 4897 if (BaseOpcode->AtomicX2) { 4898 Register VData1 = MI.getOperand(3).getReg(); 4899 // The two values are packed in one register. 4900 LLT PackedTy = LLT::fixed_vector(2, Ty); 4901 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4902 MI.getOperand(2).setReg(Concat.getReg(0)); 4903 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4904 } 4905 } 4906 4907 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4908 4909 // Rewrite the addressing register layout before doing anything else. 
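  // Sketch of that rewrite: with 16-bit addresses (A16) and/or gradients
  // (G16), the s16 components are packed pairwise into <2 x s16> registers;
  // otherwise, when NSA is not used, the remaining 32-bit components are
  // merged into a single vector address register.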
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

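  // Worked example (illustrative): a d16 load returning <3 x s16> with three
  // dmask lanes on a packed-D16 subtarget gives AdjustedTy = <3 x s16>, and
  // the types selected below become RoundedTy = <4 x s16>, TFETy = <3 x s32>,
  // and RegTy = V2S16 (or S32 when TFE is enabled).
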
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

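  // The remaining paths rebuild the original result type from the unmerged
  // per-component registers, padding any dmask-eliminated elements with undef.
  // For instance (illustrative), a <4 x s32> load with dmask = 0b0011 unmerges
  // into two data registers and is rebuilt as
  //   G_BUILD_VECTOR %data0, %data1, %undef, %undef.
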
  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.removeOperand(1); // Remove intrinsic ID

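  // At this point (illustrative sketch) something like
  //   %d:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load),
  //                         %rsrc, %offset, %cachepolicy
  // has been retargeted to
  //   %d:_(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD %rsrc, %offset, %cachepolicy,
  // and the non-power-of-two result is widened to <4 x s32> further below.
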
  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (*HsaAbiVer) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrapHsaQueuePtr(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
      return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) :
          legalizeTrapHsaQueuePtr(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

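    // Rough shape of the code built below (illustrative, assuming code object
    // v5): load the 64-bit queue pointer from the implicit kernarg area at
    // Offset, copy it into SGPR0_SGPR1, then issue S_TRAP with that pair as an
    // implicit operand for the trap handler.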
    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

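// Summary of the trap lowering above (descriptive only):
//   - Trap handler disabled or non-AMDHSA ABI: S_ENDPGM 0.
//   - HSA ABI v2/v3, or v4/v5 without getDoorbellID support: the queue pointer
//     is copied into SGPR0_SGPR1 and S_TRAP is issued with the AMDHSA trap ID.
//   - HSA ABI v4/v5 with getDoorbellID support: a bare S_TRAP suffices.
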
bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, just report a
  // warning instead of emitting the debug trap.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (Is64) {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  auto packLanes = [&Ops, &S32, &B](Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
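  // With 16-bit rays (illustrative), the six direction / inverse-direction
  // halves are repacked into three dwords below:
  //   R1 = {dir.x, dir.y}, R2 = {dir.z, inv_dir.x}, R3 = {inv_dir.y, inv_dir.z}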
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND on the control-flow intrinsic results with the
  // exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

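      // Illustrative sketch: after the fixup below, the block ends with
      //   %token = SI_IF %cond, %UncondBrTarget
      //   G_BR %CondBrTarget
      // and the original G_BRCOND plus the intrinsic are erased; SI_ELSE is
      // shaped the same way.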
      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}