//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
/// s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() 
> Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
375 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 376 unsigned AlignInBits, unsigned AddrSpace, 377 unsigned Opcode) { 378 // We don't want to widen cases that are naturally legal. 379 if (isPowerOf2_32(SizeInBits)) 380 return false; 381 382 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 383 // end up widening these for a scalar load during RegBankSelect, since there 384 // aren't 96-bit scalar loads. 385 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 386 return false; 387 388 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 389 return false; 390 391 // A load is known dereferenceable up to the alignment, so it's legal to widen 392 // to it. 393 // 394 // TODO: Could check dereferenceable for less aligned cases. 395 unsigned RoundedSize = NextPowerOf2(SizeInBits); 396 if (AlignInBits < RoundedSize) 397 return false; 398 399 // Do not widen if it would introduce a slow unaligned load. 400 const SITargetLowering *TLI = ST.getTargetLowering(); 401 bool Fast = false; 402 return TLI->allowsMisalignedMemoryAccessesImpl( 403 RoundedSize, AddrSpace, Align(AlignInBits / 8), 404 MachineMemOperand::MOLoad, &Fast) && 405 Fast; 406 } 407 408 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 409 unsigned Opcode) { 410 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 411 return false; 412 413 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 414 Query.MMODescrs[0].AlignInBits, 415 Query.Types[1].getAddressSpace(), Opcode); 416 } 417 418 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 419 const GCNTargetMachine &TM) 420 : ST(ST_) { 421 using namespace TargetOpcode; 422 423 auto GetAddrSpacePtr = [&TM](unsigned AS) { 424 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 425 }; 426 427 const LLT S1 = LLT::scalar(1); 428 const LLT S8 = LLT::scalar(8); 429 const LLT S16 = LLT::scalar(16); 430 const LLT S32 = LLT::scalar(32); 431 const LLT S64 = LLT::scalar(64); 432 const LLT S128 = LLT::scalar(128); 433 const LLT S256 = LLT::scalar(256); 434 const LLT S512 = LLT::scalar(512); 435 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 436 437 const LLT V2S8 = LLT::fixed_vector(2, 8); 438 const LLT V2S16 = LLT::fixed_vector(2, 16); 439 const LLT V4S16 = LLT::fixed_vector(4, 16); 440 441 const LLT V2S32 = LLT::fixed_vector(2, 32); 442 const LLT V3S32 = LLT::fixed_vector(3, 32); 443 const LLT V4S32 = LLT::fixed_vector(4, 32); 444 const LLT V5S32 = LLT::fixed_vector(5, 32); 445 const LLT V6S32 = LLT::fixed_vector(6, 32); 446 const LLT V7S32 = LLT::fixed_vector(7, 32); 447 const LLT V8S32 = LLT::fixed_vector(8, 32); 448 const LLT V9S32 = LLT::fixed_vector(9, 32); 449 const LLT V10S32 = LLT::fixed_vector(10, 32); 450 const LLT V11S32 = LLT::fixed_vector(11, 32); 451 const LLT V12S32 = LLT::fixed_vector(12, 32); 452 const LLT V13S32 = LLT::fixed_vector(13, 32); 453 const LLT V14S32 = LLT::fixed_vector(14, 32); 454 const LLT V15S32 = LLT::fixed_vector(15, 32); 455 const LLT V16S32 = LLT::fixed_vector(16, 32); 456 const LLT V32S32 = LLT::fixed_vector(32, 32); 457 458 const LLT V2S64 = LLT::fixed_vector(2, 64); 459 const LLT V3S64 = LLT::fixed_vector(3, 64); 460 const LLT V4S64 = LLT::fixed_vector(4, 64); 461 const LLT V5S64 = LLT::fixed_vector(5, 64); 462 const LLT V6S64 = LLT::fixed_vector(6, 64); 463 const LLT V7S64 = LLT::fixed_vector(7, 64); 464 const LLT V8S64 = LLT::fixed_vector(8, 64); 465 const LLT V16S64 = LLT::fixed_vector(16, 64); 466 467 std::initializer_list<LLT> 
AllS32Vectors = 468 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 469 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 470 std::initializer_list<LLT> AllS64Vectors = 471 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 472 473 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 474 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 475 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 476 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 477 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 478 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 479 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 480 481 const LLT CodePtr = FlatPtr; 482 483 const std::initializer_list<LLT> AddrSpaces64 = { 484 GlobalPtr, ConstantPtr, FlatPtr 485 }; 486 487 const std::initializer_list<LLT> AddrSpaces32 = { 488 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 489 }; 490 491 const std::initializer_list<LLT> FPTypesBase = { 492 S32, S64 493 }; 494 495 const std::initializer_list<LLT> FPTypes16 = { 496 S32, S64, S16 497 }; 498 499 const std::initializer_list<LLT> FPTypesPK16 = { 500 S32, S64, S16, V2S16 501 }; 502 503 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 504 505 // s1 for VCC branches, s32 for SCC branches. 506 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 507 508 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 509 // elements for v3s16 510 getActionDefinitionsBuilder(G_PHI) 511 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 512 .legalFor(AllS32Vectors) 513 .legalFor(AllS64Vectors) 514 .legalFor(AddrSpaces64) 515 .legalFor(AddrSpaces32) 516 .legalIf(isPointer(0)) 517 .clampScalar(0, S16, S256) 518 .widenScalarToNextPow2(0, 32) 519 .clampMaxNumElements(0, S32, 16) 520 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 521 .scalarize(0); 522 523 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 524 // Full set of gfx9 features. 525 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 526 .legalFor({S32, S16, V2S16}) 527 .clampScalar(0, S16, S32) 528 .clampMaxNumElements(0, S16, 2) 529 .scalarize(0) 530 .widenScalarToNextPow2(0, 32); 531 532 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 533 .legalFor({S32, S16, V2S16}) // Clamp modifier 534 .minScalarOrElt(0, S16) 535 .clampMaxNumElements(0, S16, 2) 536 .scalarize(0) 537 .widenScalarToNextPow2(0, 32) 538 .lower(); 539 } else if (ST.has16BitInsts()) { 540 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 541 .legalFor({S32, S16}) 542 .clampScalar(0, S16, S32) 543 .scalarize(0) 544 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 545 546 // Technically the saturating operations require clamp bit support, but this 547 // was introduced at the same time as 16-bit operations. 548 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 549 .legalFor({S32, S16}) // Clamp modifier 550 .minScalar(0, S16) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 16) 553 .lower(); 554 555 // We're just lowering this, but it helps get a better result to try to 556 // coerce to the desired type first. 
557 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 558 .minScalar(0, S16) 559 .scalarize(0) 560 .lower(); 561 } else { 562 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 563 .legalFor({S32}) 564 .clampScalar(0, S32, S32) 565 .scalarize(0); 566 567 if (ST.hasIntClamp()) { 568 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 569 .legalFor({S32}) // Clamp modifier. 570 .scalarize(0) 571 .minScalarOrElt(0, S32) 572 .lower(); 573 } else { 574 // Clamp bit support was added in VI, along with 16-bit operations. 575 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 576 .minScalar(0, S32) 577 .scalarize(0) 578 .lower(); 579 } 580 581 // FIXME: DAG expansion gets better results. The widening uses the smaller 582 // range values and goes for the min/max lowering directly. 583 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 getActionDefinitionsBuilder( 590 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 591 .customFor({S32, S64}) 592 .clampScalar(0, S32, S64) 593 .widenScalarToNextPow2(0, 32) 594 .scalarize(0); 595 596 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 597 .legalFor({S32}) 598 .maxScalarOrElt(0, S32); 599 600 if (ST.hasVOP3PInsts()) { 601 Mulh 602 .clampMaxNumElements(0, S8, 2) 603 .lowerFor({V2S8}); 604 } 605 606 Mulh 607 .scalarize(0) 608 .lower(); 609 610 // Report legal for any types we can handle anywhere. For the cases only legal 611 // on the SALU, RegBankSelect will be able to re-legalize. 612 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 613 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 614 .clampScalar(0, S32, S64) 615 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 616 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 617 .widenScalarToNextPow2(0) 618 .scalarize(0); 619 620 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 621 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 622 .legalFor({{S32, S1}, {S32, S32}}) 623 .minScalar(0, S32) 624 // TODO: .scalarize(0) 625 .lower(); 626 627 getActionDefinitionsBuilder(G_BITCAST) 628 // Don't worry about the size constraint. 629 .legalIf(all(isRegisterType(0), isRegisterType(1))) 630 .lower(); 631 632 633 getActionDefinitionsBuilder(G_CONSTANT) 634 .legalFor({S1, S32, S64, S16, GlobalPtr, 635 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 636 .legalIf(isPointer(0)) 637 .clampScalar(0, S32, S64) 638 .widenScalarToNextPow2(0); 639 640 getActionDefinitionsBuilder(G_FCONSTANT) 641 .legalFor({S32, S64, S16}) 642 .clampScalar(0, S16, S64); 643 644 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 645 .legalIf(isRegisterType(0)) 646 // s1 and s16 are special cases because they have legal operations on 647 // them, but don't really occupy registers in the normal way. 648 .legalFor({S1, S16}) 649 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 650 .clampScalarOrElt(0, S32, MaxScalar) 651 .widenScalarToNextPow2(0, 32) 652 .clampMaxNumElements(0, S32, 16); 653 654 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 655 656 // If the amount is divergent, we have to do a wave reduction to get the 657 // maximum value, so this is expanded during RegBankSelect. 
658 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 659 .legalFor({{PrivatePtr, S32}}); 660 661 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 662 .customIf(typeIsNot(0, PrivatePtr)); 663 664 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 665 666 auto &FPOpActions = getActionDefinitionsBuilder( 667 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 668 .legalFor({S32, S64}); 669 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 670 .customFor({S32, S64}); 671 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 672 .customFor({S32, S64}); 673 674 if (ST.has16BitInsts()) { 675 if (ST.hasVOP3PInsts()) 676 FPOpActions.legalFor({S16, V2S16}); 677 else 678 FPOpActions.legalFor({S16}); 679 680 TrigActions.customFor({S16}); 681 FDIVActions.customFor({S16}); 682 } 683 684 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 685 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 686 687 if (ST.hasVOP3PInsts()) { 688 MinNumMaxNum.customFor(FPTypesPK16) 689 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 690 .clampMaxNumElements(0, S16, 2) 691 .clampScalar(0, S16, S64) 692 .scalarize(0); 693 } else if (ST.has16BitInsts()) { 694 MinNumMaxNum.customFor(FPTypes16) 695 .clampScalar(0, S16, S64) 696 .scalarize(0); 697 } else { 698 MinNumMaxNum.customFor(FPTypesBase) 699 .clampScalar(0, S32, S64) 700 .scalarize(0); 701 } 702 703 if (ST.hasVOP3PInsts()) 704 FPOpActions.clampMaxNumElements(0, S16, 2); 705 706 FPOpActions 707 .scalarize(0) 708 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 709 710 TrigActions 711 .scalarize(0) 712 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 713 714 FDIVActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 719 .legalFor(FPTypesPK16) 720 .clampMaxNumElements(0, S16, 2) 721 .scalarize(0) 722 .clampScalar(0, S16, S64); 723 724 if (ST.has16BitInsts()) { 725 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 726 .legalFor({S32, S64, S16}) 727 .scalarize(0) 728 .clampScalar(0, S16, S64); 729 } else { 730 getActionDefinitionsBuilder(G_FSQRT) 731 .legalFor({S32, S64}) 732 .scalarize(0) 733 .clampScalar(0, S32, S64); 734 735 if (ST.hasFractBug()) { 736 getActionDefinitionsBuilder(G_FFLOOR) 737 .customFor({S64}) 738 .legalFor({S32, S64}) 739 .scalarize(0) 740 .clampScalar(0, S32, S64); 741 } else { 742 getActionDefinitionsBuilder(G_FFLOOR) 743 .legalFor({S32, S64}) 744 .scalarize(0) 745 .clampScalar(0, S32, S64); 746 } 747 } 748 749 getActionDefinitionsBuilder(G_FPTRUNC) 750 .legalFor({{S32, S64}, {S16, S32}}) 751 .scalarize(0) 752 .lower(); 753 754 getActionDefinitionsBuilder(G_FPEXT) 755 .legalFor({{S64, S32}, {S32, S16}}) 756 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 757 .scalarize(0); 758 759 getActionDefinitionsBuilder(G_FSUB) 760 // Use actual fsub instruction 761 .legalFor({S32}) 762 // Must use fadd + fneg 763 .lowerFor({S64, S16, V2S16}) 764 .scalarize(0) 765 .clampScalar(0, S32, S64); 766 767 // Whether this is legal depends on the floating point mode for the function. 
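  // Concretely, the custom action (see legalizeFMad in the legalizeCustom
  // switch below) is expected to keep G_FMAD only when the relevant denormals
  // are flushed for the function, and otherwise fall back to the generic
  // fmul + fadd expansion.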
768 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 769 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 770 FMad.customFor({S32, S16}); 771 else if (ST.hasMadMacF32Insts()) 772 FMad.customFor({S32}); 773 else if (ST.hasMadF16()) 774 FMad.customFor({S16}); 775 FMad.scalarize(0) 776 .lower(); 777 778 auto &FRem = getActionDefinitionsBuilder(G_FREM); 779 if (ST.has16BitInsts()) { 780 FRem.customFor({S16, S32, S64}); 781 } else { 782 FRem.minScalar(0, S32) 783 .customFor({S32, S64}); 784 } 785 FRem.scalarize(0); 786 787 // TODO: Do we need to clamp maximum bitwidth? 788 getActionDefinitionsBuilder(G_TRUNC) 789 .legalIf(isScalar(0)) 790 .legalFor({{V2S16, V2S32}}) 791 .clampMaxNumElements(0, S16, 2) 792 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 793 // situations (like an invalid implicit use), we don't want to infinite loop 794 // in the legalizer. 795 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 796 .alwaysLegal(); 797 798 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 799 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 800 {S32, S1}, {S64, S1}, {S16, S1}}) 801 .scalarize(0) 802 .clampScalar(0, S32, S64) 803 .widenScalarToNextPow2(1, 32); 804 805 // TODO: Split s1->s64 during regbankselect for VALU. 806 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 807 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 808 .lowerFor({{S32, S64}}) 809 .lowerIf(typeIs(1, S1)) 810 .customFor({{S64, S64}}); 811 if (ST.has16BitInsts()) 812 IToFP.legalFor({{S16, S16}}); 813 IToFP.clampScalar(1, S32, S64) 814 .minScalar(0, S32) 815 .scalarize(0) 816 .widenScalarToNextPow2(1); 817 818 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 819 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 820 .customFor({{S64, S32}, {S64, S64}}) 821 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 822 if (ST.has16BitInsts()) 823 FPToI.legalFor({{S16, S16}}); 824 else 825 FPToI.minScalar(1, S32); 826 827 FPToI.minScalar(0, S32) 828 .widenScalarToNextPow2(0, 32) 829 .scalarize(0) 830 .lower(); 831 832 // Lower roundeven into G_FRINT 833 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 834 .scalarize(0) 835 .lower(); 836 837 if (ST.has16BitInsts()) { 838 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 839 .legalFor({S16, S32, S64}) 840 .clampScalar(0, S16, S64) 841 .scalarize(0); 842 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 843 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 844 .legalFor({S32, S64}) 845 .clampScalar(0, S32, S64) 846 .scalarize(0); 847 } else { 848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 849 .legalFor({S32}) 850 .customFor({S64}) 851 .clampScalar(0, S32, S64) 852 .scalarize(0); 853 } 854 855 getActionDefinitionsBuilder(G_PTR_ADD) 856 .legalIf(all(isPointer(0), sameSize(0, 1))) 857 .scalarize(0) 858 .scalarSameSizeAs(1, 0); 859 860 getActionDefinitionsBuilder(G_PTRMASK) 861 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 862 .scalarSameSizeAs(1, 0) 863 .scalarize(0); 864 865 auto &CmpBuilder = 866 getActionDefinitionsBuilder(G_ICMP) 867 // The compare output type differs based on the register bank of the output, 868 // so make both s1 and s32 legal. 869 // 870 // Scalar compares producing output in scc will be promoted to s32, as that 871 // is the allocatable register type that will be needed for the copy from 872 // scc. 
This will be promoted during RegBankSelect, and we assume something 873 // before that won't try to use s32 result types. 874 // 875 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 876 // bank. 877 .legalForCartesianProduct( 878 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 879 .legalForCartesianProduct( 880 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 881 if (ST.has16BitInsts()) { 882 CmpBuilder.legalFor({{S1, S16}}); 883 } 884 885 CmpBuilder 886 .widenScalarToNextPow2(1) 887 .clampScalar(1, S32, S64) 888 .scalarize(0) 889 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 890 891 getActionDefinitionsBuilder(G_FCMP) 892 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 893 .widenScalarToNextPow2(1) 894 .clampScalar(1, S32, S64) 895 .scalarize(0); 896 897 // FIXME: fpow has a selection pattern that should move to custom lowering. 898 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 899 if (ST.has16BitInsts()) 900 Exp2Ops.legalFor({S32, S16}); 901 else 902 Exp2Ops.legalFor({S32}); 903 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 904 Exp2Ops.scalarize(0); 905 906 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 907 if (ST.has16BitInsts()) 908 ExpOps.customFor({{S32}, {S16}}); 909 else 910 ExpOps.customFor({S32}); 911 ExpOps.clampScalar(0, MinScalarFPTy, S32) 912 .scalarize(0); 913 914 getActionDefinitionsBuilder(G_FPOWI) 915 .clampScalar(0, MinScalarFPTy, S32) 916 .lower(); 917 918 // The 64-bit versions produce 32-bit results, but only on the SALU. 919 getActionDefinitionsBuilder(G_CTPOP) 920 .legalFor({{S32, S32}, {S32, S64}}) 921 .clampScalar(0, S32, S32) 922 .clampScalar(1, S32, S64) 923 .scalarize(0) 924 .widenScalarToNextPow2(0, 32) 925 .widenScalarToNextPow2(1, 32); 926 927 // The hardware instructions return a different result on 0 than the generic 928 // instructions expect. The hardware produces -1, but these produce the 929 // bitwidth. 930 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 931 .scalarize(0) 932 .clampScalar(0, S32, S32) 933 .clampScalar(1, S32, S64) 934 .widenScalarToNextPow2(0, 32) 935 .widenScalarToNextPow2(1, 32) 936 .lower(); 937 938 // The 64-bit versions produce 32-bit results, but only on the SALU. 939 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 940 .legalFor({{S32, S32}, {S32, S64}}) 941 .clampScalar(0, S32, S32) 942 .clampScalar(1, S32, S64) 943 .scalarize(0) 944 .widenScalarToNextPow2(0, 32) 945 .widenScalarToNextPow2(1, 32); 946 947 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 948 // RegBankSelect. 949 getActionDefinitionsBuilder(G_BITREVERSE) 950 .legalFor({S32, S64}) 951 .clampScalar(0, S32, S64) 952 .scalarize(0) 953 .widenScalarToNextPow2(0); 954 955 if (ST.has16BitInsts()) { 956 getActionDefinitionsBuilder(G_BSWAP) 957 .legalFor({S16, S32, V2S16}) 958 .clampMaxNumElements(0, S16, 2) 959 // FIXME: Fixing non-power-of-2 before clamp is workaround for 960 // narrowScalar limitation. 
961 .widenScalarToNextPow2(0) 962 .clampScalar(0, S16, S32) 963 .scalarize(0); 964 965 if (ST.hasVOP3PInsts()) { 966 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 967 .legalFor({S32, S16, V2S16}) 968 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 969 .clampMaxNumElements(0, S16, 2) 970 .minScalar(0, S16) 971 .widenScalarToNextPow2(0) 972 .scalarize(0) 973 .lower(); 974 } else { 975 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 976 .legalFor({S32, S16}) 977 .widenScalarToNextPow2(0) 978 .minScalar(0, S16) 979 .scalarize(0) 980 .lower(); 981 } 982 } else { 983 // TODO: Should have same legality without v_perm_b32 984 getActionDefinitionsBuilder(G_BSWAP) 985 .legalFor({S32}) 986 .lowerIf(scalarNarrowerThan(0, 32)) 987 // FIXME: Fixing non-power-of-2 before clamp is workaround for 988 // narrowScalar limitation. 989 .widenScalarToNextPow2(0) 990 .maxScalar(0, S32) 991 .scalarize(0) 992 .lower(); 993 994 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 995 .legalFor({S32}) 996 .minScalar(0, S32) 997 .widenScalarToNextPow2(0) 998 .scalarize(0) 999 .lower(); 1000 } 1001 1002 getActionDefinitionsBuilder(G_INTTOPTR) 1003 // List the common cases 1004 .legalForCartesianProduct(AddrSpaces64, {S64}) 1005 .legalForCartesianProduct(AddrSpaces32, {S32}) 1006 .scalarize(0) 1007 // Accept any address space as long as the size matches 1008 .legalIf(sameSize(0, 1)) 1009 .widenScalarIf(smallerThan(1, 0), 1010 [](const LegalityQuery &Query) { 1011 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1012 }) 1013 .narrowScalarIf(largerThan(1, 0), 1014 [](const LegalityQuery &Query) { 1015 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1016 }); 1017 1018 getActionDefinitionsBuilder(G_PTRTOINT) 1019 // List the common cases 1020 .legalForCartesianProduct(AddrSpaces64, {S64}) 1021 .legalForCartesianProduct(AddrSpaces32, {S32}) 1022 .scalarize(0) 1023 // Accept any address space as long as the size matches 1024 .legalIf(sameSize(0, 1)) 1025 .widenScalarIf(smallerThan(0, 1), 1026 [](const LegalityQuery &Query) { 1027 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1028 }) 1029 .narrowScalarIf( 1030 largerThan(0, 1), 1031 [](const LegalityQuery &Query) { 1032 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1033 }); 1034 1035 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1036 .scalarize(0) 1037 .custom(); 1038 1039 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1040 bool IsLoad) -> bool { 1041 const LLT DstTy = Query.Types[0]; 1042 1043 // Split vector extloads. 1044 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1045 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1046 1047 if (MemSize < DstTy.getSizeInBits()) 1048 MemSize = std::max(MemSize, AlignBits); 1049 1050 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1051 return true; 1052 1053 const LLT PtrTy = Query.Types[1]; 1054 unsigned AS = PtrTy.getAddressSpace(); 1055 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1056 return true; 1057 1058 // Catch weird sized loads that don't evenly divide into the access sizes 1059 // TODO: May be able to widen depending on alignment etc. 1060 unsigned NumRegs = (MemSize + 31) / 32; 1061 if (NumRegs == 3) { 1062 if (!ST.hasDwordx3LoadStores()) 1063 return true; 1064 } else { 1065 // If the alignment allows, these should have been widened. 
1066 if (!isPowerOf2_32(NumRegs)) 1067 return true; 1068 } 1069 1070 if (AlignBits < MemSize) { 1071 const SITargetLowering *TLI = ST.getTargetLowering(); 1072 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1073 Align(AlignBits / 8)); 1074 } 1075 1076 return false; 1077 }; 1078 1079 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1080 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1081 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1082 1083 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1084 // LDS 1085 // TODO: Unsupported flat for SI. 1086 1087 for (unsigned Op : {G_LOAD, G_STORE}) { 1088 const bool IsStore = Op == G_STORE; 1089 1090 auto &Actions = getActionDefinitionsBuilder(Op); 1091 // Explicitly list some common cases. 1092 // TODO: Does this help compile time at all? 1093 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1094 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1095 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1096 {S64, GlobalPtr, 64, GlobalAlign32}, 1097 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1098 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1099 {S32, GlobalPtr, 8, GlobalAlign8}, 1100 {S32, GlobalPtr, 16, GlobalAlign16}, 1101 1102 {S32, LocalPtr, 32, 32}, 1103 {S64, LocalPtr, 64, 32}, 1104 {V2S32, LocalPtr, 64, 32}, 1105 {S32, LocalPtr, 8, 8}, 1106 {S32, LocalPtr, 16, 16}, 1107 {V2S16, LocalPtr, 32, 32}, 1108 1109 {S32, PrivatePtr, 32, 32}, 1110 {S32, PrivatePtr, 8, 8}, 1111 {S32, PrivatePtr, 16, 16}, 1112 {V2S16, PrivatePtr, 32, 32}, 1113 1114 {S32, ConstantPtr, 32, GlobalAlign32}, 1115 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1116 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1117 {S64, ConstantPtr, 64, GlobalAlign32}, 1118 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1119 Actions.legalIf( 1120 [=](const LegalityQuery &Query) -> bool { 1121 return isLoadStoreLegal(ST, Query); 1122 }); 1123 1124 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1125 // 64-bits. 1126 // 1127 // TODO: Should generalize bitcast action into coerce, which will also cover 1128 // inserting addrspacecasts. 1129 Actions.customIf(typeIs(1, Constant32Ptr)); 1130 1131 // Turn any illegal element vectors into something easier to deal 1132 // with. These will ultimately produce 32-bit scalar shifts to extract the 1133 // parts anyway. 1134 // 1135 // For odd 16-bit element vectors, prefer to split those into pieces with 1136 // 16-bit vector parts. 1137 Actions.bitcastIf( 1138 [=](const LegalityQuery &Query) -> bool { 1139 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1140 Query.MMODescrs[0].SizeInBits); 1141 }, bitcastToRegisterType(0)); 1142 1143 if (!IsStore) { 1144 // Widen suitably aligned loads by loading extra bytes. The standard 1145 // legalization actions can't properly express widening memory operands. 
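      // For example, on a subtarget without dwordx3 load/store, an s96 load
      // known to be 128-bit aligned can instead be performed as an s128 load,
      // with the extra bits discarded (see shouldWidenLoad above).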
1146 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1147 return shouldWidenLoad(ST, Query, G_LOAD); 1148 }); 1149 } 1150 1151 // FIXME: load/store narrowing should be moved to lower action 1152 Actions 1153 .narrowScalarIf( 1154 [=](const LegalityQuery &Query) -> bool { 1155 return !Query.Types[0].isVector() && 1156 needToSplitMemOp(Query, Op == G_LOAD); 1157 }, 1158 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1159 const LLT DstTy = Query.Types[0]; 1160 const LLT PtrTy = Query.Types[1]; 1161 1162 const unsigned DstSize = DstTy.getSizeInBits(); 1163 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1164 1165 // Split extloads. 1166 if (DstSize > MemSize) 1167 return std::make_pair(0, LLT::scalar(MemSize)); 1168 1169 if (!isPowerOf2_32(DstSize)) { 1170 // We're probably decomposing an odd sized store. Try to split 1171 // to the widest type. TODO: Account for alignment. As-is it 1172 // should be OK, since the new parts will be further legalized. 1173 unsigned FloorSize = PowerOf2Floor(DstSize); 1174 return std::make_pair(0, LLT::scalar(FloorSize)); 1175 } 1176 1177 if (DstSize > 32 && (DstSize % 32 != 0)) { 1178 // FIXME: Need a way to specify non-extload of larger size if 1179 // suitably aligned. 1180 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1181 } 1182 1183 unsigned MaxSize = maxSizeForAddrSpace(ST, 1184 PtrTy.getAddressSpace(), 1185 Op == G_LOAD); 1186 if (MemSize > MaxSize) 1187 return std::make_pair(0, LLT::scalar(MaxSize)); 1188 1189 unsigned Align = Query.MMODescrs[0].AlignInBits; 1190 return std::make_pair(0, LLT::scalar(Align)); 1191 }) 1192 .fewerElementsIf( 1193 [=](const LegalityQuery &Query) -> bool { 1194 return Query.Types[0].isVector() && 1195 needToSplitMemOp(Query, Op == G_LOAD); 1196 }, 1197 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1198 const LLT DstTy = Query.Types[0]; 1199 const LLT PtrTy = Query.Types[1]; 1200 1201 LLT EltTy = DstTy.getElementType(); 1202 unsigned MaxSize = maxSizeForAddrSpace(ST, 1203 PtrTy.getAddressSpace(), 1204 Op == G_LOAD); 1205 1206 // FIXME: Handle widened to power of 2 results better. This ends 1207 // up scalarizing. 1208 // FIXME: 3 element stores scalarized on SI 1209 1210 // Split if it's too large for the address space. 1211 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1212 unsigned NumElts = DstTy.getNumElements(); 1213 unsigned EltSize = EltTy.getSizeInBits(); 1214 1215 if (MaxSize % EltSize == 0) { 1216 return std::make_pair( 1217 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1218 } 1219 1220 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1221 1222 // FIXME: Refine when odd breakdowns handled 1223 // The scalars will need to be re-legalized. 1224 if (NumPieces == 1 || NumPieces >= NumElts || 1225 NumElts % NumPieces != 0) 1226 return std::make_pair(0, EltTy); 1227 1228 return std::make_pair( 1229 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1230 } 1231 1232 // FIXME: We could probably handle weird extending loads better. 1233 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1234 if (DstTy.getSizeInBits() > MemSize) 1235 return std::make_pair(0, EltTy); 1236 1237 unsigned EltSize = EltTy.getSizeInBits(); 1238 unsigned DstSize = DstTy.getSizeInBits(); 1239 if (!isPowerOf2_32(DstSize)) { 1240 // We're probably decomposing an odd sized store. Try to split 1241 // to the widest type. TODO: Account for alignment. As-is it 1242 // should be OK, since the new parts will be further legalized. 
1243 unsigned FloorSize = PowerOf2Floor(DstSize); 1244 return std::make_pair( 1245 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1246 } 1247 1248 // Need to split because of alignment. 1249 unsigned Align = Query.MMODescrs[0].AlignInBits; 1250 if (EltSize > Align && 1251 (EltSize / Align < DstTy.getNumElements())) { 1252 return std::make_pair( 1253 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1254 } 1255 1256 // May need relegalization for the scalars. 1257 return std::make_pair(0, EltTy); 1258 }) 1259 .lowerIfMemSizeNotPow2() 1260 .minScalar(0, S32) 1261 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1262 .widenScalarToNextPow2(0) 1263 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1264 .lower(); 1265 } 1266 1267 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1268 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1269 {S32, GlobalPtr, 16, 2 * 8}, 1270 {S32, LocalPtr, 8, 8}, 1271 {S32, LocalPtr, 16, 16}, 1272 {S32, PrivatePtr, 8, 8}, 1273 {S32, PrivatePtr, 16, 16}, 1274 {S32, ConstantPtr, 8, 8}, 1275 {S32, ConstantPtr, 16, 2 * 8}}) 1276 .legalIf( 1277 [=](const LegalityQuery &Query) -> bool { 1278 return isLoadStoreLegal(ST, Query); 1279 }); 1280 1281 if (ST.hasFlatAddressSpace()) { 1282 ExtLoads.legalForTypesWithMemDesc( 1283 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1284 } 1285 1286 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1287 // 64-bits. 1288 // 1289 // TODO: Should generalize bitcast action into coerce, which will also cover 1290 // inserting addrspacecasts. 1291 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1292 1293 ExtLoads.clampScalar(0, S32, S32) 1294 .widenScalarToNextPow2(0) 1295 .unsupportedIfMemSizeNotPow2() 1296 .lower(); 1297 1298 auto &Atomics = getActionDefinitionsBuilder( 1299 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1300 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1301 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1302 G_ATOMICRMW_UMIN}) 1303 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1304 {S64, GlobalPtr}, {S64, LocalPtr}, 1305 {S32, RegionPtr}, {S64, RegionPtr}}); 1306 if (ST.hasFlatAddressSpace()) { 1307 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1308 } 1309 1310 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1311 if (ST.hasLDSFPAtomics()) { 1312 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1313 if (ST.hasGFX90AInsts()) 1314 Atomic.legalFor({{S64, LocalPtr}}); 1315 } 1316 if (ST.hasAtomicFaddInsts()) 1317 Atomic.legalFor({{S32, GlobalPtr}}); 1318 1319 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1320 // demarshalling 1321 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1322 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1323 {S32, FlatPtr}, {S64, FlatPtr}}) 1324 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1325 {S32, RegionPtr}, {S64, RegionPtr}}); 1326 // TODO: Pointer types, any 32-bit or 64-bit vector 1327 1328 // Condition should be s32 for scalar, s1 for vector. 
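  // (An s32 condition corresponds to a select on SCC, e.g. S_CSELECT_B32; an
  // s1 condition corresponds to a VCC-based V_CNDMASK select.)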
1329 getActionDefinitionsBuilder(G_SELECT) 1330 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1331 LocalPtr, FlatPtr, PrivatePtr, 1332 LLT::fixed_vector(2, LocalPtr), 1333 LLT::fixed_vector(2, PrivatePtr)}, 1334 {S1, S32}) 1335 .clampScalar(0, S16, S64) 1336 .scalarize(1) 1337 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1338 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1339 .clampMaxNumElements(0, S32, 2) 1340 .clampMaxNumElements(0, LocalPtr, 2) 1341 .clampMaxNumElements(0, PrivatePtr, 2) 1342 .scalarize(0) 1343 .widenScalarToNextPow2(0) 1344 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1345 1346 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1347 // be more flexible with the shift amount type. 1348 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1349 .legalFor({{S32, S32}, {S64, S32}}); 1350 if (ST.has16BitInsts()) { 1351 if (ST.hasVOP3PInsts()) { 1352 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1353 .clampMaxNumElements(0, S16, 2); 1354 } else 1355 Shifts.legalFor({{S16, S16}}); 1356 1357 // TODO: Support 16-bit shift amounts for all types 1358 Shifts.widenScalarIf( 1359 [=](const LegalityQuery &Query) { 1360 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1361 // 32-bit amount. 1362 const LLT ValTy = Query.Types[0]; 1363 const LLT AmountTy = Query.Types[1]; 1364 return ValTy.getSizeInBits() <= 16 && 1365 AmountTy.getSizeInBits() < 16; 1366 }, changeTo(1, S16)); 1367 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1368 Shifts.clampScalar(1, S32, S32); 1369 Shifts.clampScalar(0, S16, S64); 1370 Shifts.widenScalarToNextPow2(0, 16); 1371 1372 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1373 .minScalar(0, S16) 1374 .scalarize(0) 1375 .lower(); 1376 } else { 1377 // Make sure we legalize the shift amount type first, as the general 1378 // expansion for the shifted type will produce much worse code if it hasn't 1379 // been truncated already. 1380 Shifts.clampScalar(1, S32, S32); 1381 Shifts.clampScalar(0, S32, S64); 1382 Shifts.widenScalarToNextPow2(0, 32); 1383 1384 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1385 .minScalar(0, S32) 1386 .scalarize(0) 1387 .lower(); 1388 } 1389 Shifts.scalarize(0); 1390 1391 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1392 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1393 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1394 unsigned IdxTypeIdx = 2; 1395 1396 getActionDefinitionsBuilder(Op) 1397 .customIf([=](const LegalityQuery &Query) { 1398 const LLT EltTy = Query.Types[EltTypeIdx]; 1399 const LLT VecTy = Query.Types[VecTypeIdx]; 1400 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1401 const unsigned EltSize = EltTy.getSizeInBits(); 1402 return (EltSize == 32 || EltSize == 64) && 1403 VecTy.getSizeInBits() % 32 == 0 && 1404 VecTy.getSizeInBits() <= MaxRegisterSize && 1405 IdxTy.getSizeInBits() == 32; 1406 }) 1407 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1408 bitcastToVectorElement32(VecTypeIdx)) 1409 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1410 .bitcastIf( 1411 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1412 [=](const LegalityQuery &Query) { 1413 // For > 64-bit element types, try to turn this into a 64-bit 1414 // element vector since we may be able to do better indexing 1415 // if this is scalar. If not, fall back to 32. 
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1511 if (ST.hasVOP3PInsts()) { 1512 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1513 .customFor({V2S16, V2S16}) 1514 .lower(); 1515 } else 1516 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1517 1518 // Merge/Unmerge 1519 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1520 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1521 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1522 1523 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1524 const LLT Ty = Query.Types[TypeIdx]; 1525 if (Ty.isVector()) { 1526 const LLT &EltTy = Ty.getElementType(); 1527 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1528 return true; 1529 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1530 return true; 1531 } 1532 return false; 1533 }; 1534 1535 auto &Builder = getActionDefinitionsBuilder(Op) 1536 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1537 .lowerFor({{S16, V2S16}}) 1538 .lowerIf([=](const LegalityQuery &Query) { 1539 const LLT BigTy = Query.Types[BigTyIdx]; 1540 return BigTy.getSizeInBits() == 32; 1541 }) 1542 // Try to widen to s16 first for small types. 1543 // TODO: Only do this on targets with legal s16 shifts 1544 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1545 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1546 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1547 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1548 elementTypeIs(1, S16)), 1549 changeTo(1, V2S16)) 1550 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1551 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1552 // valid. 1553 .clampScalar(LitTyIdx, S32, S512) 1554 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1555 // Break up vectors with weird elements into scalars 1556 .fewerElementsIf( 1557 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1558 scalarize(0)) 1559 .fewerElementsIf( 1560 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1561 scalarize(1)) 1562 .clampScalar(BigTyIdx, S32, MaxScalar); 1563 1564 if (Op == G_MERGE_VALUES) { 1565 Builder.widenScalarIf( 1566 // TODO: Use 16-bit shifts if legal for 8-bit values? 1567 [=](const LegalityQuery &Query) { 1568 const LLT Ty = Query.Types[LitTyIdx]; 1569 return Ty.getSizeInBits() < 32; 1570 }, 1571 changeTo(LitTyIdx, S32)); 1572 } 1573 1574 Builder.widenScalarIf( 1575 [=](const LegalityQuery &Query) { 1576 const LLT Ty = Query.Types[BigTyIdx]; 1577 return !isPowerOf2_32(Ty.getSizeInBits()) && 1578 Ty.getSizeInBits() % 16 != 0; 1579 }, 1580 [=](const LegalityQuery &Query) { 1581 // Pick the next power of 2, or a multiple of 64 over 128. 1582 // Whichever is smaller. 1583 const LLT &Ty = Query.Types[BigTyIdx]; 1584 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1585 if (NewSizeInBits >= 256) { 1586 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1587 if (RoundedTo < NewSizeInBits) 1588 NewSizeInBits = RoundedTo; 1589 } 1590 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1591 }) 1592 // Any vectors left are the wrong size. Scalarize them. 1593 .scalarize(0) 1594 .scalarize(1); 1595 } 1596 1597 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1598 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B); 1709 case TargetOpcode::G_SHUFFLE_VECTOR: 1710 return legalizeShuffleVector(MI, MRI, B); 1711 case TargetOpcode::G_FSIN: 1712 case TargetOpcode::G_FCOS: 1713 return legalizeSinCos(MI, MRI, B); 1714 case TargetOpcode::G_GLOBAL_VALUE: 1715 return legalizeGlobalValue(MI, MRI, B); 1716 case TargetOpcode::G_LOAD: 1717 case TargetOpcode::G_SEXTLOAD: 1718 case TargetOpcode::G_ZEXTLOAD: 1719 return legalizeLoad(Helper, MI); 1720 case TargetOpcode::G_FMAD: 1721 return legalizeFMad(MI, MRI, B); 1722 case TargetOpcode::G_FDIV: 1723 return legalizeFDIV(MI, MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 default: 1747 return false; 1748 } 1749 1750 llvm_unreachable("expected switch to return"); 1751 } 1752 1753 Register AMDGPULegalizerInfo::getSegmentAperture( 1754 unsigned AS, 1755 MachineRegisterInfo &MRI, 1756 MachineIRBuilder &B) const { 1757 MachineFunction &MF = B.getMF(); 1758 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1759 const LLT S32 = LLT::scalar(32); 1760 1761 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1762 1763 if (ST.hasApertureRegs()) { 1764 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1765 // getreg. 1766 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1767 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1768 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1769 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1772 unsigned Encoding = 1773 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1774 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1775 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1776 1777 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1778 1779 B.buildInstr(AMDGPU::S_GETREG_B32) 1780 .addDef(GetReg) 1781 .addImm(Encoding); 1782 MRI.setType(GetReg, S32); 1783 1784 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1785 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1786 } 1787 1788 Register QueuePtr = MRI.createGenericVirtualRegister( 1789 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1790 1791 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1792 return Register(); 1793 1794 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1795 // private_segment_aperture_base_hi. 1796 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1797 1798 // TODO: can we be smarter about machine pointer info? 
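  // Only the high 32 bits of the aperture base are stored in amd_queue_t (the
  // aperture is 4 GiB aligned), so a single dword load is sufficient here.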
1799 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1800 MachineMemOperand *MMO = MF.getMachineMemOperand( 1801 PtrInfo, 1802 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1803 MachineMemOperand::MOInvariant, 1804 4, commonAlignment(Align(64), StructOffset)); 1805 1806 Register LoadAddr; 1807 1808 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1809 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1810 } 1811 1812 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1813 MachineInstr &MI, MachineRegisterInfo &MRI, 1814 MachineIRBuilder &B) const { 1815 MachineFunction &MF = B.getMF(); 1816 1817 const LLT S32 = LLT::scalar(32); 1818 Register Dst = MI.getOperand(0).getReg(); 1819 Register Src = MI.getOperand(1).getReg(); 1820 1821 LLT DstTy = MRI.getType(Dst); 1822 LLT SrcTy = MRI.getType(Src); 1823 unsigned DestAS = DstTy.getAddressSpace(); 1824 unsigned SrcAS = SrcTy.getAddressSpace(); 1825 1826 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1827 // vector element. 1828 assert(!DstTy.isVector()); 1829 1830 const AMDGPUTargetMachine &TM 1831 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1832 1833 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1834 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1835 return true; 1836 } 1837 1838 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1839 // Truncate. 1840 B.buildExtract(Dst, Src, 0); 1841 MI.eraseFromParent(); 1842 return true; 1843 } 1844 1845 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1846 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1847 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1848 1849 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1850 // another. Merge operands are required to be the same type, but creating an 1851 // extra ptrtoint would be kind of pointless. 1852 auto HighAddr = B.buildConstant( 1853 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1854 B.buildMerge(Dst, {Src, HighAddr}); 1855 MI.eraseFromParent(); 1856 return true; 1857 } 1858 1859 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1860 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1861 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1862 unsigned NullVal = TM.getNullPointerValue(DestAS); 1863 1864 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1865 auto FlatNull = B.buildConstant(SrcTy, 0); 1866 1867 // Extract low 32-bits of the pointer. 1868 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1869 1870 auto CmpRes = 1871 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1872 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1873 1874 MI.eraseFromParent(); 1875 return true; 1876 } 1877 1878 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1879 return false; 1880 1881 if (!ST.hasFlatAddressSpace()) 1882 return false; 1883 1884 auto SegmentNull = 1885 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1886 auto FlatNull = 1887 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1888 1889 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1890 if (!ApertureReg.isValid()) 1891 return false; 1892 1893 auto CmpRes = 1894 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1895 1896 // Coerce the type of the low half of the result so we can use merge_values. 
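  // The flat pointer is assembled as {lo = original segment offset,
  // hi = aperture base}, then selected against the flat null value below.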
1897 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1898 1899 // TODO: Should we allow mismatched types but matching sizes in merges to 1900 // avoid the ptrtoint? 1901 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1902 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1903 1904 MI.eraseFromParent(); 1905 return true; 1906 } 1907 1908 bool AMDGPULegalizerInfo::legalizeFrint( 1909 MachineInstr &MI, MachineRegisterInfo &MRI, 1910 MachineIRBuilder &B) const { 1911 Register Src = MI.getOperand(1).getReg(); 1912 LLT Ty = MRI.getType(Src); 1913 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1914 1915 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1916 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1917 1918 auto C1 = B.buildFConstant(Ty, C1Val); 1919 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1920 1921 // TODO: Should this propagate fast-math-flags? 1922 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1923 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1924 1925 auto C2 = B.buildFConstant(Ty, C2Val); 1926 auto Fabs = B.buildFAbs(Ty, Src); 1927 1928 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1929 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1930 MI.eraseFromParent(); 1931 return true; 1932 } 1933 1934 bool AMDGPULegalizerInfo::legalizeFceil( 1935 MachineInstr &MI, MachineRegisterInfo &MRI, 1936 MachineIRBuilder &B) const { 1937 1938 const LLT S1 = LLT::scalar(1); 1939 const LLT S64 = LLT::scalar(64); 1940 1941 Register Src = MI.getOperand(1).getReg(); 1942 assert(MRI.getType(Src) == S64); 1943 1944 // result = trunc(src) 1945 // if (src > 0.0 && src != result) 1946 // result += 1.0 1947 1948 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1949 1950 const auto Zero = B.buildFConstant(S64, 0.0); 1951 const auto One = B.buildFConstant(S64, 1.0); 1952 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1953 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1954 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1955 auto Add = B.buildSelect(S64, And, One, Zero); 1956 1957 // TODO: Should this propagate fast-math-flags? 
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2056 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2057 B.buildUITOFP(S64, Unmerge.getReg(1)); 2058 2059 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2060 2061 auto ThirtyTwo = B.buildConstant(S32, 32); 2062 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2063 .addUse(CvtHi.getReg(0)) 2064 .addUse(ThirtyTwo.getReg(0)); 2065 2066 // TODO: Should this propagate fast-math-flags? 2067 B.buildFAdd(Dst, LdExp, CvtLo); 2068 MI.eraseFromParent(); 2069 return true; 2070 } 2071 2072 // TODO: Copied from DAG implementation. Verify logic and document how this 2073 // actually works. 2074 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2075 MachineRegisterInfo &MRI, 2076 MachineIRBuilder &B, 2077 bool Signed) const { 2078 2079 Register Dst = MI.getOperand(0).getReg(); 2080 Register Src = MI.getOperand(1).getReg(); 2081 2082 const LLT S64 = LLT::scalar(64); 2083 const LLT S32 = LLT::scalar(32); 2084 2085 const LLT SrcLT = MRI.getType(Src); 2086 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2087 2088 unsigned Flags = MI.getFlags(); 2089 2090 // The basic idea of converting a floating point number into a pair of 32-bit 2091 // integers is illustrated as follows: 2092 // 2093 // tf := trunc(val); 2094 // hif := floor(tf * 2^-32); 2095 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2096 // hi := fptoi(hif); 2097 // lo := fptoi(lof); 2098 // 2099 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2100 MachineInstrBuilder Sign; 2101 if (Signed && SrcLT == S32) { 2102 // However, a 32-bit floating point number has only 23 bits mantissa and 2103 // it's not enough to hold all the significant bits of `lof` if val is 2104 // negative. To avoid the loss of precision, We need to take the absolute 2105 // value after truncating and flip the result back based on the original 2106 // signedness. 2107 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2108 Trunc = B.buildFAbs(S32, Trunc, Flags); 2109 } 2110 MachineInstrBuilder K0, K1; 2111 if (SrcLT == S64) { 2112 K0 = B.buildFConstant(S64, 2113 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2114 K1 = B.buildFConstant(S64, 2115 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2116 } else { 2117 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2118 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2119 } 2120 2121 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2122 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2123 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2124 2125 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2126 : B.buildFPTOUI(S32, FloorMul); 2127 auto Lo = B.buildFPTOUI(S32, Fma); 2128 2129 if (Signed && SrcLT == S32) { 2130 // Flip the result based on the signedness, which is either all 0s or 1s. 
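    // Sign is all zeros or all ones from the arithmetic shift above, so
    // (x ^ Sign) - Sign negates x when Sign is all ones and is a no-op
    // otherwise.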
2131 Sign = B.buildMerge(S64, {Sign, Sign}); 2132 // r := xor({lo, hi}, sign) - sign; 2133 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2134 } else 2135 B.buildMerge(Dst, {Lo, Hi}); 2136 MI.eraseFromParent(); 2137 2138 return true; 2139 } 2140 2141 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2142 MachineInstr &MI) const { 2143 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2144 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2145 2146 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2147 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2148 2149 // With ieee_mode disabled, the instructions have the correct behavior 2150 // already for G_FMINNUM/G_FMAXNUM 2151 if (!MFI->getMode().IEEE) 2152 return !IsIEEEOp; 2153 2154 if (IsIEEEOp) 2155 return true; 2156 2157 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2158 } 2159 2160 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2161 MachineInstr &MI, MachineRegisterInfo &MRI, 2162 MachineIRBuilder &B) const { 2163 // TODO: Should move some of this into LegalizerHelper. 2164 2165 // TODO: Promote dynamic indexing of s16 to s32 2166 2167 // FIXME: Artifact combiner probably should have replaced the truncated 2168 // constant before this, so we shouldn't need 2169 // getConstantVRegValWithLookThrough. 2170 Optional<ValueAndVReg> MaybeIdxVal = 2171 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2172 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2173 return true; 2174 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2175 2176 Register Dst = MI.getOperand(0).getReg(); 2177 Register Vec = MI.getOperand(1).getReg(); 2178 2179 LLT VecTy = MRI.getType(Vec); 2180 LLT EltTy = VecTy.getElementType(); 2181 assert(EltTy == MRI.getType(Dst)); 2182 2183 if (IdxVal < VecTy.getNumElements()) 2184 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2185 else 2186 B.buildUndef(Dst); 2187 2188 MI.eraseFromParent(); 2189 return true; 2190 } 2191 2192 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2193 MachineInstr &MI, MachineRegisterInfo &MRI, 2194 MachineIRBuilder &B) const { 2195 // TODO: Should move some of this into LegalizerHelper. 2196 2197 // TODO: Promote dynamic indexing of s16 to s32 2198 2199 // FIXME: Artifact combiner probably should have replaced the truncated 2200 // constant before this, so we shouldn't need 2201 // getConstantVRegValWithLookThrough. 2202 Optional<ValueAndVReg> MaybeIdxVal = 2203 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2204 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2205 return true; 2206 2207 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2208 Register Dst = MI.getOperand(0).getReg(); 2209 Register Vec = MI.getOperand(1).getReg(); 2210 Register Ins = MI.getOperand(2).getReg(); 2211 2212 LLT VecTy = MRI.getType(Vec); 2213 LLT EltTy = VecTy.getElementType(); 2214 assert(EltTy == MRI.getType(Ins)); 2215 2216 if (IdxVal < VecTy.getNumElements()) 2217 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2218 else 2219 B.buildUndef(Dst); 2220 2221 MI.eraseFromParent(); 2222 return true; 2223 } 2224 2225 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2226 MachineInstr &MI, MachineRegisterInfo &MRI, 2227 MachineIRBuilder &B) const { 2228 const LLT V2S16 = LLT::fixed_vector(2, 16); 2229 2230 Register Dst = MI.getOperand(0).getReg(); 2231 Register Src0 = MI.getOperand(1).getReg(); 2232 LLT DstTy = MRI.getType(Dst); 2233 LLT SrcTy = MRI.getType(Src0); 2234 2235 if (SrcTy == V2S16 && DstTy == V2S16 && 2236 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2237 return true; 2238 2239 MachineIRBuilder HelperBuilder(MI); 2240 GISelObserverWrapper DummyObserver; 2241 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2242 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2243 } 2244 2245 bool AMDGPULegalizerInfo::legalizeSinCos( 2246 MachineInstr &MI, MachineRegisterInfo &MRI, 2247 MachineIRBuilder &B) const { 2248 2249 Register DstReg = MI.getOperand(0).getReg(); 2250 Register SrcReg = MI.getOperand(1).getReg(); 2251 LLT Ty = MRI.getType(DstReg); 2252 unsigned Flags = MI.getFlags(); 2253 2254 Register TrigVal; 2255 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2256 if (ST.hasTrigReducedRange()) { 2257 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2258 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2259 .addUse(MulVal.getReg(0)) 2260 .setMIFlags(Flags).getReg(0); 2261 } else 2262 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2263 2264 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2265 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2266 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2267 .addUse(TrigVal) 2268 .setMIFlags(Flags); 2269 MI.eraseFromParent(); 2270 return true; 2271 } 2272 2273 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2274 MachineIRBuilder &B, 2275 const GlobalValue *GV, 2276 int64_t Offset, 2277 unsigned GAFlags) const { 2278 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2279 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2280 // to the following code sequence: 2281 // 2282 // For constant address space: 2283 // s_getpc_b64 s[0:1] 2284 // s_add_u32 s0, s0, $symbol 2285 // s_addc_u32 s1, s1, 0 2286 // 2287 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2288 // a fixup or relocation is emitted to replace $symbol with a literal 2289 // constant, which is a pc-relative offset from the encoding of the $symbol 2290 // operand to the global variable. 
2291 // 2292 // For global address space: 2293 // s_getpc_b64 s[0:1] 2294 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2295 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2296 // 2297 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2298 // fixups or relocations are emitted to replace $symbol@*@lo and 2299 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2300 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2301 // operand to the global variable. 2302 // 2303 // What we want here is an offset from the value returned by s_getpc 2304 // (which is the address of the s_add_u32 instruction) to the global 2305 // variable, but since the encoding of $symbol starts 4 bytes after the start 2306 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2307 // small. This requires us to add 4 to the global variable offset in order to 2308 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2309 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2310 // instruction. 2311 2312 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2313 2314 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2315 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2316 2317 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2318 .addDef(PCReg); 2319 2320 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2321 if (GAFlags == SIInstrInfo::MO_NONE) 2322 MIB.addImm(0); 2323 else 2324 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2325 2326 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2327 2328 if (PtrTy.getSizeInBits() == 32) 2329 B.buildExtract(DstReg, PCReg, 0); 2330 return true; 2331 } 2332 2333 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2334 MachineInstr &MI, MachineRegisterInfo &MRI, 2335 MachineIRBuilder &B) const { 2336 Register DstReg = MI.getOperand(0).getReg(); 2337 LLT Ty = MRI.getType(DstReg); 2338 unsigned AS = Ty.getAddressSpace(); 2339 2340 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2341 MachineFunction &MF = B.getMF(); 2342 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2343 2344 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2345 if (!MFI->isModuleEntryFunction() && 2346 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2347 const Function &Fn = MF.getFunction(); 2348 DiagnosticInfoUnsupported BadLDSDecl( 2349 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2350 DS_Warning); 2351 Fn.getContext().diagnose(BadLDSDecl); 2352 2353 // We currently don't have a way to correctly allocate LDS objects that 2354 // aren't directly associated with a kernel. We do force inlining of 2355 // functions that use local objects. However, if these dead functions are 2356 // not eliminated, we don't want a compile time error. Just emit a warning 2357 // and a trap, since there should be no callable path here. 2358 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2359 B.buildUndef(DstReg); 2360 MI.eraseFromParent(); 2361 return true; 2362 } 2363 2364 // TODO: We could emit code to handle the initialization somewhere. 
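    // There is currently no way to materialize an LDS initializer, so any
    // global in this address space with a real initializer is diagnosed below.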
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. They will be
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
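    // The GOT entry holds a full 64-bit pointer; only the low half is needed
    // for a 32-bit address space.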
2432 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2433 B.buildExtract(DstReg, Load, 0); 2434 } else 2435 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2436 2437 MI.eraseFromParent(); 2438 return true; 2439 } 2440 2441 static LLT widenToNextPowerOf2(LLT Ty) { 2442 if (Ty.isVector()) 2443 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2444 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2445 } 2446 2447 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2448 MachineInstr &MI) const { 2449 MachineIRBuilder &B = Helper.MIRBuilder; 2450 MachineRegisterInfo &MRI = *B.getMRI(); 2451 GISelChangeObserver &Observer = Helper.Observer; 2452 2453 Register PtrReg = MI.getOperand(1).getReg(); 2454 LLT PtrTy = MRI.getType(PtrReg); 2455 unsigned AddrSpace = PtrTy.getAddressSpace(); 2456 2457 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2458 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2459 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2460 Observer.changingInstr(MI); 2461 MI.getOperand(1).setReg(Cast.getReg(0)); 2462 Observer.changedInstr(MI); 2463 return true; 2464 } 2465 2466 if (MI.getOpcode() != AMDGPU::G_LOAD) 2467 return false; 2468 2469 Register ValReg = MI.getOperand(0).getReg(); 2470 LLT ValTy = MRI.getType(ValReg); 2471 2472 MachineMemOperand *MMO = *MI.memoperands_begin(); 2473 const unsigned ValSize = ValTy.getSizeInBits(); 2474 const unsigned MemSize = 8 * MMO->getSize(); 2475 const Align MemAlign = MMO->getAlign(); 2476 const unsigned AlignInBits = 8 * MemAlign.value(); 2477 2478 // Widen non-power-of-2 loads to the alignment if needed 2479 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2480 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2481 2482 // This was already the correct extending load result type, so just adjust 2483 // the memory type. 2484 if (WideMemSize == ValSize) { 2485 MachineFunction &MF = B.getMF(); 2486 2487 MachineMemOperand *WideMMO = 2488 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2489 Observer.changingInstr(MI); 2490 MI.setMemRefs(MF, {WideMMO}); 2491 Observer.changedInstr(MI); 2492 return true; 2493 } 2494 2495 // Don't bother handling edge case that should probably never be produced. 2496 if (ValSize > WideMemSize) 2497 return false; 2498 2499 LLT WideTy = widenToNextPowerOf2(ValTy); 2500 2501 Register WideLoad; 2502 if (!WideTy.isVector()) { 2503 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2504 B.buildTrunc(ValReg, WideLoad).getReg(0); 2505 } else { 2506 // Extract the subvector. 2507 2508 if (isRegisterType(ValTy)) { 2509 // If this a case where G_EXTRACT is legal, use it. 2510 // (e.g. <3 x s32> -> <4 x s32>) 2511 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2512 B.buildExtract(ValReg, WideLoad, 0); 2513 } else { 2514 // For cases where the widened type isn't a nice register value, unmerge 2515 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2516 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2517 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2518 B.setInsertPt(B.getMBB(), MI.getIterator()); 2519 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2520 } 2521 } 2522 2523 MI.eraseFromParent(); 2524 return true; 2525 } 2526 2527 return false; 2528 } 2529 2530 bool AMDGPULegalizerInfo::legalizeFMad( 2531 MachineInstr &MI, MachineRegisterInfo &MRI, 2532 MachineIRBuilder &B) const { 2533 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2534 assert(Ty.isScalar()); 2535 2536 MachineFunction &MF = B.getMF(); 2537 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2538 2539 // TODO: Always legal with future ftz flag. 2540 // FIXME: Do we need just output? 2541 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2542 return true; 2543 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2544 return true; 2545 2546 MachineIRBuilder HelperBuilder(MI); 2547 GISelObserverWrapper DummyObserver; 2548 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2549 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2553 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2554 Register DstReg = MI.getOperand(0).getReg(); 2555 Register PtrReg = MI.getOperand(1).getReg(); 2556 Register CmpVal = MI.getOperand(2).getReg(); 2557 Register NewVal = MI.getOperand(3).getReg(); 2558 2559 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2560 "this should not have been custom lowered"); 2561 2562 LLT ValTy = MRI.getType(CmpVal); 2563 LLT VecTy = LLT::fixed_vector(2, ValTy); 2564 2565 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2566 2567 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2568 .addDef(DstReg) 2569 .addUse(PtrReg) 2570 .addUse(PackedVal) 2571 .setMemRefs(MI.memoperands()); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 2577 bool AMDGPULegalizerInfo::legalizeFlog( 2578 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2579 Register Dst = MI.getOperand(0).getReg(); 2580 Register Src = MI.getOperand(1).getReg(); 2581 LLT Ty = B.getMRI()->getType(Dst); 2582 unsigned Flags = MI.getFlags(); 2583 2584 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2585 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2586 2587 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2588 MI.eraseFromParent(); 2589 return true; 2590 } 2591 2592 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2593 MachineIRBuilder &B) const { 2594 Register Dst = MI.getOperand(0).getReg(); 2595 Register Src = MI.getOperand(1).getReg(); 2596 unsigned Flags = MI.getFlags(); 2597 LLT Ty = B.getMRI()->getType(Dst); 2598 2599 auto K = B.buildFConstant(Ty, numbers::log2e); 2600 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2601 B.buildFExp2(Dst, Mul, Flags); 2602 MI.eraseFromParent(); 2603 return true; 2604 } 2605 2606 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2607 MachineIRBuilder &B) const { 2608 Register Dst = MI.getOperand(0).getReg(); 2609 Register Src0 = MI.getOperand(1).getReg(); 2610 Register Src1 = MI.getOperand(2).getReg(); 2611 unsigned Flags = MI.getFlags(); 2612 LLT Ty = B.getMRI()->getType(Dst); 2613 const LLT S16 = LLT::scalar(16); 2614 const LLT S32 = LLT::scalar(32); 2615 2616 if (Ty == S32) { 2617 auto Log = B.buildFLog2(S32, Src0, Flags); 2618 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2619 .addUse(Log.getReg(0)) 2620 .addUse(Src1) 2621 .setMIFlags(Flags); 2622 B.buildFExp2(Dst, Mul, Flags); 2623 } else if (Ty == S16) { 2624 // There's no f16 fmul_legacy, so we need to convert for it. 2625 auto Log = B.buildFLog2(S16, Src0, Flags); 2626 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2627 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2628 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2629 .addUse(Ext0.getReg(0)) 2630 .addUse(Ext1.getReg(0)) 2631 .setMIFlags(Flags); 2632 2633 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2634 } else 2635 return false; 2636 2637 MI.eraseFromParent(); 2638 return true; 2639 } 2640 2641 // Find a source register, ignoring any possible source modifiers. 2642 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2643 Register ModSrc = OrigSrc; 2644 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2645 ModSrc = SrcFNeg->getOperand(1).getReg(); 2646 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2647 ModSrc = SrcFAbs->getOperand(1).getReg(); 2648 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2649 ModSrc = SrcFAbs->getOperand(1).getReg(); 2650 return ModSrc; 2651 } 2652 2653 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2654 MachineRegisterInfo &MRI, 2655 MachineIRBuilder &B) const { 2656 2657 const LLT S1 = LLT::scalar(1); 2658 const LLT S64 = LLT::scalar(64); 2659 Register Dst = MI.getOperand(0).getReg(); 2660 Register OrigSrc = MI.getOperand(1).getReg(); 2661 unsigned Flags = MI.getFlags(); 2662 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2663 "this should not have been custom lowered"); 2664 2665 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2666 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2667 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2668 // V_FRACT bug is: 2669 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2670 // 2671 // Convert floor(x) to (x - fract(x)) 2672 2673 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2674 .addUse(OrigSrc) 2675 .setMIFlags(Flags); 2676 2677 // Give source modifier matching some assistance before obscuring a foldable 2678 // pattern. 2679 2680 // TODO: We can avoid the neg on the fract? The input sign to fract 2681 // shouldn't matter? 2682 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2683 2684 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2685 2686 Register Min = MRI.createGenericVirtualRegister(S64); 2687 2688 // We don't need to concern ourselves with the snan handling difference, so 2689 // use the one which will directly select. 
2690 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2691 if (MFI->getMode().IEEE) 2692 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2693 else 2694 B.buildFMinNum(Min, Fract, Const, Flags); 2695 2696 Register CorrectedFract = Min; 2697 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2698 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2699 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2700 } 2701 2702 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2703 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2704 2705 MI.eraseFromParent(); 2706 return true; 2707 } 2708 2709 // Turn an illegal packed v2s16 build vector into bit operations. 2710 // TODO: This should probably be a bitcast action in LegalizerHelper. 2711 bool AMDGPULegalizerInfo::legalizeBuildVector( 2712 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2713 Register Dst = MI.getOperand(0).getReg(); 2714 const LLT S32 = LLT::scalar(32); 2715 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2716 2717 Register Src0 = MI.getOperand(1).getReg(); 2718 Register Src1 = MI.getOperand(2).getReg(); 2719 assert(MRI.getType(Src0) == LLT::scalar(16)); 2720 2721 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2722 B.buildBitcast(Dst, Merge); 2723 2724 MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 // Check that this is a G_XOR x, -1 2729 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2730 if (MI.getOpcode() != TargetOpcode::G_XOR) 2731 return false; 2732 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2733 return ConstVal && *ConstVal == -1; 2734 } 2735 2736 // Return the use branch instruction, otherwise null if the usage is invalid. 2737 static MachineInstr * 2738 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2739 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2740 Register CondDef = MI.getOperand(0).getReg(); 2741 if (!MRI.hasOneNonDBGUse(CondDef)) 2742 return nullptr; 2743 2744 MachineBasicBlock *Parent = MI.getParent(); 2745 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2746 2747 if (isNot(MRI, *UseMI)) { 2748 Register NegatedCond = UseMI->getOperand(0).getReg(); 2749 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2750 return nullptr; 2751 2752 // We're deleting the def of this value, so we need to remove it. 2753 UseMI->eraseFromParent(); 2754 2755 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2756 Negated = true; 2757 } 2758 2759 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2760 return nullptr; 2761 2762 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2763 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2764 if (Next == Parent->end()) { 2765 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2766 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2767 return nullptr; 2768 UncondBrTarget = &*NextMBB; 2769 } else { 2770 if (Next->getOpcode() != AMDGPU::G_BR) 2771 return nullptr; 2772 Br = &*Next; 2773 UncondBrTarget = Br->getOperand(0).getMBB(); 2774 } 2775 2776 return UseMI; 2777 } 2778 2779 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2780 const ArgDescriptor *Arg, 2781 const TargetRegisterClass *ArgRC, 2782 LLT ArgTy) const { 2783 MCRegister SrcReg = Arg->getRegister(); 2784 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2785 assert(DstReg.isVirtual() && "Virtual register expected"); 2786 2787 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2788 ArgTy); 2789 if (Arg->isMasked()) { 2790 // TODO: Should we try to emit this once in the entry block? 2791 const LLT S32 = LLT::scalar(32); 2792 const unsigned Mask = Arg->getMask(); 2793 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2794 2795 Register AndMaskSrc = LiveIn; 2796 2797 if (Shift != 0) { 2798 auto ShiftAmt = B.buildConstant(S32, Shift); 2799 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2800 } 2801 2802 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2803 } else { 2804 B.buildCopy(DstReg, LiveIn); 2805 } 2806 2807 return true; 2808 } 2809 2810 bool AMDGPULegalizerInfo::loadInputValue( 2811 Register DstReg, MachineIRBuilder &B, 2812 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2813 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2814 const ArgDescriptor *Arg; 2815 const TargetRegisterClass *ArgRC; 2816 LLT ArgTy; 2817 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2818 2819 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2820 return false; // TODO: Handle these 2821 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2822 } 2823 2824 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2825 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2826 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2827 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2828 return false; 2829 2830 MI.eraseFromParent(); 2831 return true; 2832 } 2833 2834 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2835 MachineRegisterInfo &MRI, 2836 MachineIRBuilder &B) const { 2837 Register Dst = MI.getOperand(0).getReg(); 2838 LLT DstTy = MRI.getType(Dst); 2839 LLT S16 = LLT::scalar(16); 2840 LLT S32 = LLT::scalar(32); 2841 LLT S64 = LLT::scalar(64); 2842 2843 if (DstTy == S16) 2844 return legalizeFDIV16(MI, MRI, B); 2845 if (DstTy == S32) 2846 return legalizeFDIV32(MI, MRI, B); 2847 if (DstTy == S64) 2848 return legalizeFDIV64(MI, MRI, B); 2849 2850 return false; 2851 } 2852 2853 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2854 Register DstDivReg, 2855 Register DstRemReg, 2856 Register X, 2857 Register Y) const { 2858 const LLT S1 = LLT::scalar(1); 2859 const LLT S32 = LLT::scalar(32); 2860 2861 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2862 // algorithm used here. 2863 2864 // Initial estimate of inv(y). 2865 auto FloatY = B.buildUITOFP(S32, Y); 2866 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2867 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2868 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2869 auto Z = B.buildFPTOUI(S32, ScaledY); 2870 2871 // One round of UNR. 
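  // UNR = unsigned Newton-Raphson. With z ~= 2^32/y, the error e = 2^32 - y*z
  // is computed as -y*z (mod 2^32), and the estimate is refined with
  // z += umulh(z, e).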
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Returns {lo, hi} of the result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2970 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2971 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2972 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2973 2974 auto Zero32 = B.buildConstant(S32, 0); 2975 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2976 auto Add2_HiC = 2977 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2978 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2979 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2980 2981 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2982 Register NumerLo = UnmergeNumer.getReg(0); 2983 Register NumerHi = UnmergeNumer.getReg(1); 2984 2985 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2986 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2987 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2988 Register Mul3_Lo = UnmergeMul3.getReg(0); 2989 Register Mul3_Hi = UnmergeMul3.getReg(1); 2990 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2991 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2992 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2993 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2994 2995 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2996 Register DenomLo = UnmergeDenom.getReg(0); 2997 Register DenomHi = UnmergeDenom.getReg(1); 2998 2999 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3000 auto C1 = B.buildSExt(S32, CmpHi); 3001 3002 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3003 auto C2 = B.buildSExt(S32, CmpLo); 3004 3005 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3006 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3007 3008 // TODO: Here and below portions of the code can be enclosed into if/endif. 3009 // Currently control flow is unconditional and we have 4 selects after 3010 // potential endif to substitute PHIs. 3011 3012 // if C3 != 0 ... 
3013 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3014 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3015 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3016 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3017 3018 auto One64 = B.buildConstant(S64, 1); 3019 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3020 3021 auto C4 = 3022 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3023 auto C5 = 3024 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3025 auto C6 = B.buildSelect( 3026 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3027 3028 // if (C6 != 0) 3029 auto Add4 = B.buildAdd(S64, Add3, One64); 3030 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3031 3032 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3033 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3034 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3035 3036 // endif C6 3037 // endif C3 3038 3039 if (DstDivReg) { 3040 auto Sel1 = B.buildSelect( 3041 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3042 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3043 Sel1, MulHi3); 3044 } 3045 3046 if (DstRemReg) { 3047 auto Sel2 = B.buildSelect( 3048 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3049 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3050 Sel2, Sub1); 3051 } 3052 } 3053 3054 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3055 MachineRegisterInfo &MRI, 3056 MachineIRBuilder &B) const { 3057 Register DstDivReg, DstRemReg; 3058 switch (MI.getOpcode()) { 3059 default: 3060 llvm_unreachable("Unexpected opcode!"); 3061 case AMDGPU::G_UDIV: { 3062 DstDivReg = MI.getOperand(0).getReg(); 3063 break; 3064 } 3065 case AMDGPU::G_UREM: { 3066 DstRemReg = MI.getOperand(0).getReg(); 3067 break; 3068 } 3069 case AMDGPU::G_UDIVREM: { 3070 DstDivReg = MI.getOperand(0).getReg(); 3071 DstRemReg = MI.getOperand(1).getReg(); 3072 break; 3073 } 3074 } 3075 3076 const LLT S64 = LLT::scalar(64); 3077 const LLT S32 = LLT::scalar(32); 3078 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3079 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3080 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3081 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3082 3083 if (Ty == S32) 3084 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3085 else if (Ty == S64) 3086 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3087 else 3088 return false; 3089 3090 MI.eraseFromParent(); 3091 return true; 3092 } 3093 3094 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3095 MachineRegisterInfo &MRI, 3096 MachineIRBuilder &B) const { 3097 const LLT S64 = LLT::scalar(64); 3098 const LLT S32 = LLT::scalar(32); 3099 3100 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3101 if (Ty != S32 && Ty != S64) 3102 return false; 3103 3104 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3105 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3106 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3107 3108 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3109 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3110 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3111 3112 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3113 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3114 3115 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3116 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3117 3118 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3119 switch (MI.getOpcode()) { 3120 default: 3121 llvm_unreachable("Unexpected opcode!"); 3122 case AMDGPU::G_SDIV: { 3123 DstDivReg = MI.getOperand(0).getReg(); 3124 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3125 break; 3126 } 3127 case AMDGPU::G_SREM: { 3128 DstRemReg = MI.getOperand(0).getReg(); 3129 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3130 break; 3131 } 3132 case AMDGPU::G_SDIVREM: { 3133 DstDivReg = MI.getOperand(0).getReg(); 3134 DstRemReg = MI.getOperand(1).getReg(); 3135 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3136 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3137 break; 3138 } 3139 } 3140 3141 if (Ty == S32) 3142 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3143 else 3144 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3145 3146 if (DstDivReg) { 3147 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3148 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3149 B.buildSub(DstDivReg, SignXor, Sign); 3150 } 3151 3152 if (DstRemReg) { 3153 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3154 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3155 B.buildSub(DstRemReg, SignXor, Sign); 3156 } 3157 3158 MI.eraseFromParent(); 3159 return true; 3160 } 3161 3162 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3163 MachineRegisterInfo &MRI, 3164 MachineIRBuilder &B) const { 3165 Register Res = MI.getOperand(0).getReg(); 3166 Register LHS = MI.getOperand(1).getReg(); 3167 Register RHS = MI.getOperand(2).getReg(); 3168 uint16_t Flags = MI.getFlags(); 3169 LLT ResTy = MRI.getType(Res); 3170 3171 const MachineFunction &MF = B.getMF(); 3172 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3173 MI.getFlag(MachineInstr::FmAfn); 3174 3175 if (!AllowInaccurateRcp) 3176 return false; 3177 3178 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3179 // 1 / x -> RCP(x) 3180 if (CLHS->isExactlyValue(1.0)) { 3181 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3182 .addUse(RHS) 3183 .setMIFlags(Flags); 3184 3185 MI.eraseFromParent(); 3186 return true; 3187 } 3188 3189 // -1 / x -> RCP( FNEG(x) ) 3190 if (CLHS->isExactlyValue(-1.0)) { 3191 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3192 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3193 .addUse(FNeg.getReg(0)) 3194 .setMIFlags(Flags); 3195 3196 MI.eraseFromParent(); 3197 return true; 3198 } 3199 } 3200 3201 // x / y -> x * (1.0 / y) 3202 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3203 .addUse(RHS) 3204 .setMIFlags(Flags); 3205 B.buildFMul(Res, LHS, RCP, Flags); 3206 3207 MI.eraseFromParent(); 3208 return true; 3209 } 3210 3211 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3212 MachineRegisterInfo &MRI, 3213 MachineIRBuilder &B) const { 3214 Register Res = MI.getOperand(0).getReg(); 3215 Register X = MI.getOperand(1).getReg(); 3216 Register Y = MI.getOperand(2).getReg(); 3217 uint16_t Flags = MI.getFlags(); 3218 LLT ResTy = MRI.getType(Res); 3219 3220 const MachineFunction &MF = B.getMF(); 3221 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3222 MI.getFlag(MachineInstr::FmAfn); 3223 3224 if (!AllowInaccurateRcp) 3225 return false; 3226 3227 auto NegY = B.buildFNeg(ResTy, Y); 3228 auto One = B.buildFConstant(ResTy, 1.0); 3229 3230 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3231 .addUse(Y) 3232 .setMIFlags(Flags); 3233 3234 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3235 R = B.buildFMA(ResTy, Tmp0, R, R); 3236 3237 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3238 R = B.buildFMA(ResTy, Tmp1, R, R); 3239 3240 auto Ret = B.buildFMul(ResTy, X, R); 3241 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3242 3243 B.buildFMA(Res, Tmp2, R, Ret); 3244 MI.eraseFromParent(); 3245 return true; 3246 } 3247 3248 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3249 MachineRegisterInfo &MRI, 3250 MachineIRBuilder &B) const { 3251 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3252 return true; 3253 3254 Register Res = MI.getOperand(0).getReg(); 3255 Register LHS = MI.getOperand(1).getReg(); 3256 Register RHS = MI.getOperand(2).getReg(); 3257 3258 uint16_t Flags = MI.getFlags(); 3259 3260 LLT S16 = LLT::scalar(16); 3261 LLT S32 = LLT::scalar(32); 3262 3263 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3264 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3265 3266 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3267 .addUse(RHSExt.getReg(0)) 3268 .setMIFlags(Flags); 3269 3270 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3271 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3272 3273 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3274 .addUse(RDst.getReg(0)) 3275 .addUse(RHS) 3276 .addUse(LHS) 3277 .setMIFlags(Flags); 3278 3279 MI.eraseFromParent(); 3280 return true; 3281 } 3282 3283 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3284 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3285 static void toggleSPDenormMode(bool Enable, 3286 MachineIRBuilder &B, 3287 const GCNSubtarget &ST, 3288 AMDGPU::SIModeRegisterDefaults Mode) { 3289 // Set SP denorm mode to this value. 3290 unsigned SPDenormMode = 3291 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3292 3293 if (ST.hasDenormModeInst()) { 3294 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3295 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3296 3297 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3298 B.buildInstr(AMDGPU::S_DENORM_MODE) 3299 .addImm(NewDenormModeValue); 3300 3301 } else { 3302 // Select FP32 bit field in mode register. 
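    // The FP32 denorm controls occupy bits 4:5 of the MODE register, so this
    // encodes hwreg(HW_REG_MODE, 4, 2): offset 4, WIDTH_M1 = 1 (2-bit field).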
3303 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3304 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3305 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3306 3307 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3308 .addImm(SPDenormMode) 3309 .addImm(SPDenormModeBitField); 3310 } 3311 } 3312 3313 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3314 MachineRegisterInfo &MRI, 3315 MachineIRBuilder &B) const { 3316 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3317 return true; 3318 3319 Register Res = MI.getOperand(0).getReg(); 3320 Register LHS = MI.getOperand(1).getReg(); 3321 Register RHS = MI.getOperand(2).getReg(); 3322 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3323 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3324 3325 uint16_t Flags = MI.getFlags(); 3326 3327 LLT S32 = LLT::scalar(32); 3328 LLT S1 = LLT::scalar(1); 3329 3330 auto One = B.buildFConstant(S32, 1.0f); 3331 3332 auto DenominatorScaled = 3333 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3334 .addUse(LHS) 3335 .addUse(RHS) 3336 .addImm(0) 3337 .setMIFlags(Flags); 3338 auto NumeratorScaled = 3339 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3340 .addUse(LHS) 3341 .addUse(RHS) 3342 .addImm(1) 3343 .setMIFlags(Flags); 3344 3345 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3346 .addUse(DenominatorScaled.getReg(0)) 3347 .setMIFlags(Flags); 3348 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3349 3350 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3351 // aren't modeled as reading it. 3352 if (!Mode.allFP32Denormals()) 3353 toggleSPDenormMode(true, B, ST, Mode); 3354 3355 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3356 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3357 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3358 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3359 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3360 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3361 3362 if (!Mode.allFP32Denormals()) 3363 toggleSPDenormMode(false, B, ST, Mode); 3364 3365 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3366 .addUse(Fma4.getReg(0)) 3367 .addUse(Fma1.getReg(0)) 3368 .addUse(Fma3.getReg(0)) 3369 .addUse(NumeratorScaled.getReg(1)) 3370 .setMIFlags(Flags); 3371 3372 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3373 .addUse(Fmas.getReg(0)) 3374 .addUse(RHS) 3375 .addUse(LHS) 3376 .setMIFlags(Flags); 3377 3378 MI.eraseFromParent(); 3379 return true; 3380 } 3381 3382 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3383 MachineRegisterInfo &MRI, 3384 MachineIRBuilder &B) const { 3385 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3386 return true; 3387 3388 Register Res = MI.getOperand(0).getReg(); 3389 Register LHS = MI.getOperand(1).getReg(); 3390 Register RHS = MI.getOperand(2).getReg(); 3391 3392 uint16_t Flags = MI.getFlags(); 3393 3394 LLT S64 = LLT::scalar(64); 3395 LLT S1 = LLT::scalar(1); 3396 3397 auto One = B.buildFConstant(S64, 1.0); 3398 3399 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3400 .addUse(LHS) 3401 .addUse(RHS) 3402 .addImm(0) 3403 .setMIFlags(Flags); 3404 3405 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3406 3407 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3408 .addUse(DivScale0.getReg(0)) 3409 .setMIFlags(Flags); 3410 3411 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3412 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3413 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3414 3415 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3416 .addUse(LHS) 3417 .addUse(RHS) 3418 .addImm(1) 3419 .setMIFlags(Flags); 3420 3421 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3422 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3423 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3424 3425 Register Scale; 3426 if (!ST.hasUsableDivScaleConditionOutput()) { 3427 // Workaround a hardware bug on SI where the condition output from div_scale 3428 // is not usable. 3429 3430 LLT S32 = LLT::scalar(32); 3431 3432 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3433 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3434 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3435 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3436 3437 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3438 Scale1Unmerge.getReg(1)); 3439 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3440 Scale0Unmerge.getReg(1)); 3441 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3442 } else { 3443 Scale = DivScale1.getReg(1); 3444 } 3445 3446 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3447 .addUse(Fma4.getReg(0)) 3448 .addUse(Fma3.getReg(0)) 3449 .addUse(Mul.getReg(0)) 3450 .addUse(Scale) 3451 .setMIFlags(Flags); 3452 3453 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3454 .addUse(Fmas.getReg(0)) 3455 .addUse(RHS) 3456 .addUse(LHS) 3457 .setMIFlags(Flags); 3458 3459 MI.eraseFromParent(); 3460 return true; 3461 } 3462 3463 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3464 MachineRegisterInfo &MRI, 3465 MachineIRBuilder &B) const { 3466 Register Res = MI.getOperand(0).getReg(); 3467 Register LHS = MI.getOperand(2).getReg(); 3468 Register RHS = MI.getOperand(3).getReg(); 3469 uint16_t Flags = MI.getFlags(); 3470 3471 LLT S32 = LLT::scalar(32); 3472 LLT S1 = LLT::scalar(1); 3473 3474 auto Abs = B.buildFAbs(S32, RHS, Flags); 3475 const APFloat C0Val(1.0f); 3476 3477 auto C0 = B.buildConstant(S32, 0x6f800000); 3478 auto C1 = B.buildConstant(S32, 0x2f800000); 3479 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3480 3481 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3482 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3483 3484 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3485 3486 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3487 .addUse(Mul0.getReg(0)) 3488 .setMIFlags(Flags); 3489 3490 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3491 3492 B.buildFMul(Res, Sel, Mul1, Flags); 3493 3494 MI.eraseFromParent(); 3495 return true; 3496 } 3497 3498 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3499 // FIXME: Why do we handle this one but not other removed instructions? 3500 // 3501 // Reciprocal square root. The clamp prevents infinite results, clamping 3502 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3503 // +-max_float. 
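//
// The expansion below emits amdgcn.rsq and then clamps the result to
// +/-max_float with fminnum/fmaxnum (or their IEEE variants, depending on the
// function's FP mode).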
3504 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3505 MachineRegisterInfo &MRI, 3506 MachineIRBuilder &B) const { 3507 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3508 return true; 3509 3510 Register Dst = MI.getOperand(0).getReg(); 3511 Register Src = MI.getOperand(2).getReg(); 3512 auto Flags = MI.getFlags(); 3513 3514 LLT Ty = MRI.getType(Dst); 3515 3516 const fltSemantics *FltSemantics; 3517 if (Ty == LLT::scalar(32)) 3518 FltSemantics = &APFloat::IEEEsingle(); 3519 else if (Ty == LLT::scalar(64)) 3520 FltSemantics = &APFloat::IEEEdouble(); 3521 else 3522 return false; 3523 3524 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3525 .addUse(Src) 3526 .setMIFlags(Flags); 3527 3528 // We don't need to concern ourselves with the snan handling difference, since 3529 // the rsq quieted (or not) so use the one which will directly select. 3530 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3531 const bool UseIEEE = MFI->getMode().IEEE; 3532 3533 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3534 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3535 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3536 3537 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3538 3539 if (UseIEEE) 3540 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3541 else 3542 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3543 MI.eraseFromParent(); 3544 return true; 3545 } 3546 3547 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3548 switch (IID) { 3549 case Intrinsic::amdgcn_ds_fadd: 3550 return AMDGPU::G_ATOMICRMW_FADD; 3551 case Intrinsic::amdgcn_ds_fmin: 3552 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3553 case Intrinsic::amdgcn_ds_fmax: 3554 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3555 default: 3556 llvm_unreachable("not a DS FP intrinsic"); 3557 } 3558 } 3559 3560 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3561 MachineInstr &MI, 3562 Intrinsic::ID IID) const { 3563 GISelChangeObserver &Observer = Helper.Observer; 3564 Observer.changingInstr(MI); 3565 3566 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3567 3568 // The remaining operands were used to set fields in the MemOperand on 3569 // construction. 3570 for (int I = 6; I > 3; --I) 3571 MI.RemoveOperand(I); 3572 3573 MI.RemoveOperand(1); // Remove the intrinsic ID. 
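  // After dropping those operands, the instruction should be left in the
  // (dst, ptr, value) form expected by the atomic pseudo selected above.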
3574 Observer.changedInstr(MI); 3575 return true; 3576 } 3577 3578 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3579 MachineRegisterInfo &MRI, 3580 MachineIRBuilder &B) const { 3581 uint64_t Offset = 3582 ST.getTargetLowering()->getImplicitParameterOffset( 3583 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3584 LLT DstTy = MRI.getType(DstReg); 3585 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3586 3587 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3588 if (!loadInputValue(KernargPtrReg, B, 3589 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3590 return false; 3591 3592 // FIXME: This should be nuw 3593 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3594 return true; 3595 } 3596 3597 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3598 MachineRegisterInfo &MRI, 3599 MachineIRBuilder &B) const { 3600 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3601 if (!MFI->isEntryFunction()) { 3602 return legalizePreloadedArgIntrin(MI, MRI, B, 3603 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3604 } 3605 3606 Register DstReg = MI.getOperand(0).getReg(); 3607 if (!getImplicitArgPtr(DstReg, MRI, B)) 3608 return false; 3609 3610 MI.eraseFromParent(); 3611 return true; 3612 } 3613 3614 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3615 MachineRegisterInfo &MRI, 3616 MachineIRBuilder &B, 3617 unsigned AddrSpace) const { 3618 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3619 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3620 Register Hi32 = Unmerge.getReg(1); 3621 3622 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3623 MI.eraseFromParent(); 3624 return true; 3625 } 3626 3627 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3628 // offset (the offset that is included in bounds checking and swizzling, to be 3629 // split between the instruction's voffset and immoffset fields) and soffset 3630 // (the offset that is excluded from bounds checking and swizzling, to go in 3631 // the instruction's soffset field). This function takes the first kind of 3632 // offset and figures out how to split it between voffset and immoffset. 3633 std::tuple<Register, unsigned, unsigned> 3634 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3635 Register OrigOffset) const { 3636 const unsigned MaxImm = 4095; 3637 Register BaseReg; 3638 unsigned TotalConstOffset; 3639 const LLT S32 = LLT::scalar(32); 3640 MachineRegisterInfo &MRI = *B.getMRI(); 3641 3642 std::tie(BaseReg, TotalConstOffset) = 3643 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3644 3645 unsigned ImmOffset = TotalConstOffset; 3646 3647 // If BaseReg is a pointer, convert it to int. 3648 if (MRI.getType(BaseReg).isPointer()) 3649 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3650 3651 // If the immediate value is too big for the immoffset field, put the value 3652 // and -4096 into the immoffset field so that the value that is copied/added 3653 // for the voffset field is a multiple of 4096, and it stands more chance 3654 // of being CSEd with the copy/add for another similar load/store. 3655 // However, do not do that rounding down to a multiple of 4096 if that is a 3656 // negative number, as it appears to be illegal to have a negative offset 3657 // in the vgpr, even if adding the immediate offset makes it positive. 
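  // For example, a total constant offset of 8212 is split into Overflow = 8192
  // (folded into the voffset register below) and ImmOffset = 20.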
3658 unsigned Overflow = ImmOffset & ~MaxImm; 3659 ImmOffset -= Overflow; 3660 if ((int32_t)Overflow < 0) { 3661 Overflow += ImmOffset; 3662 ImmOffset = 0; 3663 } 3664 3665 if (Overflow != 0) { 3666 if (!BaseReg) { 3667 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3668 } else { 3669 auto OverflowVal = B.buildConstant(S32, Overflow); 3670 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3671 } 3672 } 3673 3674 if (!BaseReg) 3675 BaseReg = B.buildConstant(S32, 0).getReg(0); 3676 3677 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3678 } 3679 3680 /// Handle register layout difference for f16 images for some subtargets. 3681 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3682 MachineRegisterInfo &MRI, 3683 Register Reg, 3684 bool ImageStore) const { 3685 const LLT S16 = LLT::scalar(16); 3686 const LLT S32 = LLT::scalar(32); 3687 LLT StoreVT = MRI.getType(Reg); 3688 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3689 3690 if (ST.hasUnpackedD16VMem()) { 3691 auto Unmerge = B.buildUnmerge(S16, Reg); 3692 3693 SmallVector<Register, 4> WideRegs; 3694 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3695 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3696 3697 int NumElts = StoreVT.getNumElements(); 3698 3699 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3700 .getReg(0); 3701 } 3702 3703 if (ImageStore && ST.hasImageStoreD16Bug()) { 3704 if (StoreVT.getNumElements() == 2) { 3705 SmallVector<Register, 4> PackedRegs; 3706 Reg = B.buildBitcast(S32, Reg).getReg(0); 3707 PackedRegs.push_back(Reg); 3708 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3709 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3710 .getReg(0); 3711 } 3712 3713 if (StoreVT.getNumElements() == 3) { 3714 SmallVector<Register, 4> PackedRegs; 3715 auto Unmerge = B.buildUnmerge(S16, Reg); 3716 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3717 PackedRegs.push_back(Unmerge.getReg(I)); 3718 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3719 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3720 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3721 } 3722 3723 if (StoreVT.getNumElements() == 4) { 3724 SmallVector<Register, 4> PackedRegs; 3725 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3726 auto Unmerge = B.buildUnmerge(S32, Reg); 3727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3728 PackedRegs.push_back(Unmerge.getReg(I)); 3729 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3730 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3731 .getReg(0); 3732 } 3733 3734 llvm_unreachable("invalid data type"); 3735 } 3736 3737 return Reg; 3738 } 3739 3740 Register AMDGPULegalizerInfo::fixStoreSourceType( 3741 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3742 MachineRegisterInfo *MRI = B.getMRI(); 3743 LLT Ty = MRI->getType(VData); 3744 3745 const LLT S16 = LLT::scalar(16); 3746 3747 // Fixup illegal register types for i8 stores. 
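  // 8 and 16-bit stores only use the low bits of a 32-bit register, so
  // any-extending the value to s32 is sufficient here.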
3748 if (Ty == LLT::scalar(8) || Ty == S16) { 3749 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3750 return AnyExt; 3751 } 3752 3753 if (Ty.isVector()) { 3754 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3755 if (IsFormat) 3756 return handleD16VData(B, *MRI, VData); 3757 } 3758 } 3759 3760 return VData; 3761 } 3762 3763 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3764 MachineRegisterInfo &MRI, 3765 MachineIRBuilder &B, 3766 bool IsTyped, 3767 bool IsFormat) const { 3768 Register VData = MI.getOperand(1).getReg(); 3769 LLT Ty = MRI.getType(VData); 3770 LLT EltTy = Ty.getScalarType(); 3771 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3772 const LLT S32 = LLT::scalar(32); 3773 3774 VData = fixStoreSourceType(B, VData, IsFormat); 3775 Register RSrc = MI.getOperand(2).getReg(); 3776 3777 MachineMemOperand *MMO = *MI.memoperands_begin(); 3778 const int MemSize = MMO->getSize(); 3779 3780 unsigned ImmOffset; 3781 unsigned TotalOffset; 3782 3783 // The typed intrinsics add an immediate after the registers. 3784 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3785 3786 // The struct intrinsic variants add one additional operand over raw. 3787 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3788 Register VIndex; 3789 int OpOffset = 0; 3790 if (HasVIndex) { 3791 VIndex = MI.getOperand(3).getReg(); 3792 OpOffset = 1; 3793 } 3794 3795 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3796 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3797 3798 unsigned Format = 0; 3799 if (IsTyped) { 3800 Format = MI.getOperand(5 + OpOffset).getImm(); 3801 ++OpOffset; 3802 } 3803 3804 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3805 3806 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3807 if (TotalOffset != 0) 3808 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3809 3810 unsigned Opc; 3811 if (IsTyped) { 3812 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3813 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3814 } else if (IsFormat) { 3815 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3816 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3817 } else { 3818 switch (MemSize) { 3819 case 1: 3820 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3821 break; 3822 case 2: 3823 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3824 break; 3825 default: 3826 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3827 break; 3828 } 3829 } 3830 3831 if (!VIndex) 3832 VIndex = B.buildConstant(S32, 0).getReg(0); 3833 3834 auto MIB = B.buildInstr(Opc) 3835 .addUse(VData) // vdata 3836 .addUse(RSrc) // rsrc 3837 .addUse(VIndex) // vindex 3838 .addUse(VOffset) // voffset 3839 .addUse(SOffset) // soffset 3840 .addImm(ImmOffset); // offset(imm) 3841 3842 if (IsTyped) 3843 MIB.addImm(Format); 3844 3845 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3846 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3847 .addMemOperand(MMO); 3848 3849 MI.eraseFromParent(); 3850 return true; 3851 } 3852 3853 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3854 MachineRegisterInfo &MRI, 3855 MachineIRBuilder &B, 3856 bool IsFormat, 3857 bool IsTyped) const { 3858 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
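  // The size of this memory operand selects between the byte, short, and wider
  // load opcodes below.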
3859 MachineMemOperand *MMO = *MI.memoperands_begin(); 3860 const int MemSize = MMO->getSize(); 3861 const LLT S32 = LLT::scalar(32); 3862 3863 Register Dst = MI.getOperand(0).getReg(); 3864 Register RSrc = MI.getOperand(2).getReg(); 3865 3866 // The typed intrinsics add an immediate after the registers. 3867 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3868 3869 // The struct intrinsic variants add one additional operand over raw. 3870 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3871 Register VIndex; 3872 int OpOffset = 0; 3873 if (HasVIndex) { 3874 VIndex = MI.getOperand(3).getReg(); 3875 OpOffset = 1; 3876 } 3877 3878 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3879 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3880 3881 unsigned Format = 0; 3882 if (IsTyped) { 3883 Format = MI.getOperand(5 + OpOffset).getImm(); 3884 ++OpOffset; 3885 } 3886 3887 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3888 unsigned ImmOffset; 3889 unsigned TotalOffset; 3890 3891 LLT Ty = MRI.getType(Dst); 3892 LLT EltTy = Ty.getScalarType(); 3893 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3894 const bool Unpacked = ST.hasUnpackedD16VMem(); 3895 3896 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3897 if (TotalOffset != 0) 3898 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3899 3900 unsigned Opc; 3901 3902 if (IsTyped) { 3903 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3904 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3905 } else if (IsFormat) { 3906 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3907 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3908 } else { 3909 switch (MemSize) { 3910 case 1: 3911 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3912 break; 3913 case 2: 3914 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3915 break; 3916 default: 3917 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3918 break; 3919 } 3920 } 3921 3922 Register LoadDstReg; 3923 3924 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3925 LLT UnpackedTy = Ty.changeElementSize(32); 3926 3927 if (IsExtLoad) 3928 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3929 else if (Unpacked && IsD16 && Ty.isVector()) 3930 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3931 else 3932 LoadDstReg = Dst; 3933 3934 if (!VIndex) 3935 VIndex = B.buildConstant(S32, 0).getReg(0); 3936 3937 auto MIB = B.buildInstr(Opc) 3938 .addDef(LoadDstReg) // vdata 3939 .addUse(RSrc) // rsrc 3940 .addUse(VIndex) // vindex 3941 .addUse(VOffset) // voffset 3942 .addUse(SOffset) // soffset 3943 .addImm(ImmOffset); // offset(imm) 3944 3945 if (IsTyped) 3946 MIB.addImm(Format); 3947 3948 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3949 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3950 .addMemOperand(MMO); 3951 3952 if (LoadDstReg != Dst) { 3953 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3954 3955 // Widen result for extending loads was widened. 
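    // That is, the load was emitted into a wider temporary register, so
    // truncate or repack it into the original result type here.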
3956 if (IsExtLoad) 3957 B.buildTrunc(Dst, LoadDstReg); 3958 else { 3959 // Repack to original 16-bit vector result 3960 // FIXME: G_TRUNC should work, but legalization currently fails 3961 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3962 SmallVector<Register, 4> Repack; 3963 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3964 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3965 B.buildMerge(Dst, Repack); 3966 } 3967 } 3968 3969 MI.eraseFromParent(); 3970 return true; 3971 } 3972 3973 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3974 MachineIRBuilder &B, 3975 bool IsInc) const { 3976 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3977 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3978 B.buildInstr(Opc) 3979 .addDef(MI.getOperand(0).getReg()) 3980 .addUse(MI.getOperand(2).getReg()) 3981 .addUse(MI.getOperand(3).getReg()) 3982 .cloneMemRefs(MI); 3983 MI.eraseFromParent(); 3984 return true; 3985 } 3986 3987 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3988 switch (IntrID) { 3989 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3990 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3991 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3992 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3993 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3994 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3995 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3996 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3997 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3998 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3999 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4001 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4002 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4003 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4004 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4005 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4006 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4007 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4008 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4009 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4010 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4011 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4012 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4013 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4014 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4015 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4016 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4017 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4018 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4019 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4020 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4021 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4022 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4023 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4024 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4025 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4028 case Intrinsic::amdgcn_buffer_atomic_fadd: 4029 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4031 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4032 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4033 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4034 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4035 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4037 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4038 default: 4039 llvm_unreachable("unhandled atomic opcode"); 4040 } 4041 } 4042 4043 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4044 MachineIRBuilder &B, 4045 Intrinsic::ID IID) const { 4046 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4047 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4048 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4049 4050 Register Dst; 4051 4052 int OpOffset = 0; 4053 if (HasReturn) { 4054 // A few FP atomics do not support return values. 4055 Dst = MI.getOperand(0).getReg(); 4056 } else { 4057 OpOffset = -1; 4058 } 4059 4060 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4061 Register CmpVal; 4062 4063 if (IsCmpSwap) { 4064 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4065 ++OpOffset; 4066 } 4067 4068 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4069 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4070 4071 // The struct intrinsic variants add one additional operand over raw. 4072 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4073 Register VIndex; 4074 if (HasVIndex) { 4075 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4076 ++OpOffset; 4077 } 4078 4079 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4080 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4081 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4082 4083 MachineMemOperand *MMO = *MI.memoperands_begin(); 4084 4085 unsigned ImmOffset; 4086 unsigned TotalOffset; 4087 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4088 if (TotalOffset != 0) 4089 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4090 4091 if (!VIndex) 4092 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4093 4094 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4095 4096 if (HasReturn) 4097 MIB.addDef(Dst); 4098 4099 MIB.addUse(VData); // vdata 4100 4101 if (IsCmpSwap) 4102 MIB.addReg(CmpVal); 4103 4104 MIB.addUse(RSrc) // rsrc 4105 .addUse(VIndex) // vindex 4106 .addUse(VOffset) // voffset 4107 .addUse(SOffset) // soffset 4108 .addImm(ImmOffset) // offset(imm) 4109 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4110 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4111 .addMemOperand(MMO); 4112 4113 MI.eraseFromParent(); 4114 return true; 4115 } 4116 4117 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4118 /// vector with s16 typed elements. 4119 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4120 SmallVectorImpl<Register> &PackedAddrs, 4121 unsigned ArgOffset, 4122 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4123 bool IsA16, bool IsG16) { 4124 const LLT S16 = LLT::scalar(16); 4125 const LLT V2S16 = LLT::fixed_vector(2, 16); 4126 auto EndIdx = Intr->VAddrEnd; 4127 4128 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4129 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4130 if (!SrcOp.isReg()) 4131 continue; // _L to _LZ may have eliminated this. 
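    // The remaining cases below either bitcast a pre-gradient operand to
    // <2 x s16>, pass unpacked gradient/coordinate operands through, or pack
    // pairs of 16-bit components (padding the last odd element with undef).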
4132 4133 Register AddrReg = SrcOp.getReg(); 4134 4135 if (I < Intr->GradientStart) { 4136 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4137 PackedAddrs.push_back(AddrReg); 4138 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4139 (I >= Intr->CoordStart && !IsA16)) { 4140 // Handle any gradient or coordinate operands that should not be packed 4141 PackedAddrs.push_back(AddrReg); 4142 } else { 4143 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4144 // derivatives dx/dh and dx/dv are packed with undef. 4145 if (((I + 1) >= EndIdx) || 4146 ((Intr->NumGradients / 2) % 2 == 1 && 4147 (I == static_cast<unsigned>(Intr->GradientStart + 4148 (Intr->NumGradients / 2) - 1) || 4149 I == static_cast<unsigned>(Intr->GradientStart + 4150 Intr->NumGradients - 1))) || 4151 // Check for _L to _LZ optimization 4152 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4153 PackedAddrs.push_back( 4154 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4155 .getReg(0)); 4156 } else { 4157 PackedAddrs.push_back( 4158 B.buildBuildVector( 4159 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4160 .getReg(0)); 4161 ++I; 4162 } 4163 } 4164 } 4165 } 4166 4167 /// Convert from separate vaddr components to a single vector address register, 4168 /// and replace the remaining operands with $noreg. 4169 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4170 int DimIdx, int NumVAddrs) { 4171 const LLT S32 = LLT::scalar(32); 4172 4173 SmallVector<Register, 8> AddrRegs; 4174 for (int I = 0; I != NumVAddrs; ++I) { 4175 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4176 if (SrcOp.isReg()) { 4177 AddrRegs.push_back(SrcOp.getReg()); 4178 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4179 } 4180 } 4181 4182 int NumAddrRegs = AddrRegs.size(); 4183 if (NumAddrRegs != 1) { 4184 // Round up to 8 elements for v5-v7 4185 // FIXME: Missing intermediate sized register classes and instructions. 4186 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4187 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4188 auto Undef = B.buildUndef(S32); 4189 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4190 NumAddrRegs = RoundedNumRegs; 4191 } 4192 4193 auto VAddr = 4194 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4195 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4196 } 4197 4198 for (int I = 1; I != NumVAddrs; ++I) { 4199 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4200 if (SrcOp.isReg()) 4201 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4202 } 4203 } 4204 4205 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4206 /// 4207 /// Depending on the subtarget, load/store with 16-bit element data need to be 4208 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4209 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4210 /// registers. 4211 /// 4212 /// We don't want to directly select image instructions just yet, but also want 4213 /// to exposes all register repacking to the legalizer/combiners. We also don't 4214 /// want a selected instrution entering RegBankSelect. In order to avoid 4215 /// defining a multitude of intermediate image instructions, directly hack on 4216 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4217 /// now unnecessary arguments with $noreg. 
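///
/// The main rewrites performed here are packing 16-bit addresses and data into
/// dword-sized registers, folding the _L-to-_LZ and lod-zero _mip
/// optimizations, and converting TFE loads to return one wide register that is
/// unmerged into the data and status results afterwards.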
4218 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4219 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4220 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4221 4222 const unsigned NumDefs = MI.getNumExplicitDefs(); 4223 const unsigned ArgOffset = NumDefs + 1; 4224 bool IsTFE = NumDefs == 2; 4225 // We are only processing the operands of d16 image operations on subtargets 4226 // that use the unpacked register layout, or need to repack the TFE result. 4227 4228 // TODO: Do we need to guard against already legalized intrinsics? 4229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4230 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4231 4232 MachineRegisterInfo *MRI = B.getMRI(); 4233 const LLT S32 = LLT::scalar(32); 4234 const LLT S16 = LLT::scalar(16); 4235 const LLT V2S16 = LLT::fixed_vector(2, 16); 4236 4237 unsigned DMask = 0; 4238 4239 // Check for 16 bit addresses and pack if true. 4240 LLT GradTy = 4241 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4242 LLT AddrTy = 4243 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4244 const bool IsG16 = GradTy == S16; 4245 const bool IsA16 = AddrTy == S16; 4246 4247 int DMaskLanes = 0; 4248 if (!BaseOpcode->Atomic) { 4249 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4250 if (BaseOpcode->Gather4) { 4251 DMaskLanes = 4; 4252 } else if (DMask != 0) { 4253 DMaskLanes = countPopulation(DMask); 4254 } else if (!IsTFE && !BaseOpcode->Store) { 4255 // If dmask is 0, this is a no-op load. This can be eliminated. 4256 B.buildUndef(MI.getOperand(0)); 4257 MI.eraseFromParent(); 4258 return true; 4259 } 4260 } 4261 4262 Observer.changingInstr(MI); 4263 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4264 4265 unsigned NewOpcode = NumDefs == 0 ? 4266 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4267 4268 // Track that we legalized this 4269 MI.setDesc(B.getTII().get(NewOpcode)); 4270 4271 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4272 // dmask to be at least 1 otherwise the instruction will fail 4273 if (IsTFE && DMask == 0) { 4274 DMask = 0x1; 4275 DMaskLanes = 1; 4276 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4277 } 4278 4279 if (BaseOpcode->Atomic) { 4280 Register VData0 = MI.getOperand(2).getReg(); 4281 LLT Ty = MRI->getType(VData0); 4282 4283 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4284 if (Ty.isVector()) 4285 return false; 4286 4287 if (BaseOpcode->AtomicX2) { 4288 Register VData1 = MI.getOperand(3).getReg(); 4289 // The two values are packed in one register. 4290 LLT PackedTy = LLT::fixed_vector(2, Ty); 4291 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4292 MI.getOperand(2).setReg(Concat.getReg(0)); 4293 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4294 } 4295 } 4296 4297 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4298 4299 // Optimize _L to _LZ when _L is zero 4300 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4301 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4302 const ConstantFP *ConstantLod; 4303 4304 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4305 m_GFCst(ConstantLod))) { 4306 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4307 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
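        // A constant LOD that is zero or negative behaves like LOD 0, so the
        // _l form is equivalent to _lz and the explicit LOD operand can be
        // dropped.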
4308 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4309 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4310 Intr->Dim); 4311 4312 // The starting indexes should remain in the same place. 4313 --CorrectedNumVAddrs; 4314 4315 MI.getOperand(MI.getNumExplicitDefs()) 4316 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4317 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4318 Intr = NewImageDimIntr; 4319 } 4320 } 4321 } 4322 4323 // Optimize _mip away, when 'lod' is zero 4324 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4325 int64_t ConstantLod; 4326 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4327 m_ICst(ConstantLod))) { 4328 if (ConstantLod == 0) { 4329 // TODO: Change intrinsic opcode and remove operand instead or replacing 4330 // it with 0, as the _L to _LZ handling is done above. 4331 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4332 --CorrectedNumVAddrs; 4333 } 4334 } 4335 } 4336 4337 // Rewrite the addressing register layout before doing anything else. 4338 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4339 // 16 bit gradients are supported, but are tied to the A16 control 4340 // so both gradients and addresses must be 16 bit 4341 return false; 4342 } 4343 4344 if (IsA16 && !ST.hasA16()) { 4345 // A16 not supported 4346 return false; 4347 } 4348 4349 if (IsA16 || IsG16) { 4350 if (Intr->NumVAddrs > 1) { 4351 SmallVector<Register, 4> PackedRegs; 4352 4353 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4354 IsG16); 4355 4356 // See also below in the non-a16 branch 4357 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4358 4359 if (!UseNSA && PackedRegs.size() > 1) { 4360 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4361 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4362 PackedRegs[0] = Concat.getReg(0); 4363 PackedRegs.resize(1); 4364 } 4365 4366 const unsigned NumPacked = PackedRegs.size(); 4367 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4368 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4369 if (!SrcOp.isReg()) { 4370 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4371 continue; 4372 } 4373 4374 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4375 4376 if (I - Intr->VAddrStart < NumPacked) 4377 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4378 else 4379 SrcOp.setReg(AMDGPU::NoRegister); 4380 } 4381 } 4382 } else { 4383 // If the register allocator cannot place the address registers contiguously 4384 // without introducing moves, then using the non-sequential address encoding 4385 // is always preferable, since it saves VALU instructions and is usually a 4386 // wash in terms of code size or even better. 4387 // 4388 // However, we currently have no way of hinting to the register allocator 4389 // that MIMG addresses should be placed contiguously when it is possible to 4390 // do so, so force non-NSA for the common 2-address case as a heuristic. 4391 // 4392 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4393 // allocation when possible. 
4394 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4395 4396 if (!UseNSA && Intr->NumVAddrs > 1) 4397 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4398 Intr->NumVAddrs); 4399 } 4400 4401 int Flags = 0; 4402 if (IsA16) 4403 Flags |= 1; 4404 if (IsG16) 4405 Flags |= 2; 4406 MI.addOperand(MachineOperand::CreateImm(Flags)); 4407 4408 if (BaseOpcode->Store) { // No TFE for stores? 4409 // TODO: Handle dmask trim 4410 Register VData = MI.getOperand(1).getReg(); 4411 LLT Ty = MRI->getType(VData); 4412 if (!Ty.isVector() || Ty.getElementType() != S16) 4413 return true; 4414 4415 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4416 if (RepackedReg != VData) { 4417 MI.getOperand(1).setReg(RepackedReg); 4418 } 4419 4420 return true; 4421 } 4422 4423 Register DstReg = MI.getOperand(0).getReg(); 4424 LLT Ty = MRI->getType(DstReg); 4425 const LLT EltTy = Ty.getScalarType(); 4426 const bool IsD16 = Ty.getScalarType() == S16; 4427 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4428 4429 // Confirm that the return type is large enough for the dmask specified 4430 if (NumElts < DMaskLanes) 4431 return false; 4432 4433 if (NumElts > 4 || DMaskLanes > 4) 4434 return false; 4435 4436 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4437 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4438 4439 // The raw dword aligned data component of the load. The only legal cases 4440 // where this matters should be when using the packed D16 format, for 4441 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4442 LLT RoundedTy; 4443 4444 // S32 vector to to cover all data, plus TFE result element. 4445 LLT TFETy; 4446 4447 // Register type to use for each loaded component. Will be S32 or V2S16. 4448 LLT RegTy; 4449 4450 if (IsD16 && ST.hasUnpackedD16VMem()) { 4451 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4452 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4453 RegTy = S32; 4454 } else { 4455 unsigned EltSize = EltTy.getSizeInBits(); 4456 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4457 unsigned RoundedSize = 32 * RoundedElts; 4458 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4459 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4460 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4461 } 4462 4463 // The return type does not need adjustment. 4464 // TODO: Should we change s16 case to s32 or <2 x s16>? 4465 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4466 return true; 4467 4468 Register Dst1Reg; 4469 4470 // Insert after the instruction. 4471 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4472 4473 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4474 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4475 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4476 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4477 4478 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4479 4480 MI.getOperand(0).setReg(NewResultReg); 4481 4482 // In the IR, TFE is supposed to be used with a 2 element struct return 4483 // type. The intruction really returns these two values in one contiguous 4484 // register, with one additional dword beyond the loaded data. Rewrite the 4485 // return type to use a single register result. 
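  // The status (TFE) def is recovered from NewResultReg by the unmerge emitted
  // below.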
4486 4487 if (IsTFE) { 4488 Dst1Reg = MI.getOperand(1).getReg(); 4489 if (MRI->getType(Dst1Reg) != S32) 4490 return false; 4491 4492 // TODO: Make sure the TFE operand bit is set. 4493 MI.RemoveOperand(1); 4494 4495 // Handle the easy case that requires no repack instructions. 4496 if (Ty == S32) { 4497 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4498 return true; 4499 } 4500 } 4501 4502 // Now figure out how to copy the new result register back into the old 4503 // result. 4504 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4505 4506 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4507 4508 if (ResultNumRegs == 1) { 4509 assert(!IsTFE); 4510 ResultRegs[0] = NewResultReg; 4511 } else { 4512 // We have to repack into a new vector of some kind. 4513 for (int I = 0; I != NumDataRegs; ++I) 4514 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4515 B.buildUnmerge(ResultRegs, NewResultReg); 4516 4517 // Drop the final TFE element to get the data part. The TFE result is 4518 // directly written to the right place already. 4519 if (IsTFE) 4520 ResultRegs.resize(NumDataRegs); 4521 } 4522 4523 // For an s16 scalar result, we form an s32 result with a truncate regardless 4524 // of packed vs. unpacked. 4525 if (IsD16 && !Ty.isVector()) { 4526 B.buildTrunc(DstReg, ResultRegs[0]); 4527 return true; 4528 } 4529 4530 // Avoid a build/concat_vector of 1 entry. 4531 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4532 B.buildBitcast(DstReg, ResultRegs[0]); 4533 return true; 4534 } 4535 4536 assert(Ty.isVector()); 4537 4538 if (IsD16) { 4539 // For packed D16 results with TFE enabled, all the data components are 4540 // S32. Cast back to the expected type. 4541 // 4542 // TODO: We don't really need to use load s32 elements. We would only need one 4543 // cast for the TFE result if a multiple of v2s16 was used. 4544 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4545 for (Register &Reg : ResultRegs) 4546 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4547 } else if (ST.hasUnpackedD16VMem()) { 4548 for (Register &Reg : ResultRegs) 4549 Reg = B.buildTrunc(S16, Reg).getReg(0); 4550 } 4551 } 4552 4553 auto padWithUndef = [&](LLT Ty, int NumElts) { 4554 if (NumElts == 0) 4555 return; 4556 Register Undef = B.buildUndef(Ty).getReg(0); 4557 for (int I = 0; I != NumElts; ++I) 4558 ResultRegs.push_back(Undef); 4559 }; 4560 4561 // Pad out any elements eliminated due to the dmask. 4562 LLT ResTy = MRI->getType(ResultRegs[0]); 4563 if (!ResTy.isVector()) { 4564 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4565 B.buildBuildVector(DstReg, ResultRegs); 4566 return true; 4567 } 4568 4569 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4570 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4571 4572 // Deal with the one annoying legal case. 
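  // <3 x s16> cannot be concatenated directly from <2 x s16> pieces, so build
  // a <6 x s16> concat and unmerge it into the result plus a dead <3 x s16>.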
4573 const LLT V3S16 = LLT::fixed_vector(3, 16); 4574 if (Ty == V3S16) { 4575 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4576 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4577 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4578 return true; 4579 } 4580 4581 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4582 B.buildConcatVectors(DstReg, ResultRegs); 4583 return true; 4584 } 4585 4586 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4587 LegalizerHelper &Helper, MachineInstr &MI) const { 4588 MachineIRBuilder &B = Helper.MIRBuilder; 4589 GISelChangeObserver &Observer = Helper.Observer; 4590 4591 Register Dst = MI.getOperand(0).getReg(); 4592 LLT Ty = B.getMRI()->getType(Dst); 4593 unsigned Size = Ty.getSizeInBits(); 4594 MachineFunction &MF = B.getMF(); 4595 4596 Observer.changingInstr(MI); 4597 4598 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4599 Ty = getBitcastRegisterType(Ty); 4600 Helper.bitcastDst(MI, Ty, 0); 4601 Dst = MI.getOperand(0).getReg(); 4602 B.setInsertPt(B.getMBB(), MI); 4603 } 4604 4605 // FIXME: We don't really need this intermediate instruction. The intrinsic 4606 // should be fixed to have a memory operand. Since it's readnone, we're not 4607 // allowed to add one. 4608 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4609 MI.RemoveOperand(1); // Remove intrinsic ID 4610 4611 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4612 // TODO: Should this use datalayout alignment? 4613 const unsigned MemSize = (Size + 7) / 8; 4614 const Align MemAlign(4); 4615 MachineMemOperand *MMO = MF.getMachineMemOperand( 4616 MachinePointerInfo(), 4617 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4618 MachineMemOperand::MOInvariant, 4619 MemSize, MemAlign); 4620 MI.addMemOperand(MF, MMO); 4621 4622 // There are no 96-bit result scalar loads, but widening to 128-bit should 4623 // always be legal. We may need to restore this to a 96-bit result if it turns 4624 // out this needs to be converted to a vector load during RegBankSelect. 4625 if (!isPowerOf2_32(Size)) { 4626 if (Ty.isVector()) 4627 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4628 else 4629 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4630 } 4631 4632 Observer.changedInstr(MI); 4633 return true; 4634 } 4635 4636 // TODO: Move to selection 4637 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4638 MachineRegisterInfo &MRI, 4639 MachineIRBuilder &B) const { 4640 if (!ST.isTrapHandlerEnabled() || 4641 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4642 return legalizeTrapEndpgm(MI, MRI, B); 4643 4644 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4645 switch (*HsaAbiVer) { 4646 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4647 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4648 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4649 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4650 return ST.supportsGetDoorbellID() ? 
4651 legalizeTrapHsa(MI, MRI, B) : 4652 legalizeTrapHsaQueuePtr(MI, MRI, B); 4653 } 4654 } 4655 4656 llvm_unreachable("Unknown trap handler"); 4657 } 4658 4659 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4660 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4661 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4662 MI.eraseFromParent(); 4663 return true; 4664 } 4665 4666 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4667 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4668 // Pass queue pointer to trap handler as input, and insert trap instruction 4669 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4670 Register LiveIn = 4671 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4672 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4673 return false; 4674 4675 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4676 B.buildCopy(SGPR01, LiveIn); 4677 B.buildInstr(AMDGPU::S_TRAP) 4678 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4679 .addReg(SGPR01, RegState::Implicit); 4680 4681 MI.eraseFromParent(); 4682 return true; 4683 } 4684 4685 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4686 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4687 B.buildInstr(AMDGPU::S_TRAP) 4688 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4689 MI.eraseFromParent(); 4690 return true; 4691 } 4692 4693 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4694 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4695 // Is non-HSA path or trap-handler disabled? then, report a warning 4696 // accordingly 4697 if (!ST.isTrapHandlerEnabled() || 4698 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4699 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4700 "debugtrap handler not supported", 4701 MI.getDebugLoc(), DS_Warning); 4702 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4703 Ctx.diagnose(NoTrap); 4704 } else { 4705 // Insert debug-trap instruction 4706 B.buildInstr(AMDGPU::S_TRAP) 4707 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4708 } 4709 4710 MI.eraseFromParent(); 4711 return true; 4712 } 4713 4714 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4715 MachineIRBuilder &B) const { 4716 MachineRegisterInfo &MRI = *B.getMRI(); 4717 const LLT S16 = LLT::scalar(16); 4718 const LLT S32 = LLT::scalar(32); 4719 4720 Register DstReg = MI.getOperand(0).getReg(); 4721 Register NodePtr = MI.getOperand(2).getReg(); 4722 Register RayExtent = MI.getOperand(3).getReg(); 4723 Register RayOrigin = MI.getOperand(4).getReg(); 4724 Register RayDir = MI.getOperand(5).getReg(); 4725 Register RayInvDir = MI.getOperand(6).getReg(); 4726 Register TDescr = MI.getOperand(7).getReg(); 4727 4728 if (!ST.hasGFX10_AEncoding()) { 4729 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4730 "intrinsic not supported on subtarget", 4731 MI.getDebugLoc()); 4732 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4733 return false; 4734 } 4735 4736 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4737 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4738 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4739 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4740 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4741 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4742 4743 SmallVector<Register, 12> Ops; 4744 if (Is64) { 4745 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4746 Ops.push_back(Unmerge.getReg(0)); 4747 Ops.push_back(Unmerge.getReg(1)); 4748 } else { 4749 Ops.push_back(NodePtr); 4750 } 4751 Ops.push_back(RayExtent); 4752 4753 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4754 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4755 Ops.push_back(Unmerge.getReg(0)); 4756 Ops.push_back(Unmerge.getReg(1)); 4757 Ops.push_back(Unmerge.getReg(2)); 4758 }; 4759 4760 packLanes(RayOrigin); 4761 if (IsA16) { 4762 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4763 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4764 Register R1 = MRI.createGenericVirtualRegister(S32); 4765 Register R2 = MRI.createGenericVirtualRegister(S32); 4766 Register R3 = MRI.createGenericVirtualRegister(S32); 4767 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4768 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4769 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4770 Ops.push_back(R1); 4771 Ops.push_back(R2); 4772 Ops.push_back(R3); 4773 } else { 4774 packLanes(RayDir); 4775 packLanes(RayInvDir); 4776 } 4777 4778 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4779 .addDef(DstReg) 4780 .addImm(Opcode); 4781 4782 for (Register R : Ops) { 4783 MIB.addUse(R); 4784 } 4785 4786 MIB.addUse(TDescr) 4787 .addImm(IsA16 ? 1 : 0) 4788 .cloneMemRefs(MI); 4789 4790 MI.eraseFromParent(); 4791 return true; 4792 } 4793 4794 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4795 MachineInstr &MI) const { 4796 MachineIRBuilder &B = Helper.MIRBuilder; 4797 MachineRegisterInfo &MRI = *B.getMRI(); 4798 4799 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4800 auto IntrID = MI.getIntrinsicID(); 4801 switch (IntrID) { 4802 case Intrinsic::amdgcn_if: 4803 case Intrinsic::amdgcn_else: { 4804 MachineInstr *Br = nullptr; 4805 MachineBasicBlock *UncondBrTarget = nullptr; 4806 bool Negated = false; 4807 if (MachineInstr *BrCond = 4808 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4809 const SIRegisterInfo *TRI 4810 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4811 4812 Register Def = MI.getOperand(1).getReg(); 4813 Register Use = MI.getOperand(3).getReg(); 4814 4815 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4816 4817 if (Negated) 4818 std::swap(CondBrTarget, UncondBrTarget); 4819 4820 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4821 if (IntrID == Intrinsic::amdgcn_if) { 4822 B.buildInstr(AMDGPU::SI_IF) 4823 .addDef(Def) 4824 .addUse(Use) 4825 .addMBB(UncondBrTarget); 4826 } else { 4827 B.buildInstr(AMDGPU::SI_ELSE) 4828 .addDef(Def) 4829 .addUse(Use) 4830 .addMBB(UncondBrTarget); 4831 } 4832 4833 if (Br) { 4834 Br->getOperand(0).setMBB(CondBrTarget); 4835 } else { 4836 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4837 // since we're swapping branch targets it needs to be reinserted. 
4838 // FIXME: IRTranslator should probably not do this 4839 B.buildBr(*CondBrTarget); 4840 } 4841 4842 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4843 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4844 MI.eraseFromParent(); 4845 BrCond->eraseFromParent(); 4846 return true; 4847 } 4848 4849 return false; 4850 } 4851 case Intrinsic::amdgcn_loop: { 4852 MachineInstr *Br = nullptr; 4853 MachineBasicBlock *UncondBrTarget = nullptr; 4854 bool Negated = false; 4855 if (MachineInstr *BrCond = 4856 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4857 const SIRegisterInfo *TRI 4858 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4859 4860 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4861 Register Reg = MI.getOperand(2).getReg(); 4862 4863 if (Negated) 4864 std::swap(CondBrTarget, UncondBrTarget); 4865 4866 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4867 B.buildInstr(AMDGPU::SI_LOOP) 4868 .addUse(Reg) 4869 .addMBB(UncondBrTarget); 4870 4871 if (Br) 4872 Br->getOperand(0).setMBB(CondBrTarget); 4873 else 4874 B.buildBr(*CondBrTarget); 4875 4876 MI.eraseFromParent(); 4877 BrCond->eraseFromParent(); 4878 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4879 return true; 4880 } 4881 4882 return false; 4883 } 4884 case Intrinsic::amdgcn_kernarg_segment_ptr: 4885 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4886 // This only makes sense to call in a kernel, so just lower to null. 4887 B.buildConstant(MI.getOperand(0).getReg(), 0); 4888 MI.eraseFromParent(); 4889 return true; 4890 } 4891 4892 return legalizePreloadedArgIntrin( 4893 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4894 case Intrinsic::amdgcn_implicitarg_ptr: 4895 return legalizeImplicitArgPtr(MI, MRI, B); 4896 case Intrinsic::amdgcn_workitem_id_x: 4897 return legalizePreloadedArgIntrin(MI, MRI, B, 4898 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4899 case Intrinsic::amdgcn_workitem_id_y: 4900 return legalizePreloadedArgIntrin(MI, MRI, B, 4901 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4902 case Intrinsic::amdgcn_workitem_id_z: 4903 return legalizePreloadedArgIntrin(MI, MRI, B, 4904 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4905 case Intrinsic::amdgcn_workgroup_id_x: 4906 return legalizePreloadedArgIntrin(MI, MRI, B, 4907 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4908 case Intrinsic::amdgcn_workgroup_id_y: 4909 return legalizePreloadedArgIntrin(MI, MRI, B, 4910 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4911 case Intrinsic::amdgcn_workgroup_id_z: 4912 return legalizePreloadedArgIntrin(MI, MRI, B, 4913 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4914 case Intrinsic::amdgcn_dispatch_ptr: 4915 return legalizePreloadedArgIntrin(MI, MRI, B, 4916 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4917 case Intrinsic::amdgcn_queue_ptr: 4918 return legalizePreloadedArgIntrin(MI, MRI, B, 4919 AMDGPUFunctionArgInfo::QUEUE_PTR); 4920 case Intrinsic::amdgcn_implicit_buffer_ptr: 4921 return legalizePreloadedArgIntrin( 4922 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4923 case Intrinsic::amdgcn_dispatch_id: 4924 return legalizePreloadedArgIntrin(MI, MRI, B, 4925 AMDGPUFunctionArgInfo::DISPATCH_ID); 4926 case Intrinsic::amdgcn_fdiv_fast: 4927 return legalizeFDIVFastIntrin(MI, MRI, B); 4928 case Intrinsic::amdgcn_is_shared: 4929 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4930 case Intrinsic::amdgcn_is_private: 4931 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4932 case Intrinsic::amdgcn_wavefrontsize: { 4933 
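    // The wavefront size is a compile-time subtarget constant, so simply
    // materialize it.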
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
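  // The wavefront size is a subtarget constant, so the intrinsic folds
  // directly to an immediate (e.g. 32 in wave32 mode, 64 in wave64 mode).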
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}