//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
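/// For example, <3 x s16> (48 bits) matches this predicate and is widened to
/// <4 x s16>, while <3 x s32> does not, since its total size is already a
/// multiple of 32 bits.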
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
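/// For example, on a subtarget without 96-bit (dwordx3) memory operations, a
/// 96-bit load known to be 128-bit aligned may be widened to a 128-bit load,
/// provided the target reports the resulting access as fast.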
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
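  // The custom action below defers that decision to legalizeFMad().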
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
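      // For example, a 160-bit access (5 dwords) reaching this point is split
      // rather than handled whole.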
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
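      // For example, a suitably aligned 96-bit load may be widened to a
      // 128-bit load here (see shouldWidenLoad above) rather than being split
      // into 64-bit and 32-bit pieces.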
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
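                // For example, a <3 x s32> (96-bit) result is reduced to
                // <2 x s32> first; the remaining element is re-legalized.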
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
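  // (Presumably 4 bits for 16-bit shifts, 5 bits for 32-bit shifts and 6 bits
  // for 64-bit shifts.)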
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
        1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return
legalizeUDIV_UREM(MI, MRI, B); 1706 case TargetOpcode::G_SDIV: 1707 case TargetOpcode::G_SREM: 1708 return legalizeSDIV_SREM(MI, MRI, B); 1709 case TargetOpcode::G_ATOMIC_CMPXCHG: 1710 return legalizeAtomicCmpXChg(MI, MRI, B); 1711 case TargetOpcode::G_FLOG: 1712 return legalizeFlog(MI, B, numbers::ln2f); 1713 case TargetOpcode::G_FLOG10: 1714 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1715 case TargetOpcode::G_FEXP: 1716 return legalizeFExp(MI, B); 1717 case TargetOpcode::G_FPOW: 1718 return legalizeFPow(MI, B); 1719 case TargetOpcode::G_FFLOOR: 1720 return legalizeFFloor(MI, MRI, B); 1721 case TargetOpcode::G_BUILD_VECTOR: 1722 return legalizeBuildVector(MI, MRI, B); 1723 default: 1724 return false; 1725 } 1726 1727 llvm_unreachable("expected switch to return"); 1728 } 1729 1730 Register AMDGPULegalizerInfo::getSegmentAperture( 1731 unsigned AS, 1732 MachineRegisterInfo &MRI, 1733 MachineIRBuilder &B) const { 1734 MachineFunction &MF = B.getMF(); 1735 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1736 const LLT S32 = LLT::scalar(32); 1737 1738 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1739 1740 if (ST.hasApertureRegs()) { 1741 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1742 // getreg. 1743 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1744 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1745 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1746 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1747 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1748 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1749 unsigned Encoding = 1750 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1751 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1752 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1753 1754 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1755 1756 B.buildInstr(AMDGPU::S_GETREG_B32) 1757 .addDef(GetReg) 1758 .addImm(Encoding); 1759 MRI.setType(GetReg, S32); 1760 1761 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1762 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1763 } 1764 1765 Register QueuePtr = MRI.createGenericVirtualRegister( 1766 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1767 1768 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1769 return Register(); 1770 1771 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1772 // private_segment_aperture_base_hi. 1773 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1774 1775 // TODO: can we be smarter about machine pointer info? 
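// Sketch of the sequence built below (assuming the queue pointer argument is
// available; otherwise we bail out above):
//   %queue_ptr:p4 = <QUEUE_PTR live-in>
//   %addr         = G_PTR_ADD %queue_ptr, 0x40 (LDS) or 0x44 (private)
//   %aperture:s32 = G_LOAD %addr   ; 4-byte dereferenceable, invariant load
// The loaded value is the high 32 bits of the 64-bit segment base.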
1776 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1777 MachineMemOperand *MMO = MF.getMachineMemOperand( 1778 PtrInfo, 1779 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1780 MachineMemOperand::MOInvariant, 1781 4, commonAlignment(Align(64), StructOffset)); 1782 1783 Register LoadAddr; 1784 1785 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1786 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1787 } 1788 1789 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1790 MachineInstr &MI, MachineRegisterInfo &MRI, 1791 MachineIRBuilder &B) const { 1792 MachineFunction &MF = B.getMF(); 1793 1794 const LLT S32 = LLT::scalar(32); 1795 Register Dst = MI.getOperand(0).getReg(); 1796 Register Src = MI.getOperand(1).getReg(); 1797 1798 LLT DstTy = MRI.getType(Dst); 1799 LLT SrcTy = MRI.getType(Src); 1800 unsigned DestAS = DstTy.getAddressSpace(); 1801 unsigned SrcAS = SrcTy.getAddressSpace(); 1802 1803 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1804 // vector element. 1805 assert(!DstTy.isVector()); 1806 1807 const AMDGPUTargetMachine &TM 1808 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1809 1810 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1811 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1812 return true; 1813 } 1814 1815 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1816 // Truncate. 1817 B.buildExtract(Dst, Src, 0); 1818 MI.eraseFromParent(); 1819 return true; 1820 } 1821 1822 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1823 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1824 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1825 1826 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1827 // another. Merge operands are required to be the same type, but creating an 1828 // extra ptrtoint would be kind of pointless. 1829 auto HighAddr = B.buildConstant( 1830 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1831 B.buildMerge(Dst, {Src, HighAddr}); 1832 MI.eraseFromParent(); 1833 return true; 1834 } 1835 1836 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1837 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1838 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1839 unsigned NullVal = TM.getNullPointerValue(DestAS); 1840 1841 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1842 auto FlatNull = B.buildConstant(SrcTy, 0); 1843 1844 // Extract low 32-bits of the pointer. 1845 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1846 1847 auto CmpRes = 1848 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1849 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1850 1851 MI.eraseFromParent(); 1852 return true; 1853 } 1854 1855 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1856 return false; 1857 1858 if (!ST.hasFlatAddressSpace()) 1859 return false; 1860 1861 auto SegmentNull = 1862 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1863 auto FlatNull = 1864 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1865 1866 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1867 if (!ApertureReg.isValid()) 1868 return false; 1869 1870 auto CmpRes = 1871 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1872 1873 // Coerce the type of the low half of the result so we can use merge_values. 
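// For example, casting a 32-bit LDS pointer %src to the 64-bit flat address
// space builds approximately:
//   %lo:s32  = G_PTRTOINT %src
//   %flat:p0 = G_MERGE_VALUES %lo, %aperture
//   %dst     = G_SELECT (%src != segment_null), %flat, flat_null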
1874 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1875 1876 // TODO: Should we allow mismatched types but matching sizes in merges to 1877 // avoid the ptrtoint? 1878 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1879 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1880 1881 MI.eraseFromParent(); 1882 return true; 1883 } 1884 1885 bool AMDGPULegalizerInfo::legalizeFrint( 1886 MachineInstr &MI, MachineRegisterInfo &MRI, 1887 MachineIRBuilder &B) const { 1888 Register Src = MI.getOperand(1).getReg(); 1889 LLT Ty = MRI.getType(Src); 1890 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1891 1892 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1893 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1894 1895 auto C1 = B.buildFConstant(Ty, C1Val); 1896 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1897 1898 // TODO: Should this propagate fast-math-flags? 1899 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1900 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1901 1902 auto C2 = B.buildFConstant(Ty, C2Val); 1903 auto Fabs = B.buildFAbs(Ty, Src); 1904 1905 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1906 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFceil( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 1915 const LLT S1 = LLT::scalar(1); 1916 const LLT S64 = LLT::scalar(64); 1917 1918 Register Src = MI.getOperand(1).getReg(); 1919 assert(MRI.getType(Src) == S64); 1920 1921 // result = trunc(src) 1922 // if (src > 0.0 && src != result) 1923 // result += 1.0 1924 1925 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1926 1927 const auto Zero = B.buildFConstant(S64, 0.0); 1928 const auto One = B.buildFConstant(S64, 1.0); 1929 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1930 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1931 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1932 auto Add = B.buildSelect(S64, And, One, Zero); 1933 1934 // TODO: Should this propagate fast-math-flags? 
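// Worked example of this ceil expansion: fceil(1.25) gives trunc = 1.0 and
// (src > 0 && src != trunc) holds, so add = 1.0 and the result is 2.0;
// fceil(-1.25) gives trunc = -1.0, the condition fails, and the result stays
// -1.0.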
1935 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1936 return true; 1937 } 1938 1939 bool AMDGPULegalizerInfo::legalizeFrem( 1940 MachineInstr &MI, MachineRegisterInfo &MRI, 1941 MachineIRBuilder &B) const { 1942 Register DstReg = MI.getOperand(0).getReg(); 1943 Register Src0Reg = MI.getOperand(1).getReg(); 1944 Register Src1Reg = MI.getOperand(2).getReg(); 1945 auto Flags = MI.getFlags(); 1946 LLT Ty = MRI.getType(DstReg); 1947 1948 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1949 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1950 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1951 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1952 MI.eraseFromParent(); 1953 return true; 1954 } 1955 1956 static MachineInstrBuilder extractF64Exponent(Register Hi, 1957 MachineIRBuilder &B) { 1958 const unsigned FractBits = 52; 1959 const unsigned ExpBits = 11; 1960 LLT S32 = LLT::scalar(32); 1961 1962 auto Const0 = B.buildConstant(S32, FractBits - 32); 1963 auto Const1 = B.buildConstant(S32, ExpBits); 1964 1965 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1966 .addUse(Hi) 1967 .addUse(Const0.getReg(0)) 1968 .addUse(Const1.getReg(0)); 1969 1970 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1971 } 1972 1973 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1974 MachineInstr &MI, MachineRegisterInfo &MRI, 1975 MachineIRBuilder &B) const { 1976 const LLT S1 = LLT::scalar(1); 1977 const LLT S32 = LLT::scalar(32); 1978 const LLT S64 = LLT::scalar(64); 1979 1980 Register Src = MI.getOperand(1).getReg(); 1981 assert(MRI.getType(Src) == S64); 1982 1983 // TODO: Should this use extract since the low half is unused? 1984 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1985 Register Hi = Unmerge.getReg(1); 1986 1987 // Extract the upper half, since this is where we will find the sign and 1988 // exponent. 1989 auto Exp = extractF64Exponent(Hi, B); 1990 1991 const unsigned FractBits = 52; 1992 1993 // Extract the sign bit. 1994 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1995 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1996 1997 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1998 1999 const auto Zero32 = B.buildConstant(S32, 0); 2000 2001 // Extend back to 64-bits. 2002 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2003 2004 auto Shr = B.buildAShr(S64, FractMask, Exp); 2005 auto Not = B.buildNot(S64, Shr); 2006 auto Tmp0 = B.buildAnd(S64, Src, Not); 2007 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2008 2009 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2010 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2011 2012 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2013 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2014 MI.eraseFromParent(); 2015 return true; 2016 } 2017 2018 bool AMDGPULegalizerInfo::legalizeITOFP( 2019 MachineInstr &MI, MachineRegisterInfo &MRI, 2020 MachineIRBuilder &B, bool Signed) const { 2021 2022 Register Dst = MI.getOperand(0).getReg(); 2023 Register Src = MI.getOperand(1).getReg(); 2024 2025 const LLT S64 = LLT::scalar(64); 2026 const LLT S32 = LLT::scalar(32); 2027 2028 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2029 2030 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2031 2032 auto CvtHi = Signed ? 
2033 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2034 B.buildUITOFP(S64, Unmerge.getReg(1)); 2035 2036 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2037 2038 auto ThirtyTwo = B.buildConstant(S32, 32); 2039 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2040 .addUse(CvtHi.getReg(0)) 2041 .addUse(ThirtyTwo.getReg(0)); 2042 2043 // TODO: Should this propagate fast-math-flags? 2044 B.buildFAdd(Dst, LdExp, CvtLo); 2045 MI.eraseFromParent(); 2046 return true; 2047 } 2048 2049 // TODO: Copied from DAG implementation. Verify logic and document how this 2050 // actually works. 2051 bool AMDGPULegalizerInfo::legalizeFPTOI( 2052 MachineInstr &MI, MachineRegisterInfo &MRI, 2053 MachineIRBuilder &B, bool Signed) const { 2054 2055 Register Dst = MI.getOperand(0).getReg(); 2056 Register Src = MI.getOperand(1).getReg(); 2057 2058 const LLT S64 = LLT::scalar(64); 2059 const LLT S32 = LLT::scalar(32); 2060 2061 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2062 2063 unsigned Flags = MI.getFlags(); 2064 2065 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2066 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2067 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2068 2069 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2070 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2071 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2072 2073 auto Hi = Signed ? 2074 B.buildFPTOSI(S32, FloorMul) : 2075 B.buildFPTOUI(S32, FloorMul); 2076 auto Lo = B.buildFPTOUI(S32, Fma); 2077 2078 B.buildMerge(Dst, { Lo, Hi }); 2079 MI.eraseFromParent(); 2080 2081 return true; 2082 } 2083 2084 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2085 MachineInstr &MI) const { 2086 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2087 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2088 2089 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2090 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2091 2092 // With ieee_mode disabled, the instructions have the correct behavior 2093 // already for G_FMINNUM/G_FMAXNUM 2094 if (!MFI->getMode().IEEE) 2095 return !IsIEEEOp; 2096 2097 if (IsIEEEOp) 2098 return true; 2099 2100 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2101 } 2102 2103 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2104 MachineInstr &MI, MachineRegisterInfo &MRI, 2105 MachineIRBuilder &B) const { 2106 // TODO: Should move some of this into LegalizerHelper. 2107 2108 // TODO: Promote dynamic indexing of s16 to s32 2109 2110 // FIXME: Artifact combiner probably should have replaced the truncated 2111 // constant before this, so we shouldn't need 2112 // getConstantVRegValWithLookThrough. 2113 Optional<ValueAndVReg> MaybeIdxVal = 2114 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2115 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2116 return true; 2117 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2118 2119 Register Dst = MI.getOperand(0).getReg(); 2120 Register Vec = MI.getOperand(1).getReg(); 2121 2122 LLT VecTy = MRI.getType(Vec); 2123 LLT EltTy = VecTy.getElementType(); 2124 assert(EltTy == MRI.getType(Dst)); 2125 2126 if (IdxVal < VecTy.getNumElements()) 2127 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2128 else 2129 B.buildUndef(Dst); 2130 2131 MI.eraseFromParent(); 2132 return true; 2133 } 2134 2135 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2136 MachineInstr &MI, MachineRegisterInfo &MRI, 2137 MachineIRBuilder &B) const { 2138 // TODO: Should move some of this into LegalizerHelper. 2139 2140 // TODO: Promote dynamic indexing of s16 to s32 2141 2142 // FIXME: Artifact combiner probably should have replaced the truncated 2143 // constant before this, so we shouldn't need 2144 // getConstantVRegValWithLookThrough. 2145 Optional<ValueAndVReg> MaybeIdxVal = 2146 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2147 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2148 return true; 2149 2150 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2151 Register Dst = MI.getOperand(0).getReg(); 2152 Register Vec = MI.getOperand(1).getReg(); 2153 Register Ins = MI.getOperand(2).getReg(); 2154 2155 LLT VecTy = MRI.getType(Vec); 2156 LLT EltTy = VecTy.getElementType(); 2157 assert(EltTy == MRI.getType(Ins)); 2158 2159 if (IdxVal < VecTy.getNumElements()) 2160 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2161 else 2162 B.buildUndef(Dst); 2163 2164 MI.eraseFromParent(); 2165 return true; 2166 } 2167 2168 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2169 MachineInstr &MI, MachineRegisterInfo &MRI, 2170 MachineIRBuilder &B) const { 2171 const LLT V2S16 = LLT::vector(2, 16); 2172 2173 Register Dst = MI.getOperand(0).getReg(); 2174 Register Src0 = MI.getOperand(1).getReg(); 2175 LLT DstTy = MRI.getType(Dst); 2176 LLT SrcTy = MRI.getType(Src0); 2177 2178 if (SrcTy == V2S16 && DstTy == V2S16 && 2179 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2180 return true; 2181 2182 MachineIRBuilder HelperBuilder(MI); 2183 GISelObserverWrapper DummyObserver; 2184 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2185 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2186 } 2187 2188 bool AMDGPULegalizerInfo::legalizeSinCos( 2189 MachineInstr &MI, MachineRegisterInfo &MRI, 2190 MachineIRBuilder &B) const { 2191 2192 Register DstReg = MI.getOperand(0).getReg(); 2193 Register SrcReg = MI.getOperand(1).getReg(); 2194 LLT Ty = MRI.getType(DstReg); 2195 unsigned Flags = MI.getFlags(); 2196 2197 Register TrigVal; 2198 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2199 if (ST.hasTrigReducedRange()) { 2200 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2201 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2202 .addUse(MulVal.getReg(0)) 2203 .setMIFlags(Flags).getReg(0); 2204 } else 2205 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2206 2207 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2208 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2209 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2210 .addUse(TrigVal) 2211 .setMIFlags(Flags); 2212 MI.eraseFromParent(); 2213 return true; 2214 } 2215 2216 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2217 MachineIRBuilder &B, 2218 const GlobalValue *GV, 2219 int64_t Offset, 2220 unsigned GAFlags) const { 2221 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2222 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2223 // to the following code sequence: 2224 // 2225 // For constant address space: 2226 // s_getpc_b64 s[0:1] 2227 // s_add_u32 s0, s0, $symbol 2228 // s_addc_u32 s1, s1, 0 2229 // 2230 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2231 // a fixup or relocation is emitted to replace $symbol with a literal 2232 // constant, which is a pc-relative offset from the encoding of the $symbol 2233 // operand to the global variable. 2234 // 2235 // For global address space: 2236 // s_getpc_b64 s[0:1] 2237 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2238 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2239 // 2240 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2241 // fixups or relocations are emitted to replace $symbol@*@lo and 2242 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2243 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2244 // operand to the global variable. 2245 // 2246 // What we want here is an offset from the value returned by s_getpc 2247 // (which is the address of the s_add_u32 instruction) to the global 2248 // variable, but since the encoding of $symbol starts 4 bytes after the start 2249 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2250 // small. This requires us to add 4 to the global variable offset in order to 2251 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2252 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2253 // instruction. 2254 2255 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2256 2257 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2258 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2259 2260 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2261 .addDef(PCReg); 2262 2263 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2264 if (GAFlags == SIInstrInfo::MO_NONE) 2265 MIB.addImm(0); 2266 else 2267 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2268 2269 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2270 2271 if (PtrTy.getSizeInBits() == 32) 2272 B.buildExtract(DstReg, PCReg, 0); 2273 return true; 2274 } 2275 2276 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2277 MachineInstr &MI, MachineRegisterInfo &MRI, 2278 MachineIRBuilder &B) const { 2279 Register DstReg = MI.getOperand(0).getReg(); 2280 LLT Ty = MRI.getType(DstReg); 2281 unsigned AS = Ty.getAddressSpace(); 2282 2283 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2284 MachineFunction &MF = B.getMF(); 2285 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2286 2287 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2288 if (!MFI->isModuleEntryFunction()) { 2289 const Function &Fn = MF.getFunction(); 2290 DiagnosticInfoUnsupported BadLDSDecl( 2291 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2292 DS_Warning); 2293 Fn.getContext().diagnose(BadLDSDecl); 2294 2295 // We currently don't have a way to correctly allocate LDS objects that 2296 // aren't directly associated with a kernel. We do force inlining of 2297 // functions that use local objects. However, if these dead functions are 2298 // not eliminated, we don't want a compile time error. Just emit a warning 2299 // and a trap, since there should be no callable path here. 2300 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2301 B.buildUndef(DstReg); 2302 MI.eraseFromParent(); 2303 return true; 2304 } 2305 2306 // TODO: We could emit code to handle the initialization somewhere. 2307 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2308 const SITargetLowering *TLI = ST.getTargetLowering(); 2309 if (!TLI->shouldUseLDSConstAddress(GV)) { 2310 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2311 return true; // Leave in place; 2312 } 2313 2314 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2315 Type *Ty = GV->getValueType(); 2316 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2317 // zero-sized type in other languages to declare the dynamic shared 2318 // memory which size is not known at the compile time. They will be 2319 // allocated by the runtime and placed directly after the static 2320 // allocated ones. They all share the same offset. 2321 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2322 // Adjust alignment for that dynamic shared memory array. 
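// The dynamic array lives directly after all statically allocated LDS, so its
// address is simply the total static LDS size; that is what the
// amdgcn.groupstaticsize intrinsic below evaluates to, converted back to an
// LDS pointer.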
2323 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2324 LLT S32 = LLT::scalar(32); 2325 auto Sz = 2326 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2327 B.buildIntToPtr(DstReg, Sz); 2328 MI.eraseFromParent(); 2329 return true; 2330 } 2331 } 2332 2333 B.buildConstant( 2334 DstReg, 2335 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2336 MI.eraseFromParent(); 2337 return true; 2338 } 2339 2340 const Function &Fn = MF.getFunction(); 2341 DiagnosticInfoUnsupported BadInit( 2342 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2343 Fn.getContext().diagnose(BadInit); 2344 return true; 2345 } 2346 2347 const SITargetLowering *TLI = ST.getTargetLowering(); 2348 2349 if (TLI->shouldEmitFixup(GV)) { 2350 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 if (TLI->shouldEmitPCReloc(GV)) { 2356 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2357 MI.eraseFromParent(); 2358 return true; 2359 } 2360 2361 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2362 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2363 2364 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2365 MachinePointerInfo::getGOT(MF), 2366 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2367 MachineMemOperand::MOInvariant, 2368 8 /*Size*/, Align(8)); 2369 2370 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2371 2372 if (Ty.getSizeInBits() == 32) { 2373 // Truncate if this is a 32-bit constant adrdess. 2374 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2375 B.buildExtract(DstReg, Load, 0); 2376 } else 2377 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2378 2379 MI.eraseFromParent(); 2380 return true; 2381 } 2382 2383 static LLT widenToNextPowerOf2(LLT Ty) { 2384 if (Ty.isVector()) 2385 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2386 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2387 } 2388 2389 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2390 MachineInstr &MI) const { 2391 MachineIRBuilder &B = Helper.MIRBuilder; 2392 MachineRegisterInfo &MRI = *B.getMRI(); 2393 GISelChangeObserver &Observer = Helper.Observer; 2394 2395 Register PtrReg = MI.getOperand(1).getReg(); 2396 LLT PtrTy = MRI.getType(PtrReg); 2397 unsigned AddrSpace = PtrTy.getAddressSpace(); 2398 2399 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2400 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2401 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2402 Observer.changingInstr(MI); 2403 MI.getOperand(1).setReg(Cast.getReg(0)); 2404 Observer.changedInstr(MI); 2405 return true; 2406 } 2407 2408 Register ValReg = MI.getOperand(0).getReg(); 2409 LLT ValTy = MRI.getType(ValReg); 2410 2411 MachineMemOperand *MMO = *MI.memoperands_begin(); 2412 const unsigned ValSize = ValTy.getSizeInBits(); 2413 const unsigned MemSize = 8 * MMO->getSize(); 2414 const Align MemAlign = MMO->getAlign(); 2415 const unsigned AlignInBits = 8 * MemAlign.value(); 2416 2417 // Widen non-power-of-2 loads to the alignment if needed 2418 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2419 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2420 2421 // This was already the correct extending load result type, so just adjust 2422 // the memory type. 
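// For example, an any-extending load with an s128 result but a 12-byte (s96)
// memory access: WideMemSize == ValSize == 128, so only the size recorded in
// the MMO needs to grow to 16 bytes.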
2423 if (WideMemSize == ValSize) { 2424 MachineFunction &MF = B.getMF(); 2425 2426 MachineMemOperand *WideMMO = 2427 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2428 Observer.changingInstr(MI); 2429 MI.setMemRefs(MF, {WideMMO}); 2430 Observer.changedInstr(MI); 2431 return true; 2432 } 2433 2434 // Don't bother handling edge case that should probably never be produced. 2435 if (ValSize > WideMemSize) 2436 return false; 2437 2438 LLT WideTy = widenToNextPowerOf2(ValTy); 2439 2440 Register WideLoad; 2441 if (!WideTy.isVector()) { 2442 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2443 B.buildTrunc(ValReg, WideLoad).getReg(0); 2444 } else { 2445 // Extract the subvector. 2446 2447 if (isRegisterType(ValTy)) { 2448 // If this a case where G_EXTRACT is legal, use it. 2449 // (e.g. <3 x s32> -> <4 x s32>) 2450 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2451 B.buildExtract(ValReg, WideLoad, 0); 2452 } else { 2453 // For cases where the widened type isn't a nice register value, unmerge 2454 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2455 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2456 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2457 B.setInsertPt(B.getMBB(), MI.getIterator()); 2458 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2459 } 2460 } 2461 2462 MI.eraseFromParent(); 2463 return true; 2464 } 2465 2466 return false; 2467 } 2468 2469 bool AMDGPULegalizerInfo::legalizeFMad( 2470 MachineInstr &MI, MachineRegisterInfo &MRI, 2471 MachineIRBuilder &B) const { 2472 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2473 assert(Ty.isScalar()); 2474 2475 MachineFunction &MF = B.getMF(); 2476 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2477 2478 // TODO: Always legal with future ftz flag. 2479 // FIXME: Do we need just output? 
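// The checks below keep G_FMAD legal only when the corresponding denormal
// mode (FP32, or FP64/FP16 for the s16 case) is not fully enabled; otherwise
// it is expanded by LegalizerHelper::lowerFMad into separate multiply and add
// operations.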
2480 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2481 return true; 2482 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2483 return true; 2484 2485 MachineIRBuilder HelperBuilder(MI); 2486 GISelObserverWrapper DummyObserver; 2487 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2488 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2489 } 2490 2491 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2492 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2493 Register DstReg = MI.getOperand(0).getReg(); 2494 Register PtrReg = MI.getOperand(1).getReg(); 2495 Register CmpVal = MI.getOperand(2).getReg(); 2496 Register NewVal = MI.getOperand(3).getReg(); 2497 2498 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2499 "this should not have been custom lowered"); 2500 2501 LLT ValTy = MRI.getType(CmpVal); 2502 LLT VecTy = LLT::vector(2, ValTy); 2503 2504 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2505 2506 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2507 .addDef(DstReg) 2508 .addUse(PtrReg) 2509 .addUse(PackedVal) 2510 .setMemRefs(MI.memoperands()); 2511 2512 MI.eraseFromParent(); 2513 return true; 2514 } 2515 2516 bool AMDGPULegalizerInfo::legalizeFlog( 2517 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2518 Register Dst = MI.getOperand(0).getReg(); 2519 Register Src = MI.getOperand(1).getReg(); 2520 LLT Ty = B.getMRI()->getType(Dst); 2521 unsigned Flags = MI.getFlags(); 2522 2523 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2524 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2525 2526 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2527 MI.eraseFromParent(); 2528 return true; 2529 } 2530 2531 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2532 MachineIRBuilder &B) const { 2533 Register Dst = MI.getOperand(0).getReg(); 2534 Register Src = MI.getOperand(1).getReg(); 2535 unsigned Flags = MI.getFlags(); 2536 LLT Ty = B.getMRI()->getType(Dst); 2537 2538 auto K = B.buildFConstant(Ty, numbers::log2e); 2539 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2540 B.buildFExp2(Dst, Mul, Flags); 2541 MI.eraseFromParent(); 2542 return true; 2543 } 2544 2545 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2546 MachineIRBuilder &B) const { 2547 Register Dst = MI.getOperand(0).getReg(); 2548 Register Src0 = MI.getOperand(1).getReg(); 2549 Register Src1 = MI.getOperand(2).getReg(); 2550 unsigned Flags = MI.getFlags(); 2551 LLT Ty = B.getMRI()->getType(Dst); 2552 const LLT S16 = LLT::scalar(16); 2553 const LLT S32 = LLT::scalar(32); 2554 2555 if (Ty == S32) { 2556 auto Log = B.buildFLog2(S32, Src0, Flags); 2557 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2558 .addUse(Log.getReg(0)) 2559 .addUse(Src1) 2560 .setMIFlags(Flags); 2561 B.buildFExp2(Dst, Mul, Flags); 2562 } else if (Ty == S16) { 2563 // There's no f16 fmul_legacy, so we need to convert for it. 
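// i.e. for f16 the code below computes
//   pow(x, y) = exp2(fptrunc(fmul_legacy(fpext(log2(x)), fpext(y))))
// keeping log2/exp2 in f16 while doing the legacy multiply at f32.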
2564 auto Log = B.buildFLog2(S16, Src0, Flags); 2565 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2566 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2567 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2568 .addUse(Ext0.getReg(0)) 2569 .addUse(Ext1.getReg(0)) 2570 .setMIFlags(Flags); 2571 2572 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2573 } else 2574 return false; 2575 2576 MI.eraseFromParent(); 2577 return true; 2578 } 2579 2580 // Find a source register, ignoring any possible source modifiers. 2581 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2582 Register ModSrc = OrigSrc; 2583 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2584 ModSrc = SrcFNeg->getOperand(1).getReg(); 2585 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2586 ModSrc = SrcFAbs->getOperand(1).getReg(); 2587 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2588 ModSrc = SrcFAbs->getOperand(1).getReg(); 2589 return ModSrc; 2590 } 2591 2592 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2593 MachineRegisterInfo &MRI, 2594 MachineIRBuilder &B) const { 2595 2596 const LLT S1 = LLT::scalar(1); 2597 const LLT S64 = LLT::scalar(64); 2598 Register Dst = MI.getOperand(0).getReg(); 2599 Register OrigSrc = MI.getOperand(1).getReg(); 2600 unsigned Flags = MI.getFlags(); 2601 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2602 "this should not have been custom lowered"); 2603 2604 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2605 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2606 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2607 // V_FRACT bug is: 2608 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2609 // 2610 // Convert floor(x) to (x - fract(x)) 2611 2612 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2613 .addUse(OrigSrc) 2614 .setMIFlags(Flags); 2615 2616 // Give source modifier matching some assistance before obscuring a foldable 2617 // pattern. 2618 2619 // TODO: We can avoid the neg on the fract? The input sign to fract 2620 // shouldn't matter? 2621 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2622 2623 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2624 2625 Register Min = MRI.createGenericVirtualRegister(S64); 2626 2627 // We don't need to concern ourselves with the snan handling difference, so 2628 // use the one which will directly select. 2629 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2630 if (MFI->getMode().IEEE) 2631 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2632 else 2633 B.buildFMinNum(Min, Fract, Const, Flags); 2634 2635 Register CorrectedFract = Min; 2636 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2637 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2638 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2639 } 2640 2641 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2642 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2643 2644 MI.eraseFromParent(); 2645 return true; 2646 } 2647 2648 // Turn an illegal packed v2s16 build vector into bit operations. 2649 // TODO: This should probably be a bitcast action in LegalizerHelper. 
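// For example:
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)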
2650 bool AMDGPULegalizerInfo::legalizeBuildVector( 2651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2652 Register Dst = MI.getOperand(0).getReg(); 2653 const LLT S32 = LLT::scalar(32); 2654 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2655 2656 Register Src0 = MI.getOperand(1).getReg(); 2657 Register Src1 = MI.getOperand(2).getReg(); 2658 assert(MRI.getType(Src0) == LLT::scalar(16)); 2659 2660 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2661 B.buildBitcast(Dst, Merge); 2662 2663 MI.eraseFromParent(); 2664 return true; 2665 } 2666 2667 // Check that this is a G_XOR x, -1 2668 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2669 if (MI.getOpcode() != TargetOpcode::G_XOR) 2670 return false; 2671 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2672 return ConstVal && *ConstVal == -1; 2673 } 2674 2675 // Return the use branch instruction, otherwise null if the usage is invalid. 2676 static MachineInstr * 2677 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2678 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2679 Register CondDef = MI.getOperand(0).getReg(); 2680 if (!MRI.hasOneNonDBGUse(CondDef)) 2681 return nullptr; 2682 2683 MachineBasicBlock *Parent = MI.getParent(); 2684 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2685 2686 if (isNot(MRI, *UseMI)) { 2687 Register NegatedCond = UseMI->getOperand(0).getReg(); 2688 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2689 return nullptr; 2690 2691 // We're deleting the def of this value, so we need to remove it. 2692 UseMI->eraseFromParent(); 2693 2694 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2695 Negated = true; 2696 } 2697 2698 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2699 return nullptr; 2700 2701 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2702 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2703 if (Next == Parent->end()) { 2704 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2705 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2706 return nullptr; 2707 UncondBrTarget = &*NextMBB; 2708 } else { 2709 if (Next->getOpcode() != AMDGPU::G_BR) 2710 return nullptr; 2711 Br = &*Next; 2712 UncondBrTarget = Br->getOperand(0).getMBB(); 2713 } 2714 2715 return UseMI; 2716 } 2717 2718 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2719 const ArgDescriptor *Arg, 2720 const TargetRegisterClass *ArgRC, 2721 LLT ArgTy) const { 2722 MCRegister SrcReg = Arg->getRegister(); 2723 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2724 assert(DstReg.isVirtual() && "Virtual register expected"); 2725 2726 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2727 ArgTy); 2728 if (Arg->isMasked()) { 2729 // TODO: Should we try to emit this once in the entry block? 
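// Illustrative example (mask value chosen arbitrarily): Mask = 0xFFC00
// occupies bits [19:10], so Shift = countTrailingZeros(Mask) = 10 and the
// value is recovered as (LiveIn >> 10) & (Mask >> 10), i.e.
// (LiveIn >> 10) & 0x3FF.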
2730 const LLT S32 = LLT::scalar(32); 2731 const unsigned Mask = Arg->getMask(); 2732 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2733 2734 Register AndMaskSrc = LiveIn; 2735 2736 if (Shift != 0) { 2737 auto ShiftAmt = B.buildConstant(S32, Shift); 2738 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2739 } 2740 2741 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2742 } else { 2743 B.buildCopy(DstReg, LiveIn); 2744 } 2745 2746 return true; 2747 } 2748 2749 bool AMDGPULegalizerInfo::loadInputValue( 2750 Register DstReg, MachineIRBuilder &B, 2751 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2752 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2753 const ArgDescriptor *Arg; 2754 const TargetRegisterClass *ArgRC; 2755 LLT ArgTy; 2756 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2757 2758 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2759 return false; // TODO: Handle these 2760 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2761 } 2762 2763 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2764 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2765 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2766 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2767 return false; 2768 2769 MI.eraseFromParent(); 2770 return true; 2771 } 2772 2773 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2774 MachineRegisterInfo &MRI, 2775 MachineIRBuilder &B) const { 2776 Register Dst = MI.getOperand(0).getReg(); 2777 LLT DstTy = MRI.getType(Dst); 2778 LLT S16 = LLT::scalar(16); 2779 LLT S32 = LLT::scalar(32); 2780 LLT S64 = LLT::scalar(64); 2781 2782 if (DstTy == S16) 2783 return legalizeFDIV16(MI, MRI, B); 2784 if (DstTy == S32) 2785 return legalizeFDIV32(MI, MRI, B); 2786 if (DstTy == S64) 2787 return legalizeFDIV64(MI, MRI, B); 2788 2789 return false; 2790 } 2791 2792 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2793 Register DstReg, 2794 Register X, 2795 Register Y, 2796 bool IsDiv) const { 2797 const LLT S1 = LLT::scalar(1); 2798 const LLT S32 = LLT::scalar(32); 2799 2800 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2801 // algorithm used here. 2802 2803 // Initial estimate of inv(y). 2804 auto FloatY = B.buildUITOFP(S32, Y); 2805 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2806 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2807 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2808 auto Z = B.buildFPTOUI(S32, ScaledY); 2809 2810 // One round of UNR. 2811 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2812 auto NegYZ = B.buildMul(S32, NegY, Z); 2813 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2814 2815 // Quotient/remainder estimate. 2816 auto Q = B.buildUMulH(S32, X, Z); 2817 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2818 2819 // First quotient/remainder refinement. 2820 auto One = B.buildConstant(S32, 1); 2821 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2822 if (IsDiv) 2823 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2824 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2825 2826 // Second quotient/remainder refinement. 
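// As in the first refinement: if the remainder is still >= Y, bump the
// quotient by one / subtract Y once more, this time writing the final result
// into DstReg.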
2827 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2828 if (IsDiv) 2829 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2830 else 2831 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2832 } 2833 2834 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2835 // 2836 // Return lo, hi of result 2837 // 2838 // %cvt.lo = G_UITOFP Val.lo 2839 // %cvt.hi = G_UITOFP Val.hi 2840 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2841 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2842 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2843 // %mul2 = G_FMUL %mul1, 2**(-32) 2844 // %trunc = G_INTRINSIC_TRUNC %mul2 2845 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2846 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2847 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2848 Register Val) { 2849 const LLT S32 = LLT::scalar(32); 2850 auto Unmerge = B.buildUnmerge(S32, Val); 2851 2852 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2853 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2854 2855 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2856 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2857 2858 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2859 auto Mul1 = 2860 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2861 2862 // 2**(-32) 2863 auto Mul2 = 2864 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2865 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2866 2867 // -(2**32) 2868 auto Mad2 = B.buildFMAD(S32, Trunc, 2869 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2870 2871 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2872 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2873 2874 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2875 } 2876 2877 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2878 Register DstReg, 2879 Register Numer, 2880 Register Denom, 2881 bool IsDiv) const { 2882 const LLT S32 = LLT::scalar(32); 2883 const LLT S64 = LLT::scalar(64); 2884 const LLT S1 = LLT::scalar(1); 2885 Register RcpLo, RcpHi; 2886 2887 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2888 2889 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2890 2891 auto Zero64 = B.buildConstant(S64, 0); 2892 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2893 2894 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2895 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2896 2897 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2898 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2899 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2900 2901 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2902 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2903 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2904 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2905 2906 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2907 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2908 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2909 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2910 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2911 2912 auto Zero32 = B.buildConstant(S32, 0); 2913 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2914 auto Add2_HiC = 2915 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2916 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2917 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2918 2919 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2920 Register NumerLo = UnmergeNumer.getReg(0); 2921 Register NumerHi = 
UnmergeNumer.getReg(1); 2922 2923 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2924 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2925 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2926 Register Mul3_Lo = UnmergeMul3.getReg(0); 2927 Register Mul3_Hi = UnmergeMul3.getReg(1); 2928 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2929 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2930 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2931 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2932 2933 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2934 Register DenomLo = UnmergeDenom.getReg(0); 2935 Register DenomHi = UnmergeDenom.getReg(1); 2936 2937 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2938 auto C1 = B.buildSExt(S32, CmpHi); 2939 2940 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2941 auto C2 = B.buildSExt(S32, CmpLo); 2942 2943 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2944 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2945 2946 // TODO: Here and below portions of the code can be enclosed into if/endif. 2947 // Currently control flow is unconditional and we have 4 selects after 2948 // potential endif to substitute PHIs. 2949 2950 // if C3 != 0 ... 2951 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2952 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2953 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2954 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2955 2956 auto One64 = B.buildConstant(S64, 1); 2957 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2958 2959 auto C4 = 2960 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2961 auto C5 = 2962 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2963 auto C6 = B.buildSelect( 2964 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2965 2966 // if (C6 != 0) 2967 auto Add4 = B.buildAdd(S64, Add3, One64); 2968 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2969 2970 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2971 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2972 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2973 2974 // endif C6 2975 // endif C3 2976 2977 if (IsDiv) { 2978 auto Sel1 = B.buildSelect( 2979 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2980 B.buildSelect(DstReg, 2981 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2982 } else { 2983 auto Sel2 = B.buildSelect( 2984 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2985 B.buildSelect(DstReg, 2986 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2987 } 2988 } 2989 2990 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2991 MachineRegisterInfo &MRI, 2992 MachineIRBuilder &B) const { 2993 const LLT S64 = LLT::scalar(64); 2994 const LLT S32 = LLT::scalar(32); 2995 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2996 Register DstReg = MI.getOperand(0).getReg(); 2997 Register Num = MI.getOperand(1).getReg(); 2998 Register Den = MI.getOperand(2).getReg(); 2999 LLT Ty = MRI.getType(DstReg); 3000 3001 if (Ty == S32) 3002 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3003 else if (Ty == S64) 3004 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3005 else 3006 return false; 3007 3008 MI.eraseFromParent(); 3009 return true; 3010 3011 } 3012 3013 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 
3014 MachineRegisterInfo &MRI, 3015 MachineIRBuilder &B) const { 3016 const LLT S64 = LLT::scalar(64); 3017 const LLT S32 = LLT::scalar(32); 3018 3019 Register DstReg = MI.getOperand(0).getReg(); 3020 const LLT Ty = MRI.getType(DstReg); 3021 if (Ty != S32 && Ty != S64) 3022 return false; 3023 3024 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3025 3026 Register LHS = MI.getOperand(1).getReg(); 3027 Register RHS = MI.getOperand(2).getReg(); 3028 3029 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3030 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3031 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3032 3033 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3034 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3035 3036 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3037 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3038 3039 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3040 if (Ty == S32) 3041 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3042 else 3043 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3044 3045 Register Sign; 3046 if (IsDiv) 3047 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3048 else 3049 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3050 3051 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3052 B.buildSub(DstReg, UDivRem, Sign); 3053 3054 MI.eraseFromParent(); 3055 return true; 3056 } 3057 3058 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3059 MachineRegisterInfo &MRI, 3060 MachineIRBuilder &B) const { 3061 Register Res = MI.getOperand(0).getReg(); 3062 Register LHS = MI.getOperand(1).getReg(); 3063 Register RHS = MI.getOperand(2).getReg(); 3064 uint16_t Flags = MI.getFlags(); 3065 LLT ResTy = MRI.getType(Res); 3066 3067 const MachineFunction &MF = B.getMF(); 3068 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3069 MI.getFlag(MachineInstr::FmAfn); 3070 3071 if (!AllowInaccurateRcp) 3072 return false; 3073 3074 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3075 // 1 / x -> RCP(x) 3076 if (CLHS->isExactlyValue(1.0)) { 3077 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3078 .addUse(RHS) 3079 .setMIFlags(Flags); 3080 3081 MI.eraseFromParent(); 3082 return true; 3083 } 3084 3085 // -1 / x -> RCP( FNEG(x) ) 3086 if (CLHS->isExactlyValue(-1.0)) { 3087 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3088 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3089 .addUse(FNeg.getReg(0)) 3090 .setMIFlags(Flags); 3091 3092 MI.eraseFromParent(); 3093 return true; 3094 } 3095 } 3096 3097 // x / y -> x * (1.0 / y) 3098 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3099 .addUse(RHS) 3100 .setMIFlags(Flags); 3101 B.buildFMul(Res, LHS, RCP, Flags); 3102 3103 MI.eraseFromParent(); 3104 return true; 3105 } 3106 3107 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3108 MachineRegisterInfo &MRI, 3109 MachineIRBuilder &B) const { 3110 Register Res = MI.getOperand(0).getReg(); 3111 Register X = MI.getOperand(1).getReg(); 3112 Register Y = MI.getOperand(2).getReg(); 3113 uint16_t Flags = MI.getFlags(); 3114 LLT ResTy = MRI.getType(Res); 3115 3116 const MachineFunction &MF = B.getMF(); 3117 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3118 MI.getFlag(MachineInstr::FmAfn); 3119 3120 if (!AllowInaccurateRcp) 3121 return false; 3122 3123 auto NegY = B.buildFNeg(ResTy, Y); 3124 auto One = B.buildFConstant(ResTy, 1.0); 3125 3126 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3127 .addUse(Y) 3128 
.setMIFlags(Flags); 3129 3130 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3131 R = B.buildFMA(ResTy, Tmp0, R, R); 3132 3133 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3134 R = B.buildFMA(ResTy, Tmp1, R, R); 3135 3136 auto Ret = B.buildFMul(ResTy, X, R); 3137 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3138 3139 B.buildFMA(Res, Tmp2, R, Ret); 3140 MI.eraseFromParent(); 3141 return true; 3142 } 3143 3144 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3145 MachineRegisterInfo &MRI, 3146 MachineIRBuilder &B) const { 3147 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3148 return true; 3149 3150 Register Res = MI.getOperand(0).getReg(); 3151 Register LHS = MI.getOperand(1).getReg(); 3152 Register RHS = MI.getOperand(2).getReg(); 3153 3154 uint16_t Flags = MI.getFlags(); 3155 3156 LLT S16 = LLT::scalar(16); 3157 LLT S32 = LLT::scalar(32); 3158 3159 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3160 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3161 3162 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3163 .addUse(RHSExt.getReg(0)) 3164 .setMIFlags(Flags); 3165 3166 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3167 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3168 3169 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3170 .addUse(RDst.getReg(0)) 3171 .addUse(RHS) 3172 .addUse(LHS) 3173 .setMIFlags(Flags); 3174 3175 MI.eraseFromParent(); 3176 return true; 3177 } 3178 3179 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3180 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3181 static void toggleSPDenormMode(bool Enable, 3182 MachineIRBuilder &B, 3183 const GCNSubtarget &ST, 3184 AMDGPU::SIModeRegisterDefaults Mode) { 3185 // Set SP denorm mode to this value. 3186 unsigned SPDenormMode = 3187 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3188 3189 if (ST.hasDenormModeInst()) { 3190 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3191 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3192 3193 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3194 B.buildInstr(AMDGPU::S_DENORM_MODE) 3195 .addImm(NewDenormModeValue); 3196 3197 } else { 3198 // Select FP32 bit field in mode register. 
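// i.e. program hwreg(HW_REG_MODE, 4, 2): a 2-bit field at bit offset 4 of the
// MODE register, which holds the FP32 denormal controls.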
3199 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3200 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3201 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3202 3203 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3204 .addImm(SPDenormMode) 3205 .addImm(SPDenormModeBitField); 3206 } 3207 } 3208 3209 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3210 MachineRegisterInfo &MRI, 3211 MachineIRBuilder &B) const { 3212 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3213 return true; 3214 3215 Register Res = MI.getOperand(0).getReg(); 3216 Register LHS = MI.getOperand(1).getReg(); 3217 Register RHS = MI.getOperand(2).getReg(); 3218 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3219 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3220 3221 uint16_t Flags = MI.getFlags(); 3222 3223 LLT S32 = LLT::scalar(32); 3224 LLT S1 = LLT::scalar(1); 3225 3226 auto One = B.buildFConstant(S32, 1.0f); 3227 3228 auto DenominatorScaled = 3229 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3230 .addUse(LHS) 3231 .addUse(RHS) 3232 .addImm(0) 3233 .setMIFlags(Flags); 3234 auto NumeratorScaled = 3235 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3236 .addUse(LHS) 3237 .addUse(RHS) 3238 .addImm(1) 3239 .setMIFlags(Flags); 3240 3241 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3242 .addUse(DenominatorScaled.getReg(0)) 3243 .setMIFlags(Flags); 3244 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3245 3246 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3247 // aren't modeled as reading it. 3248 if (!Mode.allFP32Denormals()) 3249 toggleSPDenormMode(true, B, ST, Mode); 3250 3251 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3252 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3253 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3254 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3255 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3256 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3257 3258 if (!Mode.allFP32Denormals()) 3259 toggleSPDenormMode(false, B, ST, Mode); 3260 3261 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3262 .addUse(Fma4.getReg(0)) 3263 .addUse(Fma1.getReg(0)) 3264 .addUse(Fma3.getReg(0)) 3265 .addUse(NumeratorScaled.getReg(1)) 3266 .setMIFlags(Flags); 3267 3268 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3269 .addUse(Fmas.getReg(0)) 3270 .addUse(RHS) 3271 .addUse(LHS) 3272 .setMIFlags(Flags); 3273 3274 MI.eraseFromParent(); 3275 return true; 3276 } 3277 3278 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3279 MachineRegisterInfo &MRI, 3280 MachineIRBuilder &B) const { 3281 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3282 return true; 3283 3284 Register Res = MI.getOperand(0).getReg(); 3285 Register LHS = MI.getOperand(1).getReg(); 3286 Register RHS = MI.getOperand(2).getReg(); 3287 3288 uint16_t Flags = MI.getFlags(); 3289 3290 LLT S64 = LLT::scalar(64); 3291 LLT S1 = LLT::scalar(1); 3292 3293 auto One = B.buildFConstant(S64, 1.0); 3294 3295 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3296 .addUse(LHS) 3297 .addUse(RHS) 3298 .addImm(0) 3299 .setMIFlags(Flags); 3300 3301 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3302 3303 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3304 .addUse(DivScale0.getReg(0)) 3305 .setMIFlags(Flags); 3306 3307 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3308 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3309 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3310 3311 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3312 .addUse(LHS) 3313 .addUse(RHS) 3314 .addImm(1) 3315 .setMIFlags(Flags); 3316 3317 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3318 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3319 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3320 3321 Register Scale; 3322 if (!ST.hasUsableDivScaleConditionOutput()) { 3323 // Workaround a hardware bug on SI where the condition output from div_scale 3324 // is not usable. 3325 3326 LLT S32 = LLT::scalar(32); 3327 3328 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3329 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3330 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3331 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3332 3333 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3334 Scale1Unmerge.getReg(1)); 3335 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3336 Scale0Unmerge.getReg(1)); 3337 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3338 } else { 3339 Scale = DivScale1.getReg(1); 3340 } 3341 3342 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3343 .addUse(Fma4.getReg(0)) 3344 .addUse(Fma3.getReg(0)) 3345 .addUse(Mul.getReg(0)) 3346 .addUse(Scale) 3347 .setMIFlags(Flags); 3348 3349 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3350 .addUse(Fmas.getReg(0)) 3351 .addUse(RHS) 3352 .addUse(LHS) 3353 .setMIFlags(Flags); 3354 3355 MI.eraseFromParent(); 3356 return true; 3357 } 3358 3359 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3360 MachineRegisterInfo &MRI, 3361 MachineIRBuilder &B) const { 3362 Register Res = MI.getOperand(0).getReg(); 3363 Register LHS = MI.getOperand(2).getReg(); 3364 Register RHS = MI.getOperand(3).getReg(); 3365 uint16_t Flags = MI.getFlags(); 3366 3367 LLT S32 = LLT::scalar(32); 3368 LLT S1 = LLT::scalar(1); 3369 3370 auto Abs = B.buildFAbs(S32, RHS, Flags); 3371 const APFloat C0Val(1.0f); 3372 3373 auto C0 = B.buildConstant(S32, 0x6f800000); 3374 auto C1 = B.buildConstant(S32, 0x2f800000); 3375 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3376 3377 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3378 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3379 3380 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3381 3382 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3383 .addUse(Mul0.getReg(0)) 3384 .setMIFlags(Flags); 3385 3386 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3387 3388 B.buildFMul(Res, Sel, Mul1, Flags); 3389 3390 MI.eraseFromParent(); 3391 return true; 3392 } 3393 3394 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3395 // FIXME: Why do we handle this one but not other removed instructions? 3396 // 3397 // Reciprocal square root. The clamp prevents infinite results, clamping 3398 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3399 // +-max_float. 
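//
// Illustrative sketch of the expansion built below (pseudocode, not exact MIR):
//   rsq_clamp(x) ~= fmaxnum(fminnum(rsq(x), +max_float), -max_float)
// using the IEEE min/max variants when the function runs in IEEE mode. Targets
// before VOLCANIC_ISLANDS still have the instruction, so nothing is expanded.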
3400 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3401 MachineRegisterInfo &MRI, 3402 MachineIRBuilder &B) const { 3403 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3404 return true; 3405 3406 Register Dst = MI.getOperand(0).getReg(); 3407 Register Src = MI.getOperand(2).getReg(); 3408 auto Flags = MI.getFlags(); 3409 3410 LLT Ty = MRI.getType(Dst); 3411 3412 const fltSemantics *FltSemantics; 3413 if (Ty == LLT::scalar(32)) 3414 FltSemantics = &APFloat::IEEEsingle(); 3415 else if (Ty == LLT::scalar(64)) 3416 FltSemantics = &APFloat::IEEEdouble(); 3417 else 3418 return false; 3419 3420 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3421 .addUse(Src) 3422 .setMIFlags(Flags); 3423 3424 // We don't need to concern ourselves with the snan handling difference, since 3425 // the rsq quieted (or not) so use the one which will directly select. 3426 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3427 const bool UseIEEE = MFI->getMode().IEEE; 3428 3429 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3430 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3431 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3432 3433 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3434 3435 if (UseIEEE) 3436 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3437 else 3438 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3439 MI.eraseFromParent(); 3440 return true; 3441 } 3442 3443 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3444 switch (IID) { 3445 case Intrinsic::amdgcn_ds_fadd: 3446 return AMDGPU::G_ATOMICRMW_FADD; 3447 case Intrinsic::amdgcn_ds_fmin: 3448 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3449 case Intrinsic::amdgcn_ds_fmax: 3450 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3451 default: 3452 llvm_unreachable("not a DS FP intrinsic"); 3453 } 3454 } 3455 3456 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3457 MachineInstr &MI, 3458 Intrinsic::ID IID) const { 3459 GISelChangeObserver &Observer = Helper.Observer; 3460 Observer.changingInstr(MI); 3461 3462 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3463 3464 // The remaining operands were used to set fields in the MemOperand on 3465 // construction. 3466 for (int I = 6; I > 3; --I) 3467 MI.RemoveOperand(I); 3468 3469 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3470 Observer.changedInstr(MI); 3471 return true; 3472 } 3473 3474 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3475 MachineRegisterInfo &MRI, 3476 MachineIRBuilder &B) const { 3477 uint64_t Offset = 3478 ST.getTargetLowering()->getImplicitParameterOffset( 3479 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3480 LLT DstTy = MRI.getType(DstReg); 3481 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3482 3483 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3484 if (!loadInputValue(KernargPtrReg, B, 3485 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3486 return false; 3487 3488 // FIXME: This should be nuw 3489 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3490 return true; 3491 } 3492 3493 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3494 MachineRegisterInfo &MRI, 3495 MachineIRBuilder &B) const { 3496 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3497 if (!MFI->isEntryFunction()) { 3498 return legalizePreloadedArgIntrin(MI, MRI, B, 3499 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3500 } 3501 3502 Register DstReg = MI.getOperand(0).getReg(); 3503 if (!getImplicitArgPtr(DstReg, MRI, B)) 3504 return false; 3505 3506 MI.eraseFromParent(); 3507 return true; 3508 } 3509 3510 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3511 MachineRegisterInfo &MRI, 3512 MachineIRBuilder &B, 3513 unsigned AddrSpace) const { 3514 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3515 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3516 Register Hi32 = Unmerge.getReg(1); 3517 3518 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3519 MI.eraseFromParent(); 3520 return true; 3521 } 3522 3523 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3524 // offset (the offset that is included in bounds checking and swizzling, to be 3525 // split between the instruction's voffset and immoffset fields) and soffset 3526 // (the offset that is excluded from bounds checking and swizzling, to go in 3527 // the instruction's soffset field). This function takes the first kind of 3528 // offset and figures out how to split it between voffset and immoffset. 3529 std::tuple<Register, unsigned, unsigned> 3530 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3531 Register OrigOffset) const { 3532 const unsigned MaxImm = 4095; 3533 Register BaseReg; 3534 unsigned TotalConstOffset; 3535 const LLT S32 = LLT::scalar(32); 3536 MachineRegisterInfo &MRI = *B.getMRI(); 3537 3538 std::tie(BaseReg, TotalConstOffset) = 3539 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3540 3541 unsigned ImmOffset = TotalConstOffset; 3542 3543 // If BaseReg is a pointer, convert it to int. 3544 if (MRI.getType(BaseReg).isPointer()) 3545 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3546 3547 // If the immediate value is too big for the immoffset field, put the value 3548 // and -4096 into the immoffset field so that the value that is copied/added 3549 // for the voffset field is a multiple of 4096, and it stands more chance 3550 // of being CSEd with the copy/add for another similar load/store. 3551 // However, do not do that rounding down to a multiple of 4096 if that is a 3552 // negative number, as it appears to be illegal to have a negative offset 3553 // in the vgpr, even if adding the immediate offset makes it positive. 
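// Illustrative example (assumed constant offsets): a total constant offset of
// 4100 splits into ImmOffset = 4 and a voffset add of 4096; a total offset of
// -16 would make the rounded-down part negative, so the whole value goes to
// voffset and ImmOffset becomes 0.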
3554 unsigned Overflow = ImmOffset & ~MaxImm; 3555 ImmOffset -= Overflow; 3556 if ((int32_t)Overflow < 0) { 3557 Overflow += ImmOffset; 3558 ImmOffset = 0; 3559 } 3560 3561 if (Overflow != 0) { 3562 if (!BaseReg) { 3563 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3564 } else { 3565 auto OverflowVal = B.buildConstant(S32, Overflow); 3566 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3567 } 3568 } 3569 3570 if (!BaseReg) 3571 BaseReg = B.buildConstant(S32, 0).getReg(0); 3572 3573 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3574 } 3575 3576 /// Handle register layout difference for f16 images for some subtargets. 3577 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3578 MachineRegisterInfo &MRI, 3579 Register Reg, 3580 bool ImageStore) const { 3581 const LLT S16 = LLT::scalar(16); 3582 const LLT S32 = LLT::scalar(32); 3583 LLT StoreVT = MRI.getType(Reg); 3584 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3585 3586 if (ST.hasUnpackedD16VMem()) { 3587 auto Unmerge = B.buildUnmerge(S16, Reg); 3588 3589 SmallVector<Register, 4> WideRegs; 3590 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3591 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3592 3593 int NumElts = StoreVT.getNumElements(); 3594 3595 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3596 } 3597 3598 if (ImageStore && ST.hasImageStoreD16Bug()) { 3599 if (StoreVT.getNumElements() == 2) { 3600 SmallVector<Register, 4> PackedRegs; 3601 Reg = B.buildBitcast(S32, Reg).getReg(0); 3602 PackedRegs.push_back(Reg); 3603 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3604 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3605 } 3606 3607 if (StoreVT.getNumElements() == 3) { 3608 SmallVector<Register, 4> PackedRegs; 3609 auto Unmerge = B.buildUnmerge(S16, Reg); 3610 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3611 PackedRegs.push_back(Unmerge.getReg(I)); 3612 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3613 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3614 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3615 } 3616 3617 if (StoreVT.getNumElements() == 4) { 3618 SmallVector<Register, 4> PackedRegs; 3619 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3620 auto Unmerge = B.buildUnmerge(S32, Reg); 3621 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3622 PackedRegs.push_back(Unmerge.getReg(I)); 3623 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3624 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3625 } 3626 3627 llvm_unreachable("invalid data type"); 3628 } 3629 3630 return Reg; 3631 } 3632 3633 Register AMDGPULegalizerInfo::fixStoreSourceType( 3634 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3635 MachineRegisterInfo *MRI = B.getMRI(); 3636 LLT Ty = MRI->getType(VData); 3637 3638 const LLT S16 = LLT::scalar(16); 3639 3640 // Fixup illegal register types for i8 stores. 
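// Note both s8 and s16 sources are widened to s32 with G_ANYEXT here; the
// buffer store opcode selected later (BYTE/SHORT) narrows back to the memory
// size.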
3641 if (Ty == LLT::scalar(8) || Ty == S16) { 3642 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3643 return AnyExt; 3644 } 3645 3646 if (Ty.isVector()) { 3647 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3648 if (IsFormat) 3649 return handleD16VData(B, *MRI, VData); 3650 } 3651 } 3652 3653 return VData; 3654 } 3655 3656 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3657 MachineRegisterInfo &MRI, 3658 MachineIRBuilder &B, 3659 bool IsTyped, 3660 bool IsFormat) const { 3661 Register VData = MI.getOperand(1).getReg(); 3662 LLT Ty = MRI.getType(VData); 3663 LLT EltTy = Ty.getScalarType(); 3664 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3665 const LLT S32 = LLT::scalar(32); 3666 3667 VData = fixStoreSourceType(B, VData, IsFormat); 3668 Register RSrc = MI.getOperand(2).getReg(); 3669 3670 MachineMemOperand *MMO = *MI.memoperands_begin(); 3671 const int MemSize = MMO->getSize(); 3672 3673 unsigned ImmOffset; 3674 unsigned TotalOffset; 3675 3676 // The typed intrinsics add an immediate after the registers. 3677 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3678 3679 // The struct intrinsic variants add one additional operand over raw. 3680 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3681 Register VIndex; 3682 int OpOffset = 0; 3683 if (HasVIndex) { 3684 VIndex = MI.getOperand(3).getReg(); 3685 OpOffset = 1; 3686 } 3687 3688 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3689 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3690 3691 unsigned Format = 0; 3692 if (IsTyped) { 3693 Format = MI.getOperand(5 + OpOffset).getImm(); 3694 ++OpOffset; 3695 } 3696 3697 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3698 3699 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3700 if (TotalOffset != 0) 3701 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3702 3703 unsigned Opc; 3704 if (IsTyped) { 3705 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3706 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3707 } else if (IsFormat) { 3708 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3709 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3710 } else { 3711 switch (MemSize) { 3712 case 1: 3713 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3714 break; 3715 case 2: 3716 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3717 break; 3718 default: 3719 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3720 break; 3721 } 3722 } 3723 3724 if (!VIndex) 3725 VIndex = B.buildConstant(S32, 0).getReg(0); 3726 3727 auto MIB = B.buildInstr(Opc) 3728 .addUse(VData) // vdata 3729 .addUse(RSrc) // rsrc 3730 .addUse(VIndex) // vindex 3731 .addUse(VOffset) // voffset 3732 .addUse(SOffset) // soffset 3733 .addImm(ImmOffset); // offset(imm) 3734 3735 if (IsTyped) 3736 MIB.addImm(Format); 3737 3738 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3739 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3740 .addMemOperand(MMO); 3741 3742 MI.eraseFromParent(); 3743 return true; 3744 } 3745 3746 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3747 MachineRegisterInfo &MRI, 3748 MachineIRBuilder &B, 3749 bool IsFormat, 3750 bool IsTyped) const { 3751 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
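// Operand layout handled below: dst, intrinsic id, rsrc, [vindex for struct
// variants,] voffset, soffset, [format immediate for typed variants,]
// auxiliary/cachepolicy immediate. The struct form is detected purely by the
// total operand count (NumVIndexOps).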
3752 MachineMemOperand *MMO = *MI.memoperands_begin(); 3753 const int MemSize = MMO->getSize(); 3754 const LLT S32 = LLT::scalar(32); 3755 3756 Register Dst = MI.getOperand(0).getReg(); 3757 Register RSrc = MI.getOperand(2).getReg(); 3758 3759 // The typed intrinsics add an immediate after the registers. 3760 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3761 3762 // The struct intrinsic variants add one additional operand over raw. 3763 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3764 Register VIndex; 3765 int OpOffset = 0; 3766 if (HasVIndex) { 3767 VIndex = MI.getOperand(3).getReg(); 3768 OpOffset = 1; 3769 } 3770 3771 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3772 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3773 3774 unsigned Format = 0; 3775 if (IsTyped) { 3776 Format = MI.getOperand(5 + OpOffset).getImm(); 3777 ++OpOffset; 3778 } 3779 3780 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3781 unsigned ImmOffset; 3782 unsigned TotalOffset; 3783 3784 LLT Ty = MRI.getType(Dst); 3785 LLT EltTy = Ty.getScalarType(); 3786 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3787 const bool Unpacked = ST.hasUnpackedD16VMem(); 3788 3789 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3790 if (TotalOffset != 0) 3791 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3792 3793 unsigned Opc; 3794 3795 if (IsTyped) { 3796 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3797 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3798 } else if (IsFormat) { 3799 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3800 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3801 } else { 3802 switch (MemSize) { 3803 case 1: 3804 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3805 break; 3806 case 2: 3807 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3808 break; 3809 default: 3810 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3811 break; 3812 } 3813 } 3814 3815 Register LoadDstReg; 3816 3817 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3818 LLT UnpackedTy = Ty.changeElementSize(32); 3819 3820 if (IsExtLoad) 3821 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3822 else if (Unpacked && IsD16 && Ty.isVector()) 3823 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3824 else 3825 LoadDstReg = Dst; 3826 3827 if (!VIndex) 3828 VIndex = B.buildConstant(S32, 0).getReg(0); 3829 3830 auto MIB = B.buildInstr(Opc) 3831 .addDef(LoadDstReg) // vdata 3832 .addUse(RSrc) // rsrc 3833 .addUse(VIndex) // vindex 3834 .addUse(VOffset) // voffset 3835 .addUse(SOffset) // soffset 3836 .addImm(ImmOffset); // offset(imm) 3837 3838 if (IsTyped) 3839 MIB.addImm(Format); 3840 3841 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3842 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3843 .addMemOperand(MMO); 3844 3845 if (LoadDstReg != Dst) { 3846 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3847 3848 // Widen result for extending loads was widened. 
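// That is, LoadDstReg was created wider than Dst (s32 for extending loads, a
// 32-bit-per-element vector for unpacked D16), so narrow it back to the
// original result type here.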
3849 if (IsExtLoad) 3850 B.buildTrunc(Dst, LoadDstReg); 3851 else { 3852 // Repack to original 16-bit vector result 3853 // FIXME: G_TRUNC should work, but legalization currently fails 3854 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3855 SmallVector<Register, 4> Repack; 3856 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3857 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3858 B.buildMerge(Dst, Repack); 3859 } 3860 } 3861 3862 MI.eraseFromParent(); 3863 return true; 3864 } 3865 3866 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3867 MachineIRBuilder &B, 3868 bool IsInc) const { 3869 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3870 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3871 B.buildInstr(Opc) 3872 .addDef(MI.getOperand(0).getReg()) 3873 .addUse(MI.getOperand(2).getReg()) 3874 .addUse(MI.getOperand(3).getReg()) 3875 .cloneMemRefs(MI); 3876 MI.eraseFromParent(); 3877 return true; 3878 } 3879 3880 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3881 switch (IntrID) { 3882 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3883 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3884 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3885 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3886 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3888 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3889 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3891 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3892 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3893 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3894 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3895 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3896 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3897 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3898 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3899 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3900 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3901 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3903 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3904 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3906 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3907 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3908 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3909 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3910 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3911 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3912 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3913 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3915 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3916 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3918 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3919 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3921 case Intrinsic::amdgcn_buffer_atomic_fadd: 3922 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3923 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3925 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3926 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3928 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3929 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3930 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3931 default: 3932 llvm_unreachable("unhandled atomic opcode"); 3933 } 3934 } 3935 3936 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3937 MachineIRBuilder &B, 3938 Intrinsic::ID IID) const { 3939 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3940 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3941 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3942 3943 Register Dst; 3944 3945 int OpOffset = 0; 3946 if (HasReturn) { 3947 // A few FP atomics do not support return values. 3948 Dst = MI.getOperand(0).getReg(); 3949 } else { 3950 OpOffset = -1; 3951 } 3952 3953 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3954 Register CmpVal; 3955 3956 if (IsCmpSwap) { 3957 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3958 ++OpOffset; 3959 } 3960 3961 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3962 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3963 3964 // The struct intrinsic variants add one additional operand over raw. 3965 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3966 Register VIndex; 3967 if (HasVIndex) { 3968 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3969 ++OpOffset; 3970 } 3971 3972 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3973 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3974 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3975 3976 MachineMemOperand *MMO = *MI.memoperands_begin(); 3977 3978 unsigned ImmOffset; 3979 unsigned TotalOffset; 3980 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3981 if (TotalOffset != 0) 3982 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3983 3984 if (!VIndex) 3985 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3986 3987 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3988 3989 if (HasReturn) 3990 MIB.addDef(Dst); 3991 3992 MIB.addUse(VData); // vdata 3993 3994 if (IsCmpSwap) 3995 MIB.addReg(CmpVal); 3996 3997 MIB.addUse(RSrc) // rsrc 3998 .addUse(VIndex) // vindex 3999 .addUse(VOffset) // voffset 4000 .addUse(SOffset) // soffset 4001 .addImm(ImmOffset) // offset(imm) 4002 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4003 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4004 .addMemOperand(MMO); 4005 4006 MI.eraseFromParent(); 4007 return true; 4008 } 4009 4010 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4011 /// vector with s16 typed elements. 4012 static void packImageA16AddressToDwords( 4013 MachineIRBuilder &B, MachineInstr &MI, 4014 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4015 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4016 const LLT S16 = LLT::scalar(16); 4017 const LLT V2S16 = LLT::vector(2, 16); 4018 4019 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4020 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4021 if (!SrcOp.isReg()) 4022 continue; // _L to _LZ may have eliminated this. 4023 4024 Register AddrReg = SrcOp.getReg(); 4025 4026 if (I < Intr->GradientStart) { 4027 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4028 PackedAddrs.push_back(AddrReg); 4029 } else { 4030 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4031 // derivatives dx/dh and dx/dv are packed with undef. 
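// In other words, pack two consecutive 16-bit components per dword; when no
// partner element exists (end of the address list, last gradient of an
// odd-sized group, or an operand folded away by the _L to _LZ handling), fill
// the high half with undef.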
4032 if (((I + 1) >= EndIdx) || 4033 ((Intr->NumGradients / 2) % 2 == 1 && 4034 (I == static_cast<unsigned>(Intr->GradientStart + 4035 (Intr->NumGradients / 2) - 1) || 4036 I == static_cast<unsigned>(Intr->GradientStart + 4037 Intr->NumGradients - 1))) || 4038 // Check for _L to _LZ optimization 4039 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4040 PackedAddrs.push_back( 4041 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4042 .getReg(0)); 4043 } else { 4044 PackedAddrs.push_back( 4045 B.buildBuildVector( 4046 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4047 .getReg(0)); 4048 ++I; 4049 } 4050 } 4051 } 4052 } 4053 4054 /// Convert from separate vaddr components to a single vector address register, 4055 /// and replace the remaining operands with $noreg. 4056 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4057 int DimIdx, int NumVAddrs) { 4058 const LLT S32 = LLT::scalar(32); 4059 4060 SmallVector<Register, 8> AddrRegs; 4061 for (int I = 0; I != NumVAddrs; ++I) { 4062 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4063 if (SrcOp.isReg()) { 4064 AddrRegs.push_back(SrcOp.getReg()); 4065 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4066 } 4067 } 4068 4069 int NumAddrRegs = AddrRegs.size(); 4070 if (NumAddrRegs != 1) { 4071 // Round up to 8 elements for v5-v7 4072 // FIXME: Missing intermediate sized register classes and instructions. 4073 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4074 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4075 auto Undef = B.buildUndef(S32); 4076 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4077 NumAddrRegs = RoundedNumRegs; 4078 } 4079 4080 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4081 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4082 } 4083 4084 for (int I = 1; I != NumVAddrs; ++I) { 4085 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4086 if (SrcOp.isReg()) 4087 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4088 } 4089 } 4090 4091 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4092 /// 4093 /// Depending on the subtarget, load/store with 16-bit element data need to be 4094 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4095 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4096 /// registers. 4097 /// 4098 /// We don't want to directly select image instructions just yet, but also want 4099 /// to exposes all register repacking to the legalizer/combiners. We also don't 4100 /// want a selected instrution entering RegBankSelect. In order to avoid 4101 /// defining a multitude of intermediate image instructions, directly hack on 4102 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4103 /// now unnecessary arguments with $noreg. 4104 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4105 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4106 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4107 4108 const unsigned NumDefs = MI.getNumExplicitDefs(); 4109 const unsigned ArgOffset = NumDefs + 1; 4110 bool IsTFE = NumDefs == 2; 4111 // We are only processing the operands of d16 image operations on subtargets 4112 // that use the unpacked register layout, or need to repack the TFE result. 4113 4114 // TODO: Do we need to guard against already legalized intrinsics? 
4115 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4116 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4117 4118 MachineRegisterInfo *MRI = B.getMRI(); 4119 const LLT S32 = LLT::scalar(32); 4120 const LLT S16 = LLT::scalar(16); 4121 const LLT V2S16 = LLT::vector(2, 16); 4122 4123 unsigned DMask = 0; 4124 4125 // Check for 16 bit addresses and pack if true. 4126 LLT GradTy = 4127 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4128 LLT AddrTy = 4129 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4130 const bool IsG16 = GradTy == S16; 4131 const bool IsA16 = AddrTy == S16; 4132 4133 int DMaskLanes = 0; 4134 if (!BaseOpcode->Atomic) { 4135 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4136 if (BaseOpcode->Gather4) { 4137 DMaskLanes = 4; 4138 } else if (DMask != 0) { 4139 DMaskLanes = countPopulation(DMask); 4140 } else if (!IsTFE && !BaseOpcode->Store) { 4141 // If dmask is 0, this is a no-op load. This can be eliminated. 4142 B.buildUndef(MI.getOperand(0)); 4143 MI.eraseFromParent(); 4144 return true; 4145 } 4146 } 4147 4148 Observer.changingInstr(MI); 4149 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4150 4151 unsigned NewOpcode = NumDefs == 0 ? 4152 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4153 4154 // Track that we legalized this 4155 MI.setDesc(B.getTII().get(NewOpcode)); 4156 4157 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4158 // dmask to be at least 1 otherwise the instruction will fail 4159 if (IsTFE && DMask == 0) { 4160 DMask = 0x1; 4161 DMaskLanes = 1; 4162 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4163 } 4164 4165 if (BaseOpcode->Atomic) { 4166 Register VData0 = MI.getOperand(2).getReg(); 4167 LLT Ty = MRI->getType(VData0); 4168 4169 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4170 if (Ty.isVector()) 4171 return false; 4172 4173 if (BaseOpcode->AtomicX2) { 4174 Register VData1 = MI.getOperand(3).getReg(); 4175 // The two values are packed in one register. 4176 LLT PackedTy = LLT::vector(2, Ty); 4177 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4178 MI.getOperand(2).setReg(Concat.getReg(0)); 4179 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4180 } 4181 } 4182 4183 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4184 4185 // Optimize _L to _LZ when _L is zero 4186 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4187 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4188 const ConstantFP *ConstantLod; 4189 4190 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4191 m_GFCst(ConstantLod))) { 4192 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4193 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4194 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4195 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4196 Intr->Dim); 4197 4198 // The starting indexes should remain in the same place. 
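// Only the trailing lod operand is removed, so drop one from the vaddr count.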
4199 --CorrectedNumVAddrs; 4200 4201 MI.getOperand(MI.getNumExplicitDefs()) 4202 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4203 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4204 Intr = NewImageDimIntr; 4205 } 4206 } 4207 } 4208 4209 // Optimize _mip away, when 'lod' is zero 4210 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4211 int64_t ConstantLod; 4212 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4213 m_ICst(ConstantLod))) { 4214 if (ConstantLod == 0) { 4215 // TODO: Change intrinsic opcode and remove operand instead or replacing 4216 // it with 0, as the _L to _LZ handling is done above. 4217 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4218 --CorrectedNumVAddrs; 4219 } 4220 } 4221 } 4222 4223 // Rewrite the addressing register layout before doing anything else. 4224 if (IsA16 || IsG16) { 4225 if (IsA16) { 4226 // Target must support the feature and gradients need to be 16 bit too 4227 if (!ST.hasA16() || !IsG16) 4228 return false; 4229 } else if (!ST.hasG16()) 4230 return false; 4231 4232 if (Intr->NumVAddrs > 1) { 4233 SmallVector<Register, 4> PackedRegs; 4234 // Don't compress addresses for G16 4235 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4236 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4237 PackEndIdx); 4238 4239 if (!IsA16) { 4240 // Add uncompressed address 4241 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4242 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4243 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4244 PackedRegs.push_back(AddrReg); 4245 } 4246 } 4247 4248 // See also below in the non-a16 branch 4249 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4250 4251 if (!UseNSA && PackedRegs.size() > 1) { 4252 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4253 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4254 PackedRegs[0] = Concat.getReg(0); 4255 PackedRegs.resize(1); 4256 } 4257 4258 const unsigned NumPacked = PackedRegs.size(); 4259 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4260 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4261 if (!SrcOp.isReg()) { 4262 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4263 continue; 4264 } 4265 4266 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4267 4268 if (I - Intr->VAddrStart < NumPacked) 4269 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4270 else 4271 SrcOp.setReg(AMDGPU::NoRegister); 4272 } 4273 } 4274 } else { 4275 // If the register allocator cannot place the address registers contiguously 4276 // without introducing moves, then using the non-sequential address encoding 4277 // is always preferable, since it saves VALU instructions and is usually a 4278 // wash in terms of code size or even better. 4279 // 4280 // However, we currently have no way of hinting to the register allocator 4281 // that MIMG addresses should be placed contiguously when it is possible to 4282 // do so, so force non-NSA for the common 2-address case as a heuristic. 4283 // 4284 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4285 // allocation when possible. 
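// For example, a plain 2D sample with two address dwords is packed into a
// single vector register, while three or more address dwords on an NSA-capable
// subtarget keep separate registers.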
4286 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4287 4288 if (!UseNSA && Intr->NumVAddrs > 1) 4289 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4290 Intr->NumVAddrs); 4291 } 4292 4293 int Flags = 0; 4294 if (IsA16) 4295 Flags |= 1; 4296 if (IsG16) 4297 Flags |= 2; 4298 MI.addOperand(MachineOperand::CreateImm(Flags)); 4299 4300 if (BaseOpcode->Store) { // No TFE for stores? 4301 // TODO: Handle dmask trim 4302 Register VData = MI.getOperand(1).getReg(); 4303 LLT Ty = MRI->getType(VData); 4304 if (!Ty.isVector() || Ty.getElementType() != S16) 4305 return true; 4306 4307 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4308 if (RepackedReg != VData) { 4309 MI.getOperand(1).setReg(RepackedReg); 4310 } 4311 4312 return true; 4313 } 4314 4315 Register DstReg = MI.getOperand(0).getReg(); 4316 LLT Ty = MRI->getType(DstReg); 4317 const LLT EltTy = Ty.getScalarType(); 4318 const bool IsD16 = Ty.getScalarType() == S16; 4319 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4320 4321 // Confirm that the return type is large enough for the dmask specified 4322 if (NumElts < DMaskLanes) 4323 return false; 4324 4325 if (NumElts > 4 || DMaskLanes > 4) 4326 return false; 4327 4328 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4329 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4330 4331 // The raw dword aligned data component of the load. The only legal cases 4332 // where this matters should be when using the packed D16 format, for 4333 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4334 LLT RoundedTy; 4335 4336 // S32 vector to to cover all data, plus TFE result element. 4337 LLT TFETy; 4338 4339 // Register type to use for each loaded component. Will be S32 or V2S16. 4340 LLT RegTy; 4341 4342 if (IsD16 && ST.hasUnpackedD16VMem()) { 4343 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4344 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4345 RegTy = S32; 4346 } else { 4347 unsigned EltSize = EltTy.getSizeInBits(); 4348 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4349 unsigned RoundedSize = 32 * RoundedElts; 4350 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4351 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4352 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4353 } 4354 4355 // The return type does not need adjustment. 4356 // TODO: Should we change s16 case to s32 or <2 x s16>? 4357 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4358 return true; 4359 4360 Register Dst1Reg; 4361 4362 // Insert after the instruction. 4363 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4364 4365 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4366 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4367 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4368 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4369 4370 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4371 4372 MI.getOperand(0).setReg(NewResultReg); 4373 4374 // In the IR, TFE is supposed to be used with a 2 element struct return 4375 // type. The intruction really returns these two values in one contiguous 4376 // register, with one additional dword beyond the loaded data. Rewrite the 4377 // return type to use a single register result. 4378 4379 if (IsTFE) { 4380 Dst1Reg = MI.getOperand(1).getReg(); 4381 if (MRI->getType(Dst1Reg) != S32) 4382 return false; 4383 4384 // TODO: Make sure the TFE operand bit is set. 
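// Drop the separate status def here; the TFE dword is unmerged into Dst1Reg
// from the combined result below.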
4385 MI.RemoveOperand(1); 4386 4387 // Handle the easy case that requires no repack instructions. 4388 if (Ty == S32) { 4389 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4390 return true; 4391 } 4392 } 4393 4394 // Now figure out how to copy the new result register back into the old 4395 // result. 4396 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4397 4398 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4399 4400 if (ResultNumRegs == 1) { 4401 assert(!IsTFE); 4402 ResultRegs[0] = NewResultReg; 4403 } else { 4404 // We have to repack into a new vector of some kind. 4405 for (int I = 0; I != NumDataRegs; ++I) 4406 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4407 B.buildUnmerge(ResultRegs, NewResultReg); 4408 4409 // Drop the final TFE element to get the data part. The TFE result is 4410 // directly written to the right place already. 4411 if (IsTFE) 4412 ResultRegs.resize(NumDataRegs); 4413 } 4414 4415 // For an s16 scalar result, we form an s32 result with a truncate regardless 4416 // of packed vs. unpacked. 4417 if (IsD16 && !Ty.isVector()) { 4418 B.buildTrunc(DstReg, ResultRegs[0]); 4419 return true; 4420 } 4421 4422 // Avoid a build/concat_vector of 1 entry. 4423 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4424 B.buildBitcast(DstReg, ResultRegs[0]); 4425 return true; 4426 } 4427 4428 assert(Ty.isVector()); 4429 4430 if (IsD16) { 4431 // For packed D16 results with TFE enabled, all the data components are 4432 // S32. Cast back to the expected type. 4433 // 4434 // TODO: We don't really need to use load s32 elements. We would only need one 4435 // cast for the TFE result if a multiple of v2s16 was used. 4436 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4437 for (Register &Reg : ResultRegs) 4438 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4439 } else if (ST.hasUnpackedD16VMem()) { 4440 for (Register &Reg : ResultRegs) 4441 Reg = B.buildTrunc(S16, Reg).getReg(0); 4442 } 4443 } 4444 4445 auto padWithUndef = [&](LLT Ty, int NumElts) { 4446 if (NumElts == 0) 4447 return; 4448 Register Undef = B.buildUndef(Ty).getReg(0); 4449 for (int I = 0; I != NumElts; ++I) 4450 ResultRegs.push_back(Undef); 4451 }; 4452 4453 // Pad out any elements eliminated due to the dmask. 4454 LLT ResTy = MRI->getType(ResultRegs[0]); 4455 if (!ResTy.isVector()) { 4456 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4457 B.buildBuildVector(DstReg, ResultRegs); 4458 return true; 4459 } 4460 4461 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4462 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4463 4464 // Deal with the one annoying legal case. 
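// A v3s16 result does not fill a whole number of dwords: pad the repacked
// registers out to v6s16 (three dwords), concatenate, and unmerge so the extra
// half lands in a dead register.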
4465 const LLT V3S16 = LLT::vector(3, 16); 4466 if (Ty == V3S16) { 4467 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4468 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4469 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4470 return true; 4471 } 4472 4473 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4474 B.buildConcatVectors(DstReg, ResultRegs); 4475 return true; 4476 } 4477 4478 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4479 LegalizerHelper &Helper, MachineInstr &MI) const { 4480 MachineIRBuilder &B = Helper.MIRBuilder; 4481 GISelChangeObserver &Observer = Helper.Observer; 4482 4483 Register Dst = MI.getOperand(0).getReg(); 4484 LLT Ty = B.getMRI()->getType(Dst); 4485 unsigned Size = Ty.getSizeInBits(); 4486 MachineFunction &MF = B.getMF(); 4487 4488 Observer.changingInstr(MI); 4489 4490 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4491 Ty = getBitcastRegisterType(Ty); 4492 Helper.bitcastDst(MI, Ty, 0); 4493 Dst = MI.getOperand(0).getReg(); 4494 B.setInsertPt(B.getMBB(), MI); 4495 } 4496 4497 // FIXME: We don't really need this intermediate instruction. The intrinsic 4498 // should be fixed to have a memory operand. Since it's readnone, we're not 4499 // allowed to add one. 4500 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4501 MI.RemoveOperand(1); // Remove intrinsic ID 4502 4503 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4504 // TODO: Should this use datalayout alignment? 4505 const unsigned MemSize = (Size + 7) / 8; 4506 const Align MemAlign(4); 4507 MachineMemOperand *MMO = MF.getMachineMemOperand( 4508 MachinePointerInfo(), 4509 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4510 MachineMemOperand::MOInvariant, 4511 MemSize, MemAlign); 4512 MI.addMemOperand(MF, MMO); 4513 4514 // There are no 96-bit result scalar loads, but widening to 128-bit should 4515 // always be legal. We may need to restore this to a 96-bit result if it turns 4516 // out this needs to be converted to a vector load during RegBankSelect. 4517 if (!isPowerOf2_32(Size)) { 4518 if (Ty.isVector()) 4519 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4520 else 4521 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4522 } 4523 4524 Observer.changedInstr(MI); 4525 return true; 4526 } 4527 4528 // TODO: Move to selection 4529 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4530 MachineRegisterInfo &MRI, 4531 MachineIRBuilder &B) const { 4532 if (!ST.isTrapHandlerEnabled() || 4533 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4534 return legalizeTrapEndpgm(MI, MRI, B); 4535 4536 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4537 switch (*HsaAbiVer) { 4538 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4539 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4540 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4541 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4542 return ST.supportsGetDoorbellID() ? 
4543 legalizeTrapHsa(MI, MRI, B) : 4544 legalizeTrapHsaQueuePtr(MI, MRI, B); 4545 } 4546 } 4547 4548 llvm_unreachable("Unknown trap handler"); 4549 } 4550 4551 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4552 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4553 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4554 MI.eraseFromParent(); 4555 return true; 4556 } 4557 4558 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4559 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4560 // Pass queue pointer to trap handler as input, and insert trap instruction 4561 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4562 Register LiveIn = 4563 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4564 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4565 return false; 4566 4567 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4568 B.buildCopy(SGPR01, LiveIn); 4569 B.buildInstr(AMDGPU::S_TRAP) 4570 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4571 .addReg(SGPR01, RegState::Implicit); 4572 4573 MI.eraseFromParent(); 4574 return true; 4575 } 4576 4577 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4578 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4579 B.buildInstr(AMDGPU::S_TRAP) 4580 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4581 MI.eraseFromParent(); 4582 return true; 4583 } 4584 4585 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4586 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4587 // Is non-HSA path or trap-handler disabled? then, report a warning 4588 // accordingly 4589 if (!ST.isTrapHandlerEnabled() || 4590 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4591 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4592 "debugtrap handler not supported", 4593 MI.getDebugLoc(), DS_Warning); 4594 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4595 Ctx.diagnose(NoTrap); 4596 } else { 4597 // Insert debug-trap instruction 4598 B.buildInstr(AMDGPU::S_TRAP) 4599 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4600 } 4601 4602 MI.eraseFromParent(); 4603 return true; 4604 } 4605 4606 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4607 MachineIRBuilder &B) const { 4608 MachineRegisterInfo &MRI = *B.getMRI(); 4609 const LLT S16 = LLT::scalar(16); 4610 const LLT S32 = LLT::scalar(32); 4611 4612 Register DstReg = MI.getOperand(0).getReg(); 4613 Register NodePtr = MI.getOperand(2).getReg(); 4614 Register RayExtent = MI.getOperand(3).getReg(); 4615 Register RayOrigin = MI.getOperand(4).getReg(); 4616 Register RayDir = MI.getOperand(5).getReg(); 4617 Register RayInvDir = MI.getOperand(6).getReg(); 4618 Register TDescr = MI.getOperand(7).getReg(); 4619 4620 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4621 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4622 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4623 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4624 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4625 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4626 4627 SmallVector<Register, 12> Ops; 4628 if (Is64) { 4629 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4630 Ops.push_back(Unmerge.getReg(0)); 4631 Ops.push_back(Unmerge.getReg(1)); 4632 } else { 4633 Ops.push_back(NodePtr); 4634 } 4635 Ops.push_back(RayExtent); 4636 4637 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4638 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4639 Ops.push_back(Unmerge.getReg(0)); 4640 Ops.push_back(Unmerge.getReg(1)); 4641 Ops.push_back(Unmerge.getReg(2)); 4642 }; 4643 4644 packLanes(RayOrigin); 4645 if (IsA16) { 4646 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4647 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4648 Register R1 = MRI.createGenericVirtualRegister(S32); 4649 Register R2 = MRI.createGenericVirtualRegister(S32); 4650 Register R3 = MRI.createGenericVirtualRegister(S32); 4651 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4652 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4653 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4654 Ops.push_back(R1); 4655 Ops.push_back(R2); 4656 Ops.push_back(R3); 4657 } else { 4658 packLanes(RayDir); 4659 packLanes(RayInvDir); 4660 } 4661 4662 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4663 .addDef(DstReg) 4664 .addImm(Opcode); 4665 4666 for (Register R : Ops) { 4667 MIB.addUse(R); 4668 } 4669 4670 MIB.addUse(TDescr) 4671 .addImm(IsA16 ? 1 : 0) 4672 .cloneMemRefs(MI); 4673 4674 MI.eraseFromParent(); 4675 return true; 4676 } 4677 4678 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4679 MachineInstr &MI) const { 4680 MachineIRBuilder &B = Helper.MIRBuilder; 4681 MachineRegisterInfo &MRI = *B.getMRI(); 4682 4683 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4684 auto IntrID = MI.getIntrinsicID(); 4685 switch (IntrID) { 4686 case Intrinsic::amdgcn_if: 4687 case Intrinsic::amdgcn_else: { 4688 MachineInstr *Br = nullptr; 4689 MachineBasicBlock *UncondBrTarget = nullptr; 4690 bool Negated = false; 4691 if (MachineInstr *BrCond = 4692 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4693 const SIRegisterInfo *TRI 4694 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4695 4696 Register Def = MI.getOperand(1).getReg(); 4697 Register Use = MI.getOperand(3).getReg(); 4698 4699 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4700 4701 if (Negated) 4702 std::swap(CondBrTarget, UncondBrTarget); 4703 4704 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4705 if (IntrID == Intrinsic::amdgcn_if) { 4706 B.buildInstr(AMDGPU::SI_IF) 4707 .addDef(Def) 4708 .addUse(Use) 4709 .addMBB(UncondBrTarget); 4710 } else { 4711 B.buildInstr(AMDGPU::SI_ELSE) 4712 .addDef(Def) 4713 .addUse(Use) 4714 .addMBB(UncondBrTarget); 4715 } 4716 4717 if (Br) { 4718 Br->getOperand(0).setMBB(CondBrTarget); 4719 } else { 4720 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4721 // since we're swapping branch targets it needs to be reinserted. 
4722 // FIXME: IRTranslator should probably not do this 4723 B.buildBr(*CondBrTarget); 4724 } 4725 4726 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4727 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4728 MI.eraseFromParent(); 4729 BrCond->eraseFromParent(); 4730 return true; 4731 } 4732 4733 return false; 4734 } 4735 case Intrinsic::amdgcn_loop: { 4736 MachineInstr *Br = nullptr; 4737 MachineBasicBlock *UncondBrTarget = nullptr; 4738 bool Negated = false; 4739 if (MachineInstr *BrCond = 4740 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4741 const SIRegisterInfo *TRI 4742 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4743 4744 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4745 Register Reg = MI.getOperand(2).getReg(); 4746 4747 if (Negated) 4748 std::swap(CondBrTarget, UncondBrTarget); 4749 4750 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4751 B.buildInstr(AMDGPU::SI_LOOP) 4752 .addUse(Reg) 4753 .addMBB(UncondBrTarget); 4754 4755 if (Br) 4756 Br->getOperand(0).setMBB(CondBrTarget); 4757 else 4758 B.buildBr(*CondBrTarget); 4759 4760 MI.eraseFromParent(); 4761 BrCond->eraseFromParent(); 4762 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4763 return true; 4764 } 4765 4766 return false; 4767 } 4768 case Intrinsic::amdgcn_kernarg_segment_ptr: 4769 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4770 // This only makes sense to call in a kernel, so just lower to null. 4771 B.buildConstant(MI.getOperand(0).getReg(), 0); 4772 MI.eraseFromParent(); 4773 return true; 4774 } 4775 4776 return legalizePreloadedArgIntrin( 4777 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4778 case Intrinsic::amdgcn_implicitarg_ptr: 4779 return legalizeImplicitArgPtr(MI, MRI, B); 4780 case Intrinsic::amdgcn_workitem_id_x: 4781 return legalizePreloadedArgIntrin(MI, MRI, B, 4782 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4783 case Intrinsic::amdgcn_workitem_id_y: 4784 return legalizePreloadedArgIntrin(MI, MRI, B, 4785 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4786 case Intrinsic::amdgcn_workitem_id_z: 4787 return legalizePreloadedArgIntrin(MI, MRI, B, 4788 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4789 case Intrinsic::amdgcn_workgroup_id_x: 4790 return legalizePreloadedArgIntrin(MI, MRI, B, 4791 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4792 case Intrinsic::amdgcn_workgroup_id_y: 4793 return legalizePreloadedArgIntrin(MI, MRI, B, 4794 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4795 case Intrinsic::amdgcn_workgroup_id_z: 4796 return legalizePreloadedArgIntrin(MI, MRI, B, 4797 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4798 case Intrinsic::amdgcn_dispatch_ptr: 4799 return legalizePreloadedArgIntrin(MI, MRI, B, 4800 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4801 case Intrinsic::amdgcn_queue_ptr: 4802 return legalizePreloadedArgIntrin(MI, MRI, B, 4803 AMDGPUFunctionArgInfo::QUEUE_PTR); 4804 case Intrinsic::amdgcn_implicit_buffer_ptr: 4805 return legalizePreloadedArgIntrin( 4806 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4807 case Intrinsic::amdgcn_dispatch_id: 4808 return legalizePreloadedArgIntrin(MI, MRI, B, 4809 AMDGPUFunctionArgInfo::DISPATCH_ID); 4810 case Intrinsic::amdgcn_fdiv_fast: 4811 return legalizeFDIVFastIntrin(MI, MRI, B); 4812 case Intrinsic::amdgcn_is_shared: 4813 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4814 case Intrinsic::amdgcn_is_private: 4815 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4816 case Intrinsic::amdgcn_wavefrontsize: { 4817 
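// The wavefront size is a subtarget constant (32 or 64), so fold the intrinsic
// to an immediate.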
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4818 MI.eraseFromParent(); 4819 return true; 4820 } 4821 case Intrinsic::amdgcn_s_buffer_load: 4822 return legalizeSBufferLoad(Helper, MI); 4823 case Intrinsic::amdgcn_raw_buffer_store: 4824 case Intrinsic::amdgcn_struct_buffer_store: 4825 return legalizeBufferStore(MI, MRI, B, false, false); 4826 case Intrinsic::amdgcn_raw_buffer_store_format: 4827 case Intrinsic::amdgcn_struct_buffer_store_format: 4828 return legalizeBufferStore(MI, MRI, B, false, true); 4829 case Intrinsic::amdgcn_raw_tbuffer_store: 4830 case Intrinsic::amdgcn_struct_tbuffer_store: 4831 return legalizeBufferStore(MI, MRI, B, true, true); 4832 case Intrinsic::amdgcn_raw_buffer_load: 4833 case Intrinsic::amdgcn_struct_buffer_load: 4834 return legalizeBufferLoad(MI, MRI, B, false, false); 4835 case Intrinsic::amdgcn_raw_buffer_load_format: 4836 case Intrinsic::amdgcn_struct_buffer_load_format: 4837 return legalizeBufferLoad(MI, MRI, B, true, false); 4838 case Intrinsic::amdgcn_raw_tbuffer_load: 4839 case Intrinsic::amdgcn_struct_tbuffer_load: 4840 return legalizeBufferLoad(MI, MRI, B, true, true); 4841 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4842 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4843 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4844 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4845 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4846 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4847 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4848 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4849 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4850 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4851 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4852 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4853 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4854 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4855 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4856 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4857 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4858 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4859 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4860 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4861 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4862 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4863 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4864 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4865 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4866 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4867 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4868 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4869 case Intrinsic::amdgcn_buffer_atomic_fadd: 4870 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4871 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4872 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4873 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4874 return legalizeBufferAtomic(MI, B, IntrID); 4875 case Intrinsic::amdgcn_atomic_inc: 4876 return legalizeAtomicIncDec(MI, B, true); 4877 case Intrinsic::amdgcn_atomic_dec: 4878 return legalizeAtomicIncDec(MI, B, false); 4879 case Intrinsic::trap: 4880 return legalizeTrapIntrinsic(MI, MRI, B); 4881 case Intrinsic::debugtrap: 4882 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4883 case Intrinsic::amdgcn_rsq_clamp: 4884 return legalizeRsqClampIntrinsic(MI, MRI, B); 4885 case Intrinsic::amdgcn_ds_fadd: 4886 case Intrinsic::amdgcn_ds_fmin: 4887 case Intrinsic::amdgcn_ds_fmax: 4888 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4889 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4890 return legalizeBVHIntrinsic(MI, B); 4891 default: { 4892 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4893 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4894 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4895 return true; 4896 } 4897 } 4898 4899 return true; 4900 } 4901