//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
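/// For example, <3 x s16> (48 bits) matches and is widened to <4 x s16> so it
/// occupies a whole number of 32-bit registers, while <3 x s32> does not match.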
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
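/// For example, a 96-bit load with 128-bit alignment may be widened to a
/// 128-bit load on subtargets without 96-bit (dwordx3) memory operations.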
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 371 unsigned AlignInBits, unsigned AddrSpace, 372 unsigned Opcode) { 373 // We don't want to widen cases that are naturally legal. 374 if (isPowerOf2_32(SizeInBits)) 375 return false; 376 377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 378 // end up widening these for a scalar load during RegBankSelect, since there 379 // aren't 96-bit scalar loads. 380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 381 return false; 382 383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 384 return false; 385 386 // A load is known dereferenceable up to the alignment, so it's legal to widen 387 // to it. 388 // 389 // TODO: Could check dereferenceable for less aligned cases. 390 unsigned RoundedSize = NextPowerOf2(SizeInBits); 391 if (AlignInBits < RoundedSize) 392 return false; 393 394 // Do not widen if it would introduce a slow unaligned load. 395 const SITargetLowering *TLI = ST.getTargetLowering(); 396 bool Fast = false; 397 return TLI->allowsMisalignedMemoryAccessesImpl( 398 RoundedSize, AddrSpace, Align(AlignInBits / 8), 399 MachineMemOperand::MOLoad, &Fast) && 400 Fast; 401 } 402 403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 404 unsigned Opcode) { 405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 406 return false; 407 408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 409 Query.MMODescrs[0].AlignInBits, 410 Query.Types[1].getAddressSpace(), Opcode); 411 } 412 413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 414 const GCNTargetMachine &TM) 415 : ST(ST_) { 416 using namespace TargetOpcode; 417 418 auto GetAddrSpacePtr = [&TM](unsigned AS) { 419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 420 }; 421 422 const LLT S1 = LLT::scalar(1); 423 const LLT S8 = LLT::scalar(8); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S8 = LLT::vector(2, 8); 433 const LLT V2S16 = LLT::vector(2, 16); 434 const LLT V4S16 = LLT::vector(4, 16); 435 436 const LLT V2S32 = LLT::vector(2, 32); 437 const LLT V3S32 = LLT::vector(3, 32); 438 const LLT V4S32 = LLT::vector(4, 32); 439 const LLT V5S32 = LLT::vector(5, 32); 440 const LLT V6S32 = LLT::vector(6, 32); 441 const LLT V7S32 = LLT::vector(7, 32); 442 const LLT V8S32 = LLT::vector(8, 32); 443 const LLT V9S32 = LLT::vector(9, 32); 444 const LLT V10S32 = LLT::vector(10, 32); 445 const LLT V11S32 = LLT::vector(11, 32); 446 const LLT V12S32 = LLT::vector(12, 32); 447 const LLT V13S32 = LLT::vector(13, 32); 448 const LLT V14S32 = LLT::vector(14, 32); 449 const LLT V15S32 = LLT::vector(15, 32); 450 const LLT V16S32 = LLT::vector(16, 32); 451 const LLT V32S32 = LLT::vector(32, 32); 452 453 const LLT V2S64 = LLT::vector(2, 64); 454 const LLT V3S64 = LLT::vector(3, 64); 455 const LLT V4S64 = LLT::vector(4, 64); 456 const LLT V5S64 = LLT::vector(5, 64); 457 const LLT V6S64 = LLT::vector(6, 64); 458 const LLT V7S64 = LLT::vector(7, 64); 459 const LLT V8S64 = LLT::vector(8, 64); 460 const LLT V16S64 = LLT::vector(16, 64); 461 462 std::initializer_list<LLT> AllS32Vectors = 463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 465 
std::initializer_list<LLT> AllS64Vectors = 466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 467 468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 475 476 const LLT CodePtr = FlatPtr; 477 478 const std::initializer_list<LLT> AddrSpaces64 = { 479 GlobalPtr, ConstantPtr, FlatPtr 480 }; 481 482 const std::initializer_list<LLT> AddrSpaces32 = { 483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 484 }; 485 486 const std::initializer_list<LLT> FPTypesBase = { 487 S32, S64 488 }; 489 490 const std::initializer_list<LLT> FPTypes16 = { 491 S32, S64, S16 492 }; 493 494 const std::initializer_list<LLT> FPTypesPK16 = { 495 S32, S64, S16, V2S16 496 }; 497 498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 499 500 setAction({G_BRCOND, S1}, Legal); // VCC branches 501 setAction({G_BRCOND, S32}, Legal); // SCC branches 502 503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 504 // elements for v3s16 505 getActionDefinitionsBuilder(G_PHI) 506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 507 .legalFor(AllS32Vectors) 508 .legalFor(AllS64Vectors) 509 .legalFor(AddrSpaces64) 510 .legalFor(AddrSpaces32) 511 .legalIf(isPointer(0)) 512 .clampScalar(0, S16, S256) 513 .widenScalarToNextPow2(0, 32) 514 .clampMaxNumElements(0, S32, 16) 515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 516 .scalarize(0); 517 518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 519 // Full set of gfx9 features. 520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 521 .legalFor({S32, S16, V2S16}) 522 .clampScalar(0, S16, S32) 523 .clampMaxNumElements(0, S16, 2) 524 .scalarize(0) 525 .widenScalarToNextPow2(0, 32); 526 527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 528 .legalFor({S32, S16, V2S16}) // Clamp modifier 529 .minScalarOrElt(0, S16) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32) 533 .lower(); 534 } else if (ST.has16BitInsts()) { 535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 536 .legalFor({S32, S16}) 537 .clampScalar(0, S16, S32) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 540 541 // Technically the saturating operations require clamp bit support, but this 542 // was introduced at the same time as 16-bit operations. 543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 544 .legalFor({S32, S16}) // Clamp modifier 545 .minScalar(0, S16) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 16) 548 .lower(); 549 550 // We're just lowering this, but it helps get a better result to try to 551 // coerce to the desired type first. 552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 553 .minScalar(0, S16) 554 .scalarize(0) 555 .lower(); 556 } else { 557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.hasIntClamp()) { 563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 564 .legalFor({S32}) // Clamp modifier. 
565 .scalarize(0) 566 .minScalarOrElt(0, S32) 567 .lower(); 568 } else { 569 // Clamp bit support was added in VI, along with 16-bit operations. 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .minScalar(0, S32) 572 .scalarize(0) 573 .lower(); 574 } 575 576 // FIXME: DAG expansion gets better results. The widening uses the smaller 577 // range values and goes for the min/max lowering directly. 578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 585 .customFor({S32, S64}) 586 .clampScalar(0, S32, S64) 587 .widenScalarToNextPow2(0, 32) 588 .scalarize(0); 589 590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 591 .legalFor({S32}) 592 .maxScalarOrElt(0, S32); 593 594 if (ST.hasVOP3PInsts()) { 595 Mulh 596 .clampMaxNumElements(0, S8, 2) 597 .lowerFor({V2S8}); 598 } 599 600 Mulh 601 .scalarize(0) 602 .lower(); 603 604 // Report legal for any types we can handle anywhere. For the cases only legal 605 // on the SALU, RegBankSelect will be able to re-legalize. 606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 608 .clampScalar(0, S32, S64) 609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 611 .widenScalarToNextPow2(0) 612 .scalarize(0); 613 614 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 616 .legalFor({{S32, S1}, {S32, S32}}) 617 .minScalar(0, S32) 618 // TODO: .scalarize(0) 619 .lower(); 620 621 getActionDefinitionsBuilder(G_BITCAST) 622 // Don't worry about the size constraint. 623 .legalIf(all(isRegisterType(0), isRegisterType(1))) 624 .lower(); 625 626 627 getActionDefinitionsBuilder(G_CONSTANT) 628 .legalFor({S1, S32, S64, S16, GlobalPtr, 629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 630 .legalIf(isPointer(0)) 631 .clampScalar(0, S32, S64) 632 .widenScalarToNextPow2(0); 633 634 getActionDefinitionsBuilder(G_FCONSTANT) 635 .legalFor({S32, S64, S16}) 636 .clampScalar(0, S16, S64); 637 638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 639 .legalIf(isRegisterType(0)) 640 // s1 and s16 are special cases because they have legal operations on 641 // them, but don't really occupy registers in the normal way. 642 .legalFor({S1, S16}) 643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 644 .clampScalarOrElt(0, S32, MaxScalar) 645 .widenScalarToNextPow2(0, 32) 646 .clampMaxNumElements(0, S32, 16); 647 648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 649 650 // If the amount is divergent, we have to do a wave reduction to get the 651 // maximum value, so this is expanded during RegBankSelect. 
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 653 .legalFor({{PrivatePtr, S32}}); 654 655 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 656 .customIf(typeIsNot(0, PrivatePtr)); 657 658 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 659 660 auto &FPOpActions = getActionDefinitionsBuilder( 661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 662 .legalFor({S32, S64}); 663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 664 .customFor({S32, S64}); 665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 666 .customFor({S32, S64}); 667 668 if (ST.has16BitInsts()) { 669 if (ST.hasVOP3PInsts()) 670 FPOpActions.legalFor({S16, V2S16}); 671 else 672 FPOpActions.legalFor({S16}); 673 674 TrigActions.customFor({S16}); 675 FDIVActions.customFor({S16}); 676 } 677 678 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 680 681 if (ST.hasVOP3PInsts()) { 682 MinNumMaxNum.customFor(FPTypesPK16) 683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 684 .clampMaxNumElements(0, S16, 2) 685 .clampScalar(0, S16, S64) 686 .scalarize(0); 687 } else if (ST.has16BitInsts()) { 688 MinNumMaxNum.customFor(FPTypes16) 689 .clampScalar(0, S16, S64) 690 .scalarize(0); 691 } else { 692 MinNumMaxNum.customFor(FPTypesBase) 693 .clampScalar(0, S32, S64) 694 .scalarize(0); 695 } 696 697 if (ST.hasVOP3PInsts()) 698 FPOpActions.clampMaxNumElements(0, S16, 2); 699 700 FPOpActions 701 .scalarize(0) 702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 703 704 TrigActions 705 .scalarize(0) 706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 707 708 FDIVActions 709 .scalarize(0) 710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 711 712 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 713 .legalFor(FPTypesPK16) 714 .clampMaxNumElements(0, S16, 2) 715 .scalarize(0) 716 .clampScalar(0, S16, S64); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 720 .legalFor({S32, S64, S16}) 721 .scalarize(0) 722 .clampScalar(0, S16, S64); 723 } else { 724 getActionDefinitionsBuilder(G_FSQRT) 725 .legalFor({S32, S64}) 726 .scalarize(0) 727 .clampScalar(0, S32, S64); 728 729 if (ST.hasFractBug()) { 730 getActionDefinitionsBuilder(G_FFLOOR) 731 .customFor({S64}) 732 .legalFor({S32, S64}) 733 .scalarize(0) 734 .clampScalar(0, S32, S64); 735 } else { 736 getActionDefinitionsBuilder(G_FFLOOR) 737 .legalFor({S32, S64}) 738 .scalarize(0) 739 .clampScalar(0, S32, S64); 740 } 741 } 742 743 getActionDefinitionsBuilder(G_FPTRUNC) 744 .legalFor({{S32, S64}, {S16, S32}}) 745 .scalarize(0) 746 .lower(); 747 748 getActionDefinitionsBuilder(G_FPEXT) 749 .legalFor({{S64, S32}, {S32, S16}}) 750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 751 .scalarize(0); 752 753 getActionDefinitionsBuilder(G_FSUB) 754 // Use actual fsub instruction 755 .legalFor({S32}) 756 // Must use fadd + fneg 757 .lowerFor({S64, S16, V2S16}) 758 .scalarize(0) 759 .clampScalar(0, S32, S64); 760 761 // Whether this is legal depends on the floating point mode for the function. 
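  // (G_FMAD is only kept when denormals are flushed for the result type; see
  // legalizeFMad, which otherwise expands it to fmul + fadd.)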
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 763 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 764 FMad.customFor({S32, S16}); 765 else if (ST.hasMadMacF32Insts()) 766 FMad.customFor({S32}); 767 else if (ST.hasMadF16()) 768 FMad.customFor({S16}); 769 FMad.scalarize(0) 770 .lower(); 771 772 auto &FRem = getActionDefinitionsBuilder(G_FREM); 773 if (ST.has16BitInsts()) { 774 FRem.customFor({S16, S32, S64}); 775 } else { 776 FRem.minScalar(0, S32) 777 .customFor({S32, S64}); 778 } 779 FRem.scalarize(0); 780 781 // TODO: Do we need to clamp maximum bitwidth? 782 getActionDefinitionsBuilder(G_TRUNC) 783 .legalIf(isScalar(0)) 784 .legalFor({{V2S16, V2S32}}) 785 .clampMaxNumElements(0, S16, 2) 786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 787 // situations (like an invalid implicit use), we don't want to infinite loop 788 // in the legalizer. 789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 790 .alwaysLegal(); 791 792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 794 {S32, S1}, {S64, S1}, {S16, S1}}) 795 .scalarize(0) 796 .clampScalar(0, S32, S64) 797 .widenScalarToNextPow2(1, 32); 798 799 // TODO: Split s1->s64 during regbankselect for VALU. 800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 802 .lowerFor({{S32, S64}}) 803 .lowerIf(typeIs(1, S1)) 804 .customFor({{S64, S64}}); 805 if (ST.has16BitInsts()) 806 IToFP.legalFor({{S16, S16}}); 807 IToFP.clampScalar(1, S32, S64) 808 .minScalar(0, S32) 809 .scalarize(0) 810 .widenScalarToNextPow2(1); 811 812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 814 .customFor({{S64, S64}}) 815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 816 if (ST.has16BitInsts()) 817 FPToI.legalFor({{S16, S16}}); 818 else 819 FPToI.minScalar(1, S32); 820 821 FPToI.minScalar(0, S32) 822 .widenScalarToNextPow2(0, 32) 823 .scalarize(0) 824 .lower(); 825 826 // Lower roundeven into G_FRINT 827 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 828 .scalarize(0) 829 .lower(); 830 831 if (ST.has16BitInsts()) { 832 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 833 .legalFor({S16, S32, S64}) 834 .clampScalar(0, S16, S64) 835 .scalarize(0); 836 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 837 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 838 .legalFor({S32, S64}) 839 .clampScalar(0, S32, S64) 840 .scalarize(0); 841 } else { 842 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 843 .legalFor({S32}) 844 .customFor({S64}) 845 .clampScalar(0, S32, S64) 846 .scalarize(0); 847 } 848 849 getActionDefinitionsBuilder(G_PTR_ADD) 850 .legalIf(all(isPointer(0), sameSize(0, 1))) 851 .scalarize(0) 852 .scalarSameSizeAs(1, 0); 853 854 getActionDefinitionsBuilder(G_PTRMASK) 855 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 856 .scalarSameSizeAs(1, 0) 857 .scalarize(0); 858 859 auto &CmpBuilder = 860 getActionDefinitionsBuilder(G_ICMP) 861 // The compare output type differs based on the register bank of the output, 862 // so make both s1 and s32 legal. 863 // 864 // Scalar compares producing output in scc will be promoted to s32, as that 865 // is the allocatable register type that will be needed for the copy from 866 // scc. 
This will be promoted during RegBankSelect, and we assume something 867 // before that won't try to use s32 result types. 868 // 869 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 870 // bank. 871 .legalForCartesianProduct( 872 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 873 .legalForCartesianProduct( 874 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 875 if (ST.has16BitInsts()) { 876 CmpBuilder.legalFor({{S1, S16}}); 877 } 878 879 CmpBuilder 880 .widenScalarToNextPow2(1) 881 .clampScalar(1, S32, S64) 882 .scalarize(0) 883 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 884 885 getActionDefinitionsBuilder(G_FCMP) 886 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 887 .widenScalarToNextPow2(1) 888 .clampScalar(1, S32, S64) 889 .scalarize(0); 890 891 // FIXME: fpow has a selection pattern that should move to custom lowering. 892 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 893 if (ST.has16BitInsts()) 894 Exp2Ops.legalFor({S32, S16}); 895 else 896 Exp2Ops.legalFor({S32}); 897 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 898 Exp2Ops.scalarize(0); 899 900 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 901 if (ST.has16BitInsts()) 902 ExpOps.customFor({{S32}, {S16}}); 903 else 904 ExpOps.customFor({S32}); 905 ExpOps.clampScalar(0, MinScalarFPTy, S32) 906 .scalarize(0); 907 908 getActionDefinitionsBuilder(G_FPOWI) 909 .clampScalar(0, MinScalarFPTy, S32) 910 .lower(); 911 912 // The 64-bit versions produce 32-bit results, but only on the SALU. 913 getActionDefinitionsBuilder(G_CTPOP) 914 .legalFor({{S32, S32}, {S32, S64}}) 915 .clampScalar(0, S32, S32) 916 .clampScalar(1, S32, S64) 917 .scalarize(0) 918 .widenScalarToNextPow2(0, 32) 919 .widenScalarToNextPow2(1, 32); 920 921 // The hardware instructions return a different result on 0 than the generic 922 // instructions expect. The hardware produces -1, but these produce the 923 // bitwidth. 924 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 925 .scalarize(0) 926 .clampScalar(0, S32, S32) 927 .clampScalar(1, S32, S64) 928 .widenScalarToNextPow2(0, 32) 929 .widenScalarToNextPow2(1, 32) 930 .lower(); 931 932 // The 64-bit versions produce 32-bit results, but only on the SALU. 933 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 934 .legalFor({{S32, S32}, {S32, S64}}) 935 .clampScalar(0, S32, S32) 936 .clampScalar(1, S32, S64) 937 .scalarize(0) 938 .widenScalarToNextPow2(0, 32) 939 .widenScalarToNextPow2(1, 32); 940 941 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 942 // RegBankSelect. 943 getActionDefinitionsBuilder(G_BITREVERSE) 944 .legalFor({S32, S64}) 945 .clampScalar(0, S32, S64) 946 .scalarize(0) 947 .widenScalarToNextPow2(0); 948 949 if (ST.has16BitInsts()) { 950 getActionDefinitionsBuilder(G_BSWAP) 951 .legalFor({S16, S32, V2S16}) 952 .clampMaxNumElements(0, S16, 2) 953 // FIXME: Fixing non-power-of-2 before clamp is workaround for 954 // narrowScalar limitation. 
955 .widenScalarToNextPow2(0) 956 .clampScalar(0, S16, S32) 957 .scalarize(0); 958 959 if (ST.hasVOP3PInsts()) { 960 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 961 .legalFor({S32, S16, V2S16}) 962 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 963 .clampMaxNumElements(0, S16, 2) 964 .minScalar(0, S16) 965 .widenScalarToNextPow2(0) 966 .scalarize(0) 967 .lower(); 968 } else { 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 970 .legalFor({S32, S16}) 971 .widenScalarToNextPow2(0) 972 .minScalar(0, S16) 973 .scalarize(0) 974 .lower(); 975 } 976 } else { 977 // TODO: Should have same legality without v_perm_b32 978 getActionDefinitionsBuilder(G_BSWAP) 979 .legalFor({S32}) 980 .lowerIf(scalarNarrowerThan(0, 32)) 981 // FIXME: Fixing non-power-of-2 before clamp is workaround for 982 // narrowScalar limitation. 983 .widenScalarToNextPow2(0) 984 .maxScalar(0, S32) 985 .scalarize(0) 986 .lower(); 987 988 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 989 .legalFor({S32}) 990 .minScalar(0, S32) 991 .widenScalarToNextPow2(0) 992 .scalarize(0) 993 .lower(); 994 } 995 996 getActionDefinitionsBuilder(G_INTTOPTR) 997 // List the common cases 998 .legalForCartesianProduct(AddrSpaces64, {S64}) 999 .legalForCartesianProduct(AddrSpaces32, {S32}) 1000 .scalarize(0) 1001 // Accept any address space as long as the size matches 1002 .legalIf(sameSize(0, 1)) 1003 .widenScalarIf(smallerThan(1, 0), 1004 [](const LegalityQuery &Query) { 1005 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1006 }) 1007 .narrowScalarIf(largerThan(1, 0), 1008 [](const LegalityQuery &Query) { 1009 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1010 }); 1011 1012 getActionDefinitionsBuilder(G_PTRTOINT) 1013 // List the common cases 1014 .legalForCartesianProduct(AddrSpaces64, {S64}) 1015 .legalForCartesianProduct(AddrSpaces32, {S32}) 1016 .scalarize(0) 1017 // Accept any address space as long as the size matches 1018 .legalIf(sameSize(0, 1)) 1019 .widenScalarIf(smallerThan(0, 1), 1020 [](const LegalityQuery &Query) { 1021 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1022 }) 1023 .narrowScalarIf( 1024 largerThan(0, 1), 1025 [](const LegalityQuery &Query) { 1026 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1027 }); 1028 1029 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1030 .scalarize(0) 1031 .custom(); 1032 1033 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1034 bool IsLoad) -> bool { 1035 const LLT DstTy = Query.Types[0]; 1036 1037 // Split vector extloads. 1038 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1039 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1040 1041 if (MemSize < DstTy.getSizeInBits()) 1042 MemSize = std::max(MemSize, AlignBits); 1043 1044 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1045 return true; 1046 1047 const LLT PtrTy = Query.Types[1]; 1048 unsigned AS = PtrTy.getAddressSpace(); 1049 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1050 return true; 1051 1052 // Catch weird sized loads that don't evenly divide into the access sizes 1053 // TODO: May be able to widen depending on alignment etc. 1054 unsigned NumRegs = (MemSize + 31) / 32; 1055 if (NumRegs == 3) { 1056 if (!ST.hasDwordx3LoadStores()) 1057 return true; 1058 } else { 1059 // If the alignment allows, these should have been widened. 
1060 if (!isPowerOf2_32(NumRegs)) 1061 return true; 1062 } 1063 1064 if (AlignBits < MemSize) { 1065 const SITargetLowering *TLI = ST.getTargetLowering(); 1066 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1067 Align(AlignBits / 8)); 1068 } 1069 1070 return false; 1071 }; 1072 1073 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1074 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1075 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1076 1077 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1078 // LDS 1079 // TODO: Unsupported flat for SI. 1080 1081 for (unsigned Op : {G_LOAD, G_STORE}) { 1082 const bool IsStore = Op == G_STORE; 1083 1084 auto &Actions = getActionDefinitionsBuilder(Op); 1085 // Explicitly list some common cases. 1086 // TODO: Does this help compile time at all? 1087 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1088 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1089 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1090 {S64, GlobalPtr, 64, GlobalAlign32}, 1091 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1092 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1093 {S32, GlobalPtr, 8, GlobalAlign8}, 1094 {S32, GlobalPtr, 16, GlobalAlign16}, 1095 1096 {S32, LocalPtr, 32, 32}, 1097 {S64, LocalPtr, 64, 32}, 1098 {V2S32, LocalPtr, 64, 32}, 1099 {S32, LocalPtr, 8, 8}, 1100 {S32, LocalPtr, 16, 16}, 1101 {V2S16, LocalPtr, 32, 32}, 1102 1103 {S32, PrivatePtr, 32, 32}, 1104 {S32, PrivatePtr, 8, 8}, 1105 {S32, PrivatePtr, 16, 16}, 1106 {V2S16, PrivatePtr, 32, 32}, 1107 1108 {S32, ConstantPtr, 32, GlobalAlign32}, 1109 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1110 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1111 {S64, ConstantPtr, 64, GlobalAlign32}, 1112 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1113 Actions.legalIf( 1114 [=](const LegalityQuery &Query) -> bool { 1115 return isLoadStoreLegal(ST, Query, Op); 1116 }); 1117 1118 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1119 // 64-bits. 1120 // 1121 // TODO: Should generalize bitcast action into coerce, which will also cover 1122 // inserting addrspacecasts. 1123 Actions.customIf(typeIs(1, Constant32Ptr)); 1124 1125 // Turn any illegal element vectors into something easier to deal 1126 // with. These will ultimately produce 32-bit scalar shifts to extract the 1127 // parts anyway. 1128 // 1129 // For odd 16-bit element vectors, prefer to split those into pieces with 1130 // 16-bit vector parts. 1131 Actions.bitcastIf( 1132 [=](const LegalityQuery &Query) -> bool { 1133 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1134 Query.MMODescrs[0].SizeInBits); 1135 }, bitcastToRegisterType(0)); 1136 1137 if (!IsStore) { 1138 // Widen suitably aligned loads by loading extra bytes. The standard 1139 // legalization actions can't properly express widening memory operands. 
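      // The actual widening is done in the custom action (see legalizeLoad),
      // which emits the wider load and then truncates or extracts the
      // originally requested bits.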
1140 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1141 return shouldWidenLoad(ST, Query, G_LOAD); 1142 }); 1143 } 1144 1145 // FIXME: load/store narrowing should be moved to lower action 1146 Actions 1147 .narrowScalarIf( 1148 [=](const LegalityQuery &Query) -> bool { 1149 return !Query.Types[0].isVector() && 1150 needToSplitMemOp(Query, Op == G_LOAD); 1151 }, 1152 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1153 const LLT DstTy = Query.Types[0]; 1154 const LLT PtrTy = Query.Types[1]; 1155 1156 const unsigned DstSize = DstTy.getSizeInBits(); 1157 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1158 1159 // Split extloads. 1160 if (DstSize > MemSize) 1161 return std::make_pair(0, LLT::scalar(MemSize)); 1162 1163 if (!isPowerOf2_32(DstSize)) { 1164 // We're probably decomposing an odd sized store. Try to split 1165 // to the widest type. TODO: Account for alignment. As-is it 1166 // should be OK, since the new parts will be further legalized. 1167 unsigned FloorSize = PowerOf2Floor(DstSize); 1168 return std::make_pair(0, LLT::scalar(FloorSize)); 1169 } 1170 1171 if (DstSize > 32 && (DstSize % 32 != 0)) { 1172 // FIXME: Need a way to specify non-extload of larger size if 1173 // suitably aligned. 1174 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1175 } 1176 1177 unsigned MaxSize = maxSizeForAddrSpace(ST, 1178 PtrTy.getAddressSpace(), 1179 Op == G_LOAD); 1180 if (MemSize > MaxSize) 1181 return std::make_pair(0, LLT::scalar(MaxSize)); 1182 1183 unsigned Align = Query.MMODescrs[0].AlignInBits; 1184 return std::make_pair(0, LLT::scalar(Align)); 1185 }) 1186 .fewerElementsIf( 1187 [=](const LegalityQuery &Query) -> bool { 1188 return Query.Types[0].isVector() && 1189 needToSplitMemOp(Query, Op == G_LOAD); 1190 }, 1191 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1192 const LLT DstTy = Query.Types[0]; 1193 const LLT PtrTy = Query.Types[1]; 1194 1195 LLT EltTy = DstTy.getElementType(); 1196 unsigned MaxSize = maxSizeForAddrSpace(ST, 1197 PtrTy.getAddressSpace(), 1198 Op == G_LOAD); 1199 1200 // FIXME: Handle widened to power of 2 results better. This ends 1201 // up scalarizing. 1202 // FIXME: 3 element stores scalarized on SI 1203 1204 // Split if it's too large for the address space. 1205 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1206 unsigned NumElts = DstTy.getNumElements(); 1207 unsigned EltSize = EltTy.getSizeInBits(); 1208 1209 if (MaxSize % EltSize == 0) { 1210 return std::make_pair( 1211 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1212 } 1213 1214 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1215 1216 // FIXME: Refine when odd breakdowns handled 1217 // The scalars will need to be re-legalized. 1218 if (NumPieces == 1 || NumPieces >= NumElts || 1219 NumElts % NumPieces != 0) 1220 return std::make_pair(0, EltTy); 1221 1222 return std::make_pair(0, 1223 LLT::vector(NumElts / NumPieces, EltTy)); 1224 } 1225 1226 // FIXME: We could probably handle weird extending loads better. 1227 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1228 if (DstTy.getSizeInBits() > MemSize) 1229 return std::make_pair(0, EltTy); 1230 1231 unsigned EltSize = EltTy.getSizeInBits(); 1232 unsigned DstSize = DstTy.getSizeInBits(); 1233 if (!isPowerOf2_32(DstSize)) { 1234 // We're probably decomposing an odd sized store. Try to split 1235 // to the widest type. TODO: Account for alignment. As-is it 1236 // should be OK, since the new parts will be further legalized. 
1237 unsigned FloorSize = PowerOf2Floor(DstSize); 1238 return std::make_pair( 1239 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1240 } 1241 1242 // Need to split because of alignment. 1243 unsigned Align = Query.MMODescrs[0].AlignInBits; 1244 if (EltSize > Align && 1245 (EltSize / Align < DstTy.getNumElements())) { 1246 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1247 } 1248 1249 // May need relegalization for the scalars. 1250 return std::make_pair(0, EltTy); 1251 }) 1252 .lowerIfMemSizeNotPow2() 1253 .minScalar(0, S32); 1254 1255 if (IsStore) 1256 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1257 1258 Actions 1259 .widenScalarToNextPow2(0) 1260 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1261 .lower(); 1262 } 1263 1264 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1265 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1266 {S32, GlobalPtr, 16, 2 * 8}, 1267 {S32, LocalPtr, 8, 8}, 1268 {S32, LocalPtr, 16, 16}, 1269 {S32, PrivatePtr, 8, 8}, 1270 {S32, PrivatePtr, 16, 16}, 1271 {S32, ConstantPtr, 8, 8}, 1272 {S32, ConstantPtr, 16, 2 * 8}}); 1273 if (ST.hasFlatAddressSpace()) { 1274 ExtLoads.legalForTypesWithMemDesc( 1275 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1276 } 1277 1278 ExtLoads.clampScalar(0, S32, S32) 1279 .widenScalarToNextPow2(0) 1280 .unsupportedIfMemSizeNotPow2() 1281 .lower(); 1282 1283 auto &Atomics = getActionDefinitionsBuilder( 1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1287 G_ATOMICRMW_UMIN}) 1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1289 {S64, GlobalPtr}, {S64, LocalPtr}, 1290 {S32, RegionPtr}, {S64, RegionPtr}}); 1291 if (ST.hasFlatAddressSpace()) { 1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1293 } 1294 1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1296 if (ST.hasLDSFPAtomics()) { 1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1298 if (ST.hasGFX90AInsts()) 1299 Atomic.legalFor({{S64, LocalPtr}}); 1300 } 1301 if (ST.hasAtomicFaddInsts()) 1302 Atomic.legalFor({{S32, GlobalPtr}}); 1303 1304 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1305 // demarshalling 1306 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1307 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1308 {S32, FlatPtr}, {S64, FlatPtr}}) 1309 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1310 {S32, RegionPtr}, {S64, RegionPtr}}); 1311 // TODO: Pointer types, any 32-bit or 64-bit vector 1312 1313 // Condition should be s32 for scalar, s1 for vector. 1314 getActionDefinitionsBuilder(G_SELECT) 1315 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1316 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1317 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1318 .clampScalar(0, S16, S64) 1319 .scalarize(1) 1320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1321 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1322 .clampMaxNumElements(0, S32, 2) 1323 .clampMaxNumElements(0, LocalPtr, 2) 1324 .clampMaxNumElements(0, PrivatePtr, 2) 1325 .scalarize(0) 1326 .widenScalarToNextPow2(0) 1327 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1328 1329 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1330 // be more flexible with the shift amount type. 
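  // (16-bit shifts use the low 4 bits of the amount, 32-bit shifts the low 5,
  // and 64-bit shifts the low 6.)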
1331 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1332 .legalFor({{S32, S32}, {S64, S32}}); 1333 if (ST.has16BitInsts()) { 1334 if (ST.hasVOP3PInsts()) { 1335 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1336 .clampMaxNumElements(0, S16, 2); 1337 } else 1338 Shifts.legalFor({{S16, S16}}); 1339 1340 // TODO: Support 16-bit shift amounts for all types 1341 Shifts.widenScalarIf( 1342 [=](const LegalityQuery &Query) { 1343 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1344 // 32-bit amount. 1345 const LLT ValTy = Query.Types[0]; 1346 const LLT AmountTy = Query.Types[1]; 1347 return ValTy.getSizeInBits() <= 16 && 1348 AmountTy.getSizeInBits() < 16; 1349 }, changeTo(1, S16)); 1350 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1351 Shifts.clampScalar(1, S32, S32); 1352 Shifts.clampScalar(0, S16, S64); 1353 Shifts.widenScalarToNextPow2(0, 16); 1354 1355 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1356 .minScalar(0, S16) 1357 .scalarize(0) 1358 .lower(); 1359 } else { 1360 // Make sure we legalize the shift amount type first, as the general 1361 // expansion for the shifted type will produce much worse code if it hasn't 1362 // been truncated already. 1363 Shifts.clampScalar(1, S32, S32); 1364 Shifts.clampScalar(0, S32, S64); 1365 Shifts.widenScalarToNextPow2(0, 32); 1366 1367 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1368 .minScalar(0, S32) 1369 .scalarize(0) 1370 .lower(); 1371 } 1372 Shifts.scalarize(0); 1373 1374 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1375 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1376 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1377 unsigned IdxTypeIdx = 2; 1378 1379 getActionDefinitionsBuilder(Op) 1380 .customIf([=](const LegalityQuery &Query) { 1381 const LLT EltTy = Query.Types[EltTypeIdx]; 1382 const LLT VecTy = Query.Types[VecTypeIdx]; 1383 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1384 const unsigned EltSize = EltTy.getSizeInBits(); 1385 return (EltSize == 32 || EltSize == 64) && 1386 VecTy.getSizeInBits() % 32 == 0 && 1387 VecTy.getSizeInBits() <= MaxRegisterSize && 1388 IdxTy.getSizeInBits() == 32; 1389 }) 1390 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1391 bitcastToVectorElement32(VecTypeIdx)) 1392 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1393 .bitcastIf( 1394 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1395 [=](const LegalityQuery &Query) { 1396 // For > 64-bit element types, try to turn this into a 64-bit 1397 // element vector since we may be able to do better indexing 1398 // if this is scalar. If not, fall back to 32. 1399 const LLT EltTy = Query.Types[EltTypeIdx]; 1400 const LLT VecTy = Query.Types[VecTypeIdx]; 1401 const unsigned DstEltSize = EltTy.getSizeInBits(); 1402 const unsigned VecSize = VecTy.getSizeInBits(); 1403 1404 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1405 return std::make_pair( 1406 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1407 }) 1408 .clampScalar(EltTypeIdx, S32, S64) 1409 .clampScalar(VecTypeIdx, S32, S64) 1410 .clampScalar(IdxTypeIdx, S32, S32) 1411 .clampMaxNumElements(VecTypeIdx, S32, 32) 1412 // TODO: Clamp elements for 64-bit vectors? 1413 // It should only be necessary with variable indexes. 
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
legalizeUDIV_UREM(MI, MRI, B); 1707 case TargetOpcode::G_SDIV: 1708 case TargetOpcode::G_SREM: 1709 return legalizeSDIV_SREM(MI, MRI, B); 1710 case TargetOpcode::G_ATOMIC_CMPXCHG: 1711 return legalizeAtomicCmpXChg(MI, MRI, B); 1712 case TargetOpcode::G_FLOG: 1713 return legalizeFlog(MI, B, numbers::ln2f); 1714 case TargetOpcode::G_FLOG10: 1715 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1716 case TargetOpcode::G_FEXP: 1717 return legalizeFExp(MI, B); 1718 case TargetOpcode::G_FPOW: 1719 return legalizeFPow(MI, B); 1720 case TargetOpcode::G_FFLOOR: 1721 return legalizeFFloor(MI, MRI, B); 1722 case TargetOpcode::G_BUILD_VECTOR: 1723 return legalizeBuildVector(MI, MRI, B); 1724 default: 1725 return false; 1726 } 1727 1728 llvm_unreachable("expected switch to return"); 1729 } 1730 1731 Register AMDGPULegalizerInfo::getSegmentAperture( 1732 unsigned AS, 1733 MachineRegisterInfo &MRI, 1734 MachineIRBuilder &B) const { 1735 MachineFunction &MF = B.getMF(); 1736 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1737 const LLT S32 = LLT::scalar(32); 1738 1739 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1740 1741 if (ST.hasApertureRegs()) { 1742 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1743 // getreg. 1744 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1745 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1746 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1747 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1748 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1749 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1750 unsigned Encoding = 1751 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1752 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1753 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1754 1755 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1756 1757 B.buildInstr(AMDGPU::S_GETREG_B32) 1758 .addDef(GetReg) 1759 .addImm(Encoding); 1760 MRI.setType(GetReg, S32); 1761 1762 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1763 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1764 } 1765 1766 Register QueuePtr = MRI.createGenericVirtualRegister( 1767 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1768 1769 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1770 return Register(); 1771 1772 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1773 // private_segment_aperture_base_hi. 1774 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1775 1776 // TODO: can we be smarter about machine pointer info? 
1777 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1778 MachineMemOperand *MMO = MF.getMachineMemOperand( 1779 PtrInfo, 1780 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1781 MachineMemOperand::MOInvariant, 1782 4, commonAlignment(Align(64), StructOffset)); 1783 1784 Register LoadAddr; 1785 1786 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1787 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1788 } 1789 1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1791 MachineInstr &MI, MachineRegisterInfo &MRI, 1792 MachineIRBuilder &B) const { 1793 MachineFunction &MF = B.getMF(); 1794 1795 const LLT S32 = LLT::scalar(32); 1796 Register Dst = MI.getOperand(0).getReg(); 1797 Register Src = MI.getOperand(1).getReg(); 1798 1799 LLT DstTy = MRI.getType(Dst); 1800 LLT SrcTy = MRI.getType(Src); 1801 unsigned DestAS = DstTy.getAddressSpace(); 1802 unsigned SrcAS = SrcTy.getAddressSpace(); 1803 1804 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1805 // vector element. 1806 assert(!DstTy.isVector()); 1807 1808 const AMDGPUTargetMachine &TM 1809 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1810 1811 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1812 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1813 return true; 1814 } 1815 1816 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1817 // Truncate. 1818 B.buildExtract(Dst, Src, 0); 1819 MI.eraseFromParent(); 1820 return true; 1821 } 1822 1823 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1824 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1825 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1826 1827 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1828 // another. Merge operands are required to be the same type, but creating an 1829 // extra ptrtoint would be kind of pointless. 1830 auto HighAddr = B.buildConstant( 1831 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1832 B.buildMerge(Dst, {Src, HighAddr}); 1833 MI.eraseFromParent(); 1834 return true; 1835 } 1836 1837 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1838 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1839 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1840 unsigned NullVal = TM.getNullPointerValue(DestAS); 1841 1842 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1843 auto FlatNull = B.buildConstant(SrcTy, 0); 1844 1845 // Extract low 32-bits of the pointer. 1846 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1847 1848 auto CmpRes = 1849 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1850 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1851 1852 MI.eraseFromParent(); 1853 return true; 1854 } 1855 1856 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1857 return false; 1858 1859 if (!ST.hasFlatAddressSpace()) 1860 return false; 1861 1862 auto SegmentNull = 1863 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1864 auto FlatNull = 1865 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1866 1867 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1868 if (!ApertureReg.isValid()) 1869 return false; 1870 1871 auto CmpRes = 1872 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1873 1874 // Coerce the type of the low half of the result so we can use merge_values. 
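  // The overall segment -> flat expansion built here is roughly (an
  // illustrative sketch only, not the exact printed MIR):
  //   %lo   = G_PTRTOINT %src               ; low 32 bits of the segment pointer
  //   %flat = G_MERGE_VALUES %lo, %aperture ; aperture supplies the high half
  //   %dst  = select (%src != segment_null), %flat, flat_null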
1875 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1876 1877 // TODO: Should we allow mismatched types but matching sizes in merges to 1878 // avoid the ptrtoint? 1879 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1880 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1881 1882 MI.eraseFromParent(); 1883 return true; 1884 } 1885 1886 bool AMDGPULegalizerInfo::legalizeFrint( 1887 MachineInstr &MI, MachineRegisterInfo &MRI, 1888 MachineIRBuilder &B) const { 1889 Register Src = MI.getOperand(1).getReg(); 1890 LLT Ty = MRI.getType(Src); 1891 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1892 1893 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1894 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1895 1896 auto C1 = B.buildFConstant(Ty, C1Val); 1897 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1898 1899 // TODO: Should this propagate fast-math-flags? 1900 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1901 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1902 1903 auto C2 = B.buildFConstant(Ty, C2Val); 1904 auto Fabs = B.buildFAbs(Ty, Src); 1905 1906 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1907 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1908 MI.eraseFromParent(); 1909 return true; 1910 } 1911 1912 bool AMDGPULegalizerInfo::legalizeFceil( 1913 MachineInstr &MI, MachineRegisterInfo &MRI, 1914 MachineIRBuilder &B) const { 1915 1916 const LLT S1 = LLT::scalar(1); 1917 const LLT S64 = LLT::scalar(64); 1918 1919 Register Src = MI.getOperand(1).getReg(); 1920 assert(MRI.getType(Src) == S64); 1921 1922 // result = trunc(src) 1923 // if (src > 0.0 && src != result) 1924 // result += 1.0 1925 1926 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1927 1928 const auto Zero = B.buildFConstant(S64, 0.0); 1929 const auto One = B.buildFConstant(S64, 1.0); 1930 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1931 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1932 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1933 auto Add = B.buildSelect(S64, And, One, Zero); 1934 1935 // TODO: Should this propagate fast-math-flags? 
1936 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1937 return true; 1938 } 1939 1940 bool AMDGPULegalizerInfo::legalizeFrem( 1941 MachineInstr &MI, MachineRegisterInfo &MRI, 1942 MachineIRBuilder &B) const { 1943 Register DstReg = MI.getOperand(0).getReg(); 1944 Register Src0Reg = MI.getOperand(1).getReg(); 1945 Register Src1Reg = MI.getOperand(2).getReg(); 1946 auto Flags = MI.getFlags(); 1947 LLT Ty = MRI.getType(DstReg); 1948 1949 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1950 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1951 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1952 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1953 MI.eraseFromParent(); 1954 return true; 1955 } 1956 1957 static MachineInstrBuilder extractF64Exponent(Register Hi, 1958 MachineIRBuilder &B) { 1959 const unsigned FractBits = 52; 1960 const unsigned ExpBits = 11; 1961 LLT S32 = LLT::scalar(32); 1962 1963 auto Const0 = B.buildConstant(S32, FractBits - 32); 1964 auto Const1 = B.buildConstant(S32, ExpBits); 1965 1966 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1967 .addUse(Hi) 1968 .addUse(Const0.getReg(0)) 1969 .addUse(Const1.getReg(0)); 1970 1971 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1972 } 1973 1974 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1975 MachineInstr &MI, MachineRegisterInfo &MRI, 1976 MachineIRBuilder &B) const { 1977 const LLT S1 = LLT::scalar(1); 1978 const LLT S32 = LLT::scalar(32); 1979 const LLT S64 = LLT::scalar(64); 1980 1981 Register Src = MI.getOperand(1).getReg(); 1982 assert(MRI.getType(Src) == S64); 1983 1984 // TODO: Should this use extract since the low half is unused? 1985 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1986 Register Hi = Unmerge.getReg(1); 1987 1988 // Extract the upper half, since this is where we will find the sign and 1989 // exponent. 1990 auto Exp = extractF64Exponent(Hi, B); 1991 1992 const unsigned FractBits = 52; 1993 1994 // Extract the sign bit. 1995 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1996 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1997 1998 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1999 2000 const auto Zero32 = B.buildConstant(S32, 0); 2001 2002 // Extend back to 64-bits. 2003 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2004 2005 auto Shr = B.buildAShr(S64, FractMask, Exp); 2006 auto Not = B.buildNot(S64, Shr); 2007 auto Tmp0 = B.buildAnd(S64, Src, Not); 2008 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2009 2010 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2011 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2012 2013 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2014 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2015 MI.eraseFromParent(); 2016 return true; 2017 } 2018 2019 bool AMDGPULegalizerInfo::legalizeITOFP( 2020 MachineInstr &MI, MachineRegisterInfo &MRI, 2021 MachineIRBuilder &B, bool Signed) const { 2022 2023 Register Dst = MI.getOperand(0).getReg(); 2024 Register Src = MI.getOperand(1).getReg(); 2025 2026 const LLT S64 = LLT::scalar(64); 2027 const LLT S32 = LLT::scalar(32); 2028 2029 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2030 2031 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2032 2033 auto CvtHi = Signed ? 
2034 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2035 B.buildUITOFP(S64, Unmerge.getReg(1)); 2036 2037 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2038 2039 auto ThirtyTwo = B.buildConstant(S32, 32); 2040 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2041 .addUse(CvtHi.getReg(0)) 2042 .addUse(ThirtyTwo.getReg(0)); 2043 2044 // TODO: Should this propagate fast-math-flags? 2045 B.buildFAdd(Dst, LdExp, CvtLo); 2046 MI.eraseFromParent(); 2047 return true; 2048 } 2049 2050 // TODO: Copied from DAG implementation. Verify logic and document how this 2051 // actually works. 2052 bool AMDGPULegalizerInfo::legalizeFPTOI( 2053 MachineInstr &MI, MachineRegisterInfo &MRI, 2054 MachineIRBuilder &B, bool Signed) const { 2055 2056 Register Dst = MI.getOperand(0).getReg(); 2057 Register Src = MI.getOperand(1).getReg(); 2058 2059 const LLT S64 = LLT::scalar(64); 2060 const LLT S32 = LLT::scalar(32); 2061 2062 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2063 2064 unsigned Flags = MI.getFlags(); 2065 2066 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2067 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2068 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2069 2070 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2071 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2072 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2073 2074 auto Hi = Signed ? 2075 B.buildFPTOSI(S32, FloorMul) : 2076 B.buildFPTOUI(S32, FloorMul); 2077 auto Lo = B.buildFPTOUI(S32, Fma); 2078 2079 B.buildMerge(Dst, { Lo, Hi }); 2080 MI.eraseFromParent(); 2081 2082 return true; 2083 } 2084 2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2086 MachineInstr &MI) const { 2087 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2088 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2089 2090 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2091 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2092 2093 // With ieee_mode disabled, the instructions have the correct behavior 2094 // already for G_FMINNUM/G_FMAXNUM 2095 if (!MFI->getMode().IEEE) 2096 return !IsIEEEOp; 2097 2098 if (IsIEEEOp) 2099 return true; 2100 2101 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2105 MachineInstr &MI, MachineRegisterInfo &MRI, 2106 MachineIRBuilder &B) const { 2107 // TODO: Should move some of this into LegalizerHelper. 2108 2109 // TODO: Promote dynamic indexing of s16 to s32 2110 2111 // FIXME: Artifact combiner probably should have replaced the truncated 2112 // constant before this, so we shouldn't need 2113 // getConstantVRegValWithLookThrough. 2114 Optional<ValueAndVReg> MaybeIdxVal = 2115 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2116 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2117 return true; 2118 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2119 2120 Register Dst = MI.getOperand(0).getReg(); 2121 Register Vec = MI.getOperand(1).getReg(); 2122 2123 LLT VecTy = MRI.getType(Vec); 2124 LLT EltTy = VecTy.getElementType(); 2125 assert(EltTy == MRI.getType(Dst)); 2126 2127 if (IdxVal < VecTy.getNumElements()) 2128 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2129 else 2130 B.buildUndef(Dst); 2131 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2137 MachineInstr &MI, MachineRegisterInfo &MRI, 2138 MachineIRBuilder &B) const { 2139 // TODO: Should move some of this into LegalizerHelper. 2140 2141 // TODO: Promote dynamic indexing of s16 to s32 2142 2143 // FIXME: Artifact combiner probably should have replaced the truncated 2144 // constant before this, so we shouldn't need 2145 // getConstantVRegValWithLookThrough. 2146 Optional<ValueAndVReg> MaybeIdxVal = 2147 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2148 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2149 return true; 2150 2151 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2152 Register Dst = MI.getOperand(0).getReg(); 2153 Register Vec = MI.getOperand(1).getReg(); 2154 Register Ins = MI.getOperand(2).getReg(); 2155 2156 LLT VecTy = MRI.getType(Vec); 2157 LLT EltTy = VecTy.getElementType(); 2158 assert(EltTy == MRI.getType(Ins)); 2159 2160 if (IdxVal < VecTy.getNumElements()) 2161 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2162 else 2163 B.buildUndef(Dst); 2164 2165 MI.eraseFromParent(); 2166 return true; 2167 } 2168 2169 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2170 MachineInstr &MI, MachineRegisterInfo &MRI, 2171 MachineIRBuilder &B) const { 2172 const LLT V2S16 = LLT::vector(2, 16); 2173 2174 Register Dst = MI.getOperand(0).getReg(); 2175 Register Src0 = MI.getOperand(1).getReg(); 2176 LLT DstTy = MRI.getType(Dst); 2177 LLT SrcTy = MRI.getType(Src0); 2178 2179 if (SrcTy == V2S16 && DstTy == V2S16 && 2180 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2181 return true; 2182 2183 MachineIRBuilder HelperBuilder(MI); 2184 GISelObserverWrapper DummyObserver; 2185 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2186 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2187 } 2188 2189 bool AMDGPULegalizerInfo::legalizeSinCos( 2190 MachineInstr &MI, MachineRegisterInfo &MRI, 2191 MachineIRBuilder &B) const { 2192 2193 Register DstReg = MI.getOperand(0).getReg(); 2194 Register SrcReg = MI.getOperand(1).getReg(); 2195 LLT Ty = MRI.getType(DstReg); 2196 unsigned Flags = MI.getFlags(); 2197 2198 Register TrigVal; 2199 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2200 if (ST.hasTrigReducedRange()) { 2201 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2202 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2203 .addUse(MulVal.getReg(0)) 2204 .setMIFlags(Flags).getReg(0); 2205 } else 2206 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2207 2208 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2209 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2210 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2211 .addUse(TrigVal) 2212 .setMIFlags(Flags); 2213 MI.eraseFromParent(); 2214 return true; 2215 } 2216 2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2218 MachineIRBuilder &B, 2219 const GlobalValue *GV, 2220 int64_t Offset, 2221 unsigned GAFlags) const { 2222 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2223 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2224 // to the following code sequence: 2225 // 2226 // For constant address space: 2227 // s_getpc_b64 s[0:1] 2228 // s_add_u32 s0, s0, $symbol 2229 // s_addc_u32 s1, s1, 0 2230 // 2231 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2232 // a fixup or relocation is emitted to replace $symbol with a literal 2233 // constant, which is a pc-relative offset from the encoding of the $symbol 2234 // operand to the global variable. 2235 // 2236 // For global address space: 2237 // s_getpc_b64 s[0:1] 2238 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2239 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2240 // 2241 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2242 // fixups or relocations are emitted to replace $symbol@*@lo and 2243 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2244 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2245 // operand to the global variable. 2246 // 2247 // What we want here is an offset from the value returned by s_getpc 2248 // (which is the address of the s_add_u32 instruction) to the global 2249 // variable, but since the encoding of $symbol starts 4 bytes after the start 2250 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2251 // small. This requires us to add 4 to the global variable offset in order to 2252 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2253 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2254 // instruction. 2255 2256 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2257 2258 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
      DstReg :
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
        // zero-sized type in other languages) to declare dynamic shared
        // memory whose size is not known at compile time. It is allocated by
        // the runtime and placed directly after the statically allocated
        // objects; all such declarations share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const unsigned MemSize = 8 * MMO->getSize();
  const Align MemAlign = MMO->getAlign();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
2425 if (WideMemSize == ValSize) { 2426 MachineFunction &MF = B.getMF(); 2427 2428 MachineMemOperand *WideMMO = 2429 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2430 Observer.changingInstr(MI); 2431 MI.setMemRefs(MF, {WideMMO}); 2432 Observer.changedInstr(MI); 2433 return true; 2434 } 2435 2436 // Don't bother handling edge case that should probably never be produced. 2437 if (ValSize > WideMemSize) 2438 return false; 2439 2440 LLT WideTy = widenToNextPowerOf2(ValTy); 2441 2442 Register WideLoad; 2443 if (!WideTy.isVector()) { 2444 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2445 B.buildTrunc(ValReg, WideLoad).getReg(0); 2446 } else { 2447 // Extract the subvector. 2448 2449 if (isRegisterType(ValTy)) { 2450 // If this a case where G_EXTRACT is legal, use it. 2451 // (e.g. <3 x s32> -> <4 x s32>) 2452 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2453 B.buildExtract(ValReg, WideLoad, 0); 2454 } else { 2455 // For cases where the widened type isn't a nice register value, unmerge 2456 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2457 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2458 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2459 B.setInsertPt(B.getMBB(), MI.getIterator()); 2460 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2461 } 2462 } 2463 2464 MI.eraseFromParent(); 2465 return true; 2466 } 2467 2468 return false; 2469 } 2470 2471 bool AMDGPULegalizerInfo::legalizeFMad( 2472 MachineInstr &MI, MachineRegisterInfo &MRI, 2473 MachineIRBuilder &B) const { 2474 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2475 assert(Ty.isScalar()); 2476 2477 MachineFunction &MF = B.getMF(); 2478 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2479 2480 // TODO: Always legal with future ftz flag. 2481 // FIXME: Do we need just output? 
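  // In short: G_FMAD is kept as-is only when denormals for the result type
  // are flushed (f32 without FP32 denormals, f16 without FP64/FP16
  // denormals); otherwise it is expanded to fmul + fadd via lowerFMad below.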
2482 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2483 return true; 2484 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2485 return true; 2486 2487 MachineIRBuilder HelperBuilder(MI); 2488 GISelObserverWrapper DummyObserver; 2489 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2490 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2491 } 2492 2493 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2494 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2495 Register DstReg = MI.getOperand(0).getReg(); 2496 Register PtrReg = MI.getOperand(1).getReg(); 2497 Register CmpVal = MI.getOperand(2).getReg(); 2498 Register NewVal = MI.getOperand(3).getReg(); 2499 2500 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2501 "this should not have been custom lowered"); 2502 2503 LLT ValTy = MRI.getType(CmpVal); 2504 LLT VecTy = LLT::vector(2, ValTy); 2505 2506 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2507 2508 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2509 .addDef(DstReg) 2510 .addUse(PtrReg) 2511 .addUse(PackedVal) 2512 .setMemRefs(MI.memoperands()); 2513 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeFlog( 2519 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2520 Register Dst = MI.getOperand(0).getReg(); 2521 Register Src = MI.getOperand(1).getReg(); 2522 LLT Ty = B.getMRI()->getType(Dst); 2523 unsigned Flags = MI.getFlags(); 2524 2525 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2526 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2527 2528 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 2533 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2534 MachineIRBuilder &B) const { 2535 Register Dst = MI.getOperand(0).getReg(); 2536 Register Src = MI.getOperand(1).getReg(); 2537 unsigned Flags = MI.getFlags(); 2538 LLT Ty = B.getMRI()->getType(Dst); 2539 2540 auto K = B.buildFConstant(Ty, numbers::log2e); 2541 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2542 B.buildFExp2(Dst, Mul, Flags); 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 2547 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2548 MachineIRBuilder &B) const { 2549 Register Dst = MI.getOperand(0).getReg(); 2550 Register Src0 = MI.getOperand(1).getReg(); 2551 Register Src1 = MI.getOperand(2).getReg(); 2552 unsigned Flags = MI.getFlags(); 2553 LLT Ty = B.getMRI()->getType(Dst); 2554 const LLT S16 = LLT::scalar(16); 2555 const LLT S32 = LLT::scalar(32); 2556 2557 if (Ty == S32) { 2558 auto Log = B.buildFLog2(S32, Src0, Flags); 2559 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2560 .addUse(Log.getReg(0)) 2561 .addUse(Src1) 2562 .setMIFlags(Flags); 2563 B.buildFExp2(Dst, Mul, Flags); 2564 } else if (Ty == S16) { 2565 // There's no f16 fmul_legacy, so we need to convert for it. 
2566 auto Log = B.buildFLog2(S16, Src0, Flags); 2567 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2568 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2569 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2570 .addUse(Ext0.getReg(0)) 2571 .addUse(Ext1.getReg(0)) 2572 .setMIFlags(Flags); 2573 2574 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2575 } else 2576 return false; 2577 2578 MI.eraseFromParent(); 2579 return true; 2580 } 2581 2582 // Find a source register, ignoring any possible source modifiers. 2583 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2584 Register ModSrc = OrigSrc; 2585 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2586 ModSrc = SrcFNeg->getOperand(1).getReg(); 2587 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2588 ModSrc = SrcFAbs->getOperand(1).getReg(); 2589 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2590 ModSrc = SrcFAbs->getOperand(1).getReg(); 2591 return ModSrc; 2592 } 2593 2594 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2595 MachineRegisterInfo &MRI, 2596 MachineIRBuilder &B) const { 2597 2598 const LLT S1 = LLT::scalar(1); 2599 const LLT S64 = LLT::scalar(64); 2600 Register Dst = MI.getOperand(0).getReg(); 2601 Register OrigSrc = MI.getOperand(1).getReg(); 2602 unsigned Flags = MI.getFlags(); 2603 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2604 "this should not have been custom lowered"); 2605 2606 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2607 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2608 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2609 // V_FRACT bug is: 2610 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2611 // 2612 // Convert floor(x) to (x - fract(x)) 2613 2614 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2615 .addUse(OrigSrc) 2616 .setMIFlags(Flags); 2617 2618 // Give source modifier matching some assistance before obscuring a foldable 2619 // pattern. 2620 2621 // TODO: We can avoid the neg on the fract? The input sign to fract 2622 // shouldn't matter? 2623 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2624 2625 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2626 2627 Register Min = MRI.createGenericVirtualRegister(S64); 2628 2629 // We don't need to concern ourselves with the snan handling difference, so 2630 // use the one which will directly select. 2631 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2632 if (MFI->getMode().IEEE) 2633 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2634 else 2635 B.buildFMinNum(Min, Fract, Const, Flags); 2636 2637 Register CorrectedFract = Min; 2638 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2639 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2640 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2641 } 2642 2643 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2644 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2645 2646 MI.eraseFromParent(); 2647 return true; 2648 } 2649 2650 // Turn an illegal packed v2s16 build vector into bit operations. 2651 // TODO: This should probably be a bitcast action in LegalizerHelper. 
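// For a rough illustration of the intended expansion (a sketch, not the
// exact printed MIR):
//   %pack:_(s32)      = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %pack:_(s32)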
2652 bool AMDGPULegalizerInfo::legalizeBuildVector( 2653 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2654 Register Dst = MI.getOperand(0).getReg(); 2655 const LLT S32 = LLT::scalar(32); 2656 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2657 2658 Register Src0 = MI.getOperand(1).getReg(); 2659 Register Src1 = MI.getOperand(2).getReg(); 2660 assert(MRI.getType(Src0) == LLT::scalar(16)); 2661 2662 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2663 B.buildBitcast(Dst, Merge); 2664 2665 MI.eraseFromParent(); 2666 return true; 2667 } 2668 2669 // Check that this is a G_XOR x, -1 2670 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2671 if (MI.getOpcode() != TargetOpcode::G_XOR) 2672 return false; 2673 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2674 return ConstVal && *ConstVal == -1; 2675 } 2676 2677 // Return the use branch instruction, otherwise null if the usage is invalid. 2678 static MachineInstr * 2679 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2680 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2681 Register CondDef = MI.getOperand(0).getReg(); 2682 if (!MRI.hasOneNonDBGUse(CondDef)) 2683 return nullptr; 2684 2685 MachineBasicBlock *Parent = MI.getParent(); 2686 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2687 2688 if (isNot(MRI, *UseMI)) { 2689 Register NegatedCond = UseMI->getOperand(0).getReg(); 2690 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2691 return nullptr; 2692 2693 // We're deleting the def of this value, so we need to remove it. 2694 UseMI->eraseFromParent(); 2695 2696 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2697 Negated = true; 2698 } 2699 2700 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2701 return nullptr; 2702 2703 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2704 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2705 if (Next == Parent->end()) { 2706 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2707 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2708 return nullptr; 2709 UncondBrTarget = &*NextMBB; 2710 } else { 2711 if (Next->getOpcode() != AMDGPU::G_BR) 2712 return nullptr; 2713 Br = &*Next; 2714 UncondBrTarget = Br->getOperand(0).getMBB(); 2715 } 2716 2717 return UseMI; 2718 } 2719 2720 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2721 const ArgDescriptor *Arg, 2722 const TargetRegisterClass *ArgRC, 2723 LLT ArgTy) const { 2724 MCRegister SrcReg = Arg->getRegister(); 2725 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2726 assert(DstReg.isVirtual() && "Virtual register expected"); 2727 2728 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2729 ArgTy); 2730 if (Arg->isMasked()) { 2731 // TODO: Should we try to emit this once in the entry block? 
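    // Worked example with a hypothetical mask (for illustration only): if
    // Mask = 0x3ff << 10, then Shift = 10 and the code below produces
    //   %tmp = G_LSHR %LiveIn, 10
    //   %dst = G_AND  %tmp, 0x3ff
    // i.e. Mask >> Shift re-bases the mask after the shift.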
2732 const LLT S32 = LLT::scalar(32); 2733 const unsigned Mask = Arg->getMask(); 2734 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2735 2736 Register AndMaskSrc = LiveIn; 2737 2738 if (Shift != 0) { 2739 auto ShiftAmt = B.buildConstant(S32, Shift); 2740 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2741 } 2742 2743 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2744 } else { 2745 B.buildCopy(DstReg, LiveIn); 2746 } 2747 2748 return true; 2749 } 2750 2751 bool AMDGPULegalizerInfo::loadInputValue( 2752 Register DstReg, MachineIRBuilder &B, 2753 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2754 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2755 const ArgDescriptor *Arg; 2756 const TargetRegisterClass *ArgRC; 2757 LLT ArgTy; 2758 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2759 2760 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2761 return false; // TODO: Handle these 2762 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2763 } 2764 2765 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2766 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2767 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2768 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2769 return false; 2770 2771 MI.eraseFromParent(); 2772 return true; 2773 } 2774 2775 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2776 MachineRegisterInfo &MRI, 2777 MachineIRBuilder &B) const { 2778 Register Dst = MI.getOperand(0).getReg(); 2779 LLT DstTy = MRI.getType(Dst); 2780 LLT S16 = LLT::scalar(16); 2781 LLT S32 = LLT::scalar(32); 2782 LLT S64 = LLT::scalar(64); 2783 2784 if (DstTy == S16) 2785 return legalizeFDIV16(MI, MRI, B); 2786 if (DstTy == S32) 2787 return legalizeFDIV32(MI, MRI, B); 2788 if (DstTy == S64) 2789 return legalizeFDIV64(MI, MRI, B); 2790 2791 return false; 2792 } 2793 2794 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2795 Register DstReg, 2796 Register X, 2797 Register Y, 2798 bool IsDiv) const { 2799 const LLT S1 = LLT::scalar(1); 2800 const LLT S32 = LLT::scalar(32); 2801 2802 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2803 // algorithm used here. 2804 2805 // Initial estimate of inv(y). 2806 auto FloatY = B.buildUITOFP(S32, Y); 2807 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2808 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2809 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2810 auto Z = B.buildFPTOUI(S32, ScaledY); 2811 2812 // One round of UNR. 2813 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2814 auto NegYZ = B.buildMul(S32, NegY, Z); 2815 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2816 2817 // Quotient/remainder estimate. 2818 auto Q = B.buildUMulH(S32, X, Z); 2819 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2820 2821 // First quotient/remainder refinement. 2822 auto One = B.buildConstant(S32, 1); 2823 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2824 if (IsDiv) 2825 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2826 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2827 2828 // Second quotient/remainder refinement. 
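  // If the initial estimate was short by two (presumably the worst case this
  // sequence is designed for), the first refinement still leaves R >= Y, and
  // this second compare/select pair applies the final +1 / -Y correction.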
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
    B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi =
UnmergeNumer.getReg(1); 2924 2925 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2926 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2927 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2928 Register Mul3_Lo = UnmergeMul3.getReg(0); 2929 Register Mul3_Hi = UnmergeMul3.getReg(1); 2930 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2931 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2932 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2933 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2934 2935 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2936 Register DenomLo = UnmergeDenom.getReg(0); 2937 Register DenomHi = UnmergeDenom.getReg(1); 2938 2939 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2940 auto C1 = B.buildSExt(S32, CmpHi); 2941 2942 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2943 auto C2 = B.buildSExt(S32, CmpLo); 2944 2945 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2946 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2947 2948 // TODO: Here and below portions of the code can be enclosed into if/endif. 2949 // Currently control flow is unconditional and we have 4 selects after 2950 // potential endif to substitute PHIs. 2951 2952 // if C3 != 0 ... 2953 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2954 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2955 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2956 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2957 2958 auto One64 = B.buildConstant(S64, 1); 2959 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2960 2961 auto C4 = 2962 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2963 auto C5 = 2964 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2965 auto C6 = B.buildSelect( 2966 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2967 2968 // if (C6 != 0) 2969 auto Add4 = B.buildAdd(S64, Add3, One64); 2970 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2971 2972 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2973 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2974 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2975 2976 // endif C6 2977 // endif C3 2978 2979 if (IsDiv) { 2980 auto Sel1 = B.buildSelect( 2981 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2982 B.buildSelect(DstReg, 2983 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2984 } else { 2985 auto Sel2 = B.buildSelect( 2986 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2987 B.buildSelect(DstReg, 2988 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2989 } 2990 } 2991 2992 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2993 MachineRegisterInfo &MRI, 2994 MachineIRBuilder &B) const { 2995 const LLT S64 = LLT::scalar(64); 2996 const LLT S32 = LLT::scalar(32); 2997 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2998 Register DstReg = MI.getOperand(0).getReg(); 2999 Register Num = MI.getOperand(1).getReg(); 3000 Register Den = MI.getOperand(2).getReg(); 3001 LLT Ty = MRI.getType(DstReg); 3002 3003 if (Ty == S32) 3004 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3005 else if (Ty == S64) 3006 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3007 else 3008 return false; 3009 3010 MI.eraseFromParent(); 3011 return true; 3012 3013 } 3014 3015 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 
3016 MachineRegisterInfo &MRI, 3017 MachineIRBuilder &B) const { 3018 const LLT S64 = LLT::scalar(64); 3019 const LLT S32 = LLT::scalar(32); 3020 3021 Register DstReg = MI.getOperand(0).getReg(); 3022 const LLT Ty = MRI.getType(DstReg); 3023 if (Ty != S32 && Ty != S64) 3024 return false; 3025 3026 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3027 3028 Register LHS = MI.getOperand(1).getReg(); 3029 Register RHS = MI.getOperand(2).getReg(); 3030 3031 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3032 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3033 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3034 3035 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3036 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3037 3038 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3039 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3040 3041 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3042 if (Ty == S32) 3043 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3044 else 3045 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3046 3047 Register Sign; 3048 if (IsDiv) 3049 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3050 else 3051 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3052 3053 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3054 B.buildSub(DstReg, UDivRem, Sign); 3055 3056 MI.eraseFromParent(); 3057 return true; 3058 } 3059 3060 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3061 MachineRegisterInfo &MRI, 3062 MachineIRBuilder &B) const { 3063 Register Res = MI.getOperand(0).getReg(); 3064 Register LHS = MI.getOperand(1).getReg(); 3065 Register RHS = MI.getOperand(2).getReg(); 3066 uint16_t Flags = MI.getFlags(); 3067 LLT ResTy = MRI.getType(Res); 3068 3069 const MachineFunction &MF = B.getMF(); 3070 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3071 MI.getFlag(MachineInstr::FmAfn); 3072 3073 if (!AllowInaccurateRcp) 3074 return false; 3075 3076 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3077 // 1 / x -> RCP(x) 3078 if (CLHS->isExactlyValue(1.0)) { 3079 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3080 .addUse(RHS) 3081 .setMIFlags(Flags); 3082 3083 MI.eraseFromParent(); 3084 return true; 3085 } 3086 3087 // -1 / x -> RCP( FNEG(x) ) 3088 if (CLHS->isExactlyValue(-1.0)) { 3089 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3090 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3091 .addUse(FNeg.getReg(0)) 3092 .setMIFlags(Flags); 3093 3094 MI.eraseFromParent(); 3095 return true; 3096 } 3097 } 3098 3099 // x / y -> x * (1.0 / y) 3100 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3101 .addUse(RHS) 3102 .setMIFlags(Flags); 3103 B.buildFMul(Res, LHS, RCP, Flags); 3104 3105 MI.eraseFromParent(); 3106 return true; 3107 } 3108 3109 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3110 MachineRegisterInfo &MRI, 3111 MachineIRBuilder &B) const { 3112 Register Res = MI.getOperand(0).getReg(); 3113 Register X = MI.getOperand(1).getReg(); 3114 Register Y = MI.getOperand(2).getReg(); 3115 uint16_t Flags = MI.getFlags(); 3116 LLT ResTy = MRI.getType(Res); 3117 3118 const MachineFunction &MF = B.getMF(); 3119 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3120 MI.getFlag(MachineInstr::FmAfn); 3121 3122 if (!AllowInaccurateRcp) 3123 return false; 3124 3125 auto NegY = B.buildFNeg(ResTy, Y); 3126 auto One = B.buildFConstant(ResTy, 1.0); 3127 3128 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3129 .addUse(Y) 3130 
.setMIFlags(Flags); 3131 3132 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3133 R = B.buildFMA(ResTy, Tmp0, R, R); 3134 3135 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3136 R = B.buildFMA(ResTy, Tmp1, R, R); 3137 3138 auto Ret = B.buildFMul(ResTy, X, R); 3139 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3140 3141 B.buildFMA(Res, Tmp2, R, Ret); 3142 MI.eraseFromParent(); 3143 return true; 3144 } 3145 3146 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3147 MachineRegisterInfo &MRI, 3148 MachineIRBuilder &B) const { 3149 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3150 return true; 3151 3152 Register Res = MI.getOperand(0).getReg(); 3153 Register LHS = MI.getOperand(1).getReg(); 3154 Register RHS = MI.getOperand(2).getReg(); 3155 3156 uint16_t Flags = MI.getFlags(); 3157 3158 LLT S16 = LLT::scalar(16); 3159 LLT S32 = LLT::scalar(32); 3160 3161 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3162 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3163 3164 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3165 .addUse(RHSExt.getReg(0)) 3166 .setMIFlags(Flags); 3167 3168 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3169 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3170 3171 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3172 .addUse(RDst.getReg(0)) 3173 .addUse(RHS) 3174 .addUse(LHS) 3175 .setMIFlags(Flags); 3176 3177 MI.eraseFromParent(); 3178 return true; 3179 } 3180 3181 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3182 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3183 static void toggleSPDenormMode(bool Enable, 3184 MachineIRBuilder &B, 3185 const GCNSubtarget &ST, 3186 AMDGPU::SIModeRegisterDefaults Mode) { 3187 // Set SP denorm mode to this value. 3188 unsigned SPDenormMode = 3189 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3190 3191 if (ST.hasDenormModeInst()) { 3192 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3193 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3194 3195 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3196 B.buildInstr(AMDGPU::S_DENORM_MODE) 3197 .addImm(NewDenormModeValue); 3198 3199 } else { 3200 // Select FP32 bit field in mode register. 
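    // The bit field operand built below is effectively hwreg(HW_REG_MODE, 4, 2):
    // the two FP32 denorm-control bits live at bit offset 4 of the MODE
    // register (WIDTH_M1 = 1 encodes a field width of 2).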
3201 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3202 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3203 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3204 3205 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3206 .addImm(SPDenormMode) 3207 .addImm(SPDenormModeBitField); 3208 } 3209 } 3210 3211 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3212 MachineRegisterInfo &MRI, 3213 MachineIRBuilder &B) const { 3214 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3215 return true; 3216 3217 Register Res = MI.getOperand(0).getReg(); 3218 Register LHS = MI.getOperand(1).getReg(); 3219 Register RHS = MI.getOperand(2).getReg(); 3220 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3221 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3222 3223 uint16_t Flags = MI.getFlags(); 3224 3225 LLT S32 = LLT::scalar(32); 3226 LLT S1 = LLT::scalar(1); 3227 3228 auto One = B.buildFConstant(S32, 1.0f); 3229 3230 auto DenominatorScaled = 3231 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3232 .addUse(LHS) 3233 .addUse(RHS) 3234 .addImm(0) 3235 .setMIFlags(Flags); 3236 auto NumeratorScaled = 3237 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3238 .addUse(LHS) 3239 .addUse(RHS) 3240 .addImm(1) 3241 .setMIFlags(Flags); 3242 3243 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3244 .addUse(DenominatorScaled.getReg(0)) 3245 .setMIFlags(Flags); 3246 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3247 3248 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3249 // aren't modeled as reading it. 3250 if (!Mode.allFP32Denormals()) 3251 toggleSPDenormMode(true, B, ST, Mode); 3252 3253 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3254 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3255 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3256 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3257 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3258 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3259 3260 if (!Mode.allFP32Denormals()) 3261 toggleSPDenormMode(false, B, ST, Mode); 3262 3263 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3264 .addUse(Fma4.getReg(0)) 3265 .addUse(Fma1.getReg(0)) 3266 .addUse(Fma3.getReg(0)) 3267 .addUse(NumeratorScaled.getReg(1)) 3268 .setMIFlags(Flags); 3269 3270 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3271 .addUse(Fmas.getReg(0)) 3272 .addUse(RHS) 3273 .addUse(LHS) 3274 .setMIFlags(Flags); 3275 3276 MI.eraseFromParent(); 3277 return true; 3278 } 3279 3280 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3281 MachineRegisterInfo &MRI, 3282 MachineIRBuilder &B) const { 3283 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3284 return true; 3285 3286 Register Res = MI.getOperand(0).getReg(); 3287 Register LHS = MI.getOperand(1).getReg(); 3288 Register RHS = MI.getOperand(2).getReg(); 3289 3290 uint16_t Flags = MI.getFlags(); 3291 3292 LLT S64 = LLT::scalar(64); 3293 LLT S1 = LLT::scalar(1); 3294 3295 auto One = B.buildFConstant(S64, 1.0); 3296 3297 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3298 .addUse(LHS) 3299 .addUse(RHS) 3300 .addImm(0) 3301 .setMIFlags(Flags); 3302 3303 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3304 3305 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3306 .addUse(DivScale0.getReg(0)) 3307 .setMIFlags(Flags); 3308 3309 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3310 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3311 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3312 3313 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3314 .addUse(LHS) 3315 .addUse(RHS) 3316 .addImm(1) 3317 .setMIFlags(Flags); 3318 3319 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3320 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3321 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3322 3323 Register Scale; 3324 if (!ST.hasUsableDivScaleConditionOutput()) { 3325 // Workaround a hardware bug on SI where the condition output from div_scale 3326 // is not usable. 3327 3328 LLT S32 = LLT::scalar(32); 3329 3330 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3331 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3332 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3333 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3334 3335 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3336 Scale1Unmerge.getReg(1)); 3337 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3338 Scale0Unmerge.getReg(1)); 3339 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3340 } else { 3341 Scale = DivScale1.getReg(1); 3342 } 3343 3344 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3345 .addUse(Fma4.getReg(0)) 3346 .addUse(Fma3.getReg(0)) 3347 .addUse(Mul.getReg(0)) 3348 .addUse(Scale) 3349 .setMIFlags(Flags); 3350 3351 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3352 .addUse(Fmas.getReg(0)) 3353 .addUse(RHS) 3354 .addUse(LHS) 3355 .setMIFlags(Flags); 3356 3357 MI.eraseFromParent(); 3358 return true; 3359 } 3360 3361 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3362 MachineRegisterInfo &MRI, 3363 MachineIRBuilder &B) const { 3364 Register Res = MI.getOperand(0).getReg(); 3365 Register LHS = MI.getOperand(2).getReg(); 3366 Register RHS = MI.getOperand(3).getReg(); 3367 uint16_t Flags = MI.getFlags(); 3368 3369 LLT S32 = LLT::scalar(32); 3370 LLT S1 = LLT::scalar(1); 3371 3372 auto Abs = B.buildFAbs(S32, RHS, Flags); 3373 const APFloat C0Val(1.0f); 3374 3375 auto C0 = B.buildConstant(S32, 0x6f800000); 3376 auto C1 = B.buildConstant(S32, 0x2f800000); 3377 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3378 3379 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3380 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3381 3382 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3383 3384 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3385 .addUse(Mul0.getReg(0)) 3386 .setMIFlags(Flags); 3387 3388 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3389 3390 B.buildFMul(Res, Sel, Mul1, Flags); 3391 3392 MI.eraseFromParent(); 3393 return true; 3394 } 3395 3396 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3397 // FIXME: Why do we handle this one but not other removed instructions? 3398 // 3399 // Reciprocal square root. The clamp prevents infinite results, clamping 3400 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3401 // +-max_float. 
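// Subtargets older than VOLCANIC_ISLANDS still have the native clamped
// instruction and are left untouched. For newer subtargets the expansion
// below emits a plain amdgcn.rsq and then clamps the result into
// [-max_float, +max_float] with fminnum/fmaxnum, picking the IEEE variants
// when the function runs in IEEE mode so the clamp maps directly onto the
// instructions selection expects.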
3402 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3403 MachineRegisterInfo &MRI, 3404 MachineIRBuilder &B) const { 3405 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3406 return true; 3407 3408 Register Dst = MI.getOperand(0).getReg(); 3409 Register Src = MI.getOperand(2).getReg(); 3410 auto Flags = MI.getFlags(); 3411 3412 LLT Ty = MRI.getType(Dst); 3413 3414 const fltSemantics *FltSemantics; 3415 if (Ty == LLT::scalar(32)) 3416 FltSemantics = &APFloat::IEEEsingle(); 3417 else if (Ty == LLT::scalar(64)) 3418 FltSemantics = &APFloat::IEEEdouble(); 3419 else 3420 return false; 3421 3422 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3423 .addUse(Src) 3424 .setMIFlags(Flags); 3425 3426 // We don't need to concern ourselves with the snan handling difference, since 3427 // the rsq quieted (or not) so use the one which will directly select. 3428 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3429 const bool UseIEEE = MFI->getMode().IEEE; 3430 3431 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3432 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3433 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3434 3435 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3436 3437 if (UseIEEE) 3438 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3439 else 3440 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3441 MI.eraseFromParent(); 3442 return true; 3443 } 3444 3445 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3446 switch (IID) { 3447 case Intrinsic::amdgcn_ds_fadd: 3448 return AMDGPU::G_ATOMICRMW_FADD; 3449 case Intrinsic::amdgcn_ds_fmin: 3450 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3451 case Intrinsic::amdgcn_ds_fmax: 3452 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3453 default: 3454 llvm_unreachable("not a DS FP intrinsic"); 3455 } 3456 } 3457 3458 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3459 MachineInstr &MI, 3460 Intrinsic::ID IID) const { 3461 GISelChangeObserver &Observer = Helper.Observer; 3462 Observer.changingInstr(MI); 3463 3464 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3465 3466 // The remaining operands were used to set fields in the MemOperand on 3467 // construction. 3468 for (int I = 6; I > 3; --I) 3469 MI.RemoveOperand(I); 3470 3471 MI.RemoveOperand(1); // Remove the intrinsic ID. 
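  // What is left is (dst, ptr, value), matching the operand order of the
  // G_ATOMICRMW_FADD / G_AMDGPU_ATOMIC_FMIN / G_AMDGPU_ATOMIC_FMAX pseudos;
  // the trailing operands removed above only existed to describe the memory
  // operand.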
3472 Observer.changedInstr(MI); 3473 return true; 3474 } 3475 3476 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3477 MachineRegisterInfo &MRI, 3478 MachineIRBuilder &B) const { 3479 uint64_t Offset = 3480 ST.getTargetLowering()->getImplicitParameterOffset( 3481 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3482 LLT DstTy = MRI.getType(DstReg); 3483 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3484 3485 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3486 if (!loadInputValue(KernargPtrReg, B, 3487 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3488 return false; 3489 3490 // FIXME: This should be nuw 3491 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3492 return true; 3493 } 3494 3495 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3496 MachineRegisterInfo &MRI, 3497 MachineIRBuilder &B) const { 3498 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3499 if (!MFI->isEntryFunction()) { 3500 return legalizePreloadedArgIntrin(MI, MRI, B, 3501 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3502 } 3503 3504 Register DstReg = MI.getOperand(0).getReg(); 3505 if (!getImplicitArgPtr(DstReg, MRI, B)) 3506 return false; 3507 3508 MI.eraseFromParent(); 3509 return true; 3510 } 3511 3512 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3513 MachineRegisterInfo &MRI, 3514 MachineIRBuilder &B, 3515 unsigned AddrSpace) const { 3516 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3517 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3518 Register Hi32 = Unmerge.getReg(1); 3519 3520 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3521 MI.eraseFromParent(); 3522 return true; 3523 } 3524 3525 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3526 // offset (the offset that is included in bounds checking and swizzling, to be 3527 // split between the instruction's voffset and immoffset fields) and soffset 3528 // (the offset that is excluded from bounds checking and swizzling, to go in 3529 // the instruction's soffset field). This function takes the first kind of 3530 // offset and figures out how to split it between voffset and immoffset. 3531 std::tuple<Register, unsigned, unsigned> 3532 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3533 Register OrigOffset) const { 3534 const unsigned MaxImm = 4095; 3535 Register BaseReg; 3536 unsigned TotalConstOffset; 3537 const LLT S32 = LLT::scalar(32); 3538 MachineRegisterInfo &MRI = *B.getMRI(); 3539 3540 std::tie(BaseReg, TotalConstOffset) = 3541 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3542 3543 unsigned ImmOffset = TotalConstOffset; 3544 3545 // If BaseReg is a pointer, convert it to int. 3546 if (MRI.getType(BaseReg).isPointer()) 3547 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3548 3549 // If the immediate value is too big for the immoffset field, put the value 3550 // and -4096 into the immoffset field so that the value that is copied/added 3551 // for the voffset field is a multiple of 4096, and it stands more chance 3552 // of being CSEd with the copy/add for another similar load/store. 3553 // However, do not do that rounding down to a multiple of 4096 if that is a 3554 // negative number, as it appears to be illegal to have a negative offset 3555 // in the vgpr, even if adding the immediate offset makes it positive. 
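  // Illustrative example: with MaxImm = 4095, a constant offset of 5000
  // (0x1388) splits into Overflow = 4096, which is added to the voffset
  // register below, and ImmOffset = 904, which stays in the immediate field.
  // If the rounded-down part would be negative as a signed 32-bit value, the
  // whole constant is moved into the register instead and ImmOffset becomes 0.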
3556 unsigned Overflow = ImmOffset & ~MaxImm; 3557 ImmOffset -= Overflow; 3558 if ((int32_t)Overflow < 0) { 3559 Overflow += ImmOffset; 3560 ImmOffset = 0; 3561 } 3562 3563 if (Overflow != 0) { 3564 if (!BaseReg) { 3565 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3566 } else { 3567 auto OverflowVal = B.buildConstant(S32, Overflow); 3568 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3569 } 3570 } 3571 3572 if (!BaseReg) 3573 BaseReg = B.buildConstant(S32, 0).getReg(0); 3574 3575 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3576 } 3577 3578 /// Handle register layout difference for f16 images for some subtargets. 3579 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3580 MachineRegisterInfo &MRI, 3581 Register Reg, 3582 bool ImageStore) const { 3583 const LLT S16 = LLT::scalar(16); 3584 const LLT S32 = LLT::scalar(32); 3585 LLT StoreVT = MRI.getType(Reg); 3586 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3587 3588 if (ST.hasUnpackedD16VMem()) { 3589 auto Unmerge = B.buildUnmerge(S16, Reg); 3590 3591 SmallVector<Register, 4> WideRegs; 3592 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3593 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3594 3595 int NumElts = StoreVT.getNumElements(); 3596 3597 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3598 } 3599 3600 if (ImageStore && ST.hasImageStoreD16Bug()) { 3601 if (StoreVT.getNumElements() == 2) { 3602 SmallVector<Register, 4> PackedRegs; 3603 Reg = B.buildBitcast(S32, Reg).getReg(0); 3604 PackedRegs.push_back(Reg); 3605 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3606 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3607 } 3608 3609 if (StoreVT.getNumElements() == 3) { 3610 SmallVector<Register, 4> PackedRegs; 3611 auto Unmerge = B.buildUnmerge(S16, Reg); 3612 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3613 PackedRegs.push_back(Unmerge.getReg(I)); 3614 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3615 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3616 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3617 } 3618 3619 if (StoreVT.getNumElements() == 4) { 3620 SmallVector<Register, 4> PackedRegs; 3621 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3622 auto Unmerge = B.buildUnmerge(S32, Reg); 3623 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3624 PackedRegs.push_back(Unmerge.getReg(I)); 3625 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3626 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3627 } 3628 3629 llvm_unreachable("invalid data type"); 3630 } 3631 3632 return Reg; 3633 } 3634 3635 Register AMDGPULegalizerInfo::fixStoreSourceType( 3636 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3637 MachineRegisterInfo *MRI = B.getMRI(); 3638 LLT Ty = MRI->getType(VData); 3639 3640 const LLT S16 = LLT::scalar(16); 3641 3642 // Fixup illegal register types for i8 stores. 
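  // e.g. an s8 or s16 store payload is widened here with G_ANYEXT to s32; the
  // byte/short buffer store variants selected later only use the low bits of
  // that register.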
3643 if (Ty == LLT::scalar(8) || Ty == S16) { 3644 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3645 return AnyExt; 3646 } 3647 3648 if (Ty.isVector()) { 3649 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3650 if (IsFormat) 3651 return handleD16VData(B, *MRI, VData); 3652 } 3653 } 3654 3655 return VData; 3656 } 3657 3658 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3659 MachineRegisterInfo &MRI, 3660 MachineIRBuilder &B, 3661 bool IsTyped, 3662 bool IsFormat) const { 3663 Register VData = MI.getOperand(1).getReg(); 3664 LLT Ty = MRI.getType(VData); 3665 LLT EltTy = Ty.getScalarType(); 3666 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3667 const LLT S32 = LLT::scalar(32); 3668 3669 VData = fixStoreSourceType(B, VData, IsFormat); 3670 Register RSrc = MI.getOperand(2).getReg(); 3671 3672 MachineMemOperand *MMO = *MI.memoperands_begin(); 3673 const int MemSize = MMO->getSize(); 3674 3675 unsigned ImmOffset; 3676 unsigned TotalOffset; 3677 3678 // The typed intrinsics add an immediate after the registers. 3679 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3680 3681 // The struct intrinsic variants add one additional operand over raw. 3682 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3683 Register VIndex; 3684 int OpOffset = 0; 3685 if (HasVIndex) { 3686 VIndex = MI.getOperand(3).getReg(); 3687 OpOffset = 1; 3688 } 3689 3690 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3691 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3692 3693 unsigned Format = 0; 3694 if (IsTyped) { 3695 Format = MI.getOperand(5 + OpOffset).getImm(); 3696 ++OpOffset; 3697 } 3698 3699 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3700 3701 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3702 if (TotalOffset != 0) 3703 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3704 3705 unsigned Opc; 3706 if (IsTyped) { 3707 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3708 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3709 } else if (IsFormat) { 3710 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3711 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3712 } else { 3713 switch (MemSize) { 3714 case 1: 3715 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3716 break; 3717 case 2: 3718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3719 break; 3720 default: 3721 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3722 break; 3723 } 3724 } 3725 3726 if (!VIndex) 3727 VIndex = B.buildConstant(S32, 0).getReg(0); 3728 3729 auto MIB = B.buildInstr(Opc) 3730 .addUse(VData) // vdata 3731 .addUse(RSrc) // rsrc 3732 .addUse(VIndex) // vindex 3733 .addUse(VOffset) // voffset 3734 .addUse(SOffset) // soffset 3735 .addImm(ImmOffset); // offset(imm) 3736 3737 if (IsTyped) 3738 MIB.addImm(Format); 3739 3740 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3741 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3742 .addMemOperand(MMO); 3743 3744 MI.eraseFromParent(); 3745 return true; 3746 } 3747 3748 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3749 MachineRegisterInfo &MRI, 3750 MachineIRBuilder &B, 3751 bool IsFormat, 3752 bool IsTyped) const { 3753 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
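  // Operand layout assumed by the indexing below (raw form; the struct form
  // inserts a vindex after the rsrc): dst, intrinsic ID, rsrc, [vindex,]
  // voffset, soffset, [format for typed loads,] aux.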
3754 MachineMemOperand *MMO = *MI.memoperands_begin(); 3755 const int MemSize = MMO->getSize(); 3756 const LLT S32 = LLT::scalar(32); 3757 3758 Register Dst = MI.getOperand(0).getReg(); 3759 Register RSrc = MI.getOperand(2).getReg(); 3760 3761 // The typed intrinsics add an immediate after the registers. 3762 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3763 3764 // The struct intrinsic variants add one additional operand over raw. 3765 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3766 Register VIndex; 3767 int OpOffset = 0; 3768 if (HasVIndex) { 3769 VIndex = MI.getOperand(3).getReg(); 3770 OpOffset = 1; 3771 } 3772 3773 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3774 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3775 3776 unsigned Format = 0; 3777 if (IsTyped) { 3778 Format = MI.getOperand(5 + OpOffset).getImm(); 3779 ++OpOffset; 3780 } 3781 3782 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3783 unsigned ImmOffset; 3784 unsigned TotalOffset; 3785 3786 LLT Ty = MRI.getType(Dst); 3787 LLT EltTy = Ty.getScalarType(); 3788 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3789 const bool Unpacked = ST.hasUnpackedD16VMem(); 3790 3791 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3792 if (TotalOffset != 0) 3793 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3794 3795 unsigned Opc; 3796 3797 if (IsTyped) { 3798 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3799 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3800 } else if (IsFormat) { 3801 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3802 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3803 } else { 3804 switch (MemSize) { 3805 case 1: 3806 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3807 break; 3808 case 2: 3809 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3810 break; 3811 default: 3812 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3813 break; 3814 } 3815 } 3816 3817 Register LoadDstReg; 3818 3819 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3820 LLT UnpackedTy = Ty.changeElementSize(32); 3821 3822 if (IsExtLoad) 3823 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3824 else if (Unpacked && IsD16 && Ty.isVector()) 3825 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3826 else 3827 LoadDstReg = Dst; 3828 3829 if (!VIndex) 3830 VIndex = B.buildConstant(S32, 0).getReg(0); 3831 3832 auto MIB = B.buildInstr(Opc) 3833 .addDef(LoadDstReg) // vdata 3834 .addUse(RSrc) // rsrc 3835 .addUse(VIndex) // vindex 3836 .addUse(VOffset) // voffset 3837 .addUse(SOffset) // soffset 3838 .addImm(ImmOffset); // offset(imm) 3839 3840 if (IsTyped) 3841 MIB.addImm(Format); 3842 3843 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3844 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3845 .addMemOperand(MMO); 3846 3847 if (LoadDstReg != Dst) { 3848 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3849 3850 // Widen result for extending loads was widened. 
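    // (The load above was emitted into a wider 32-bit register; truncate it
    // back down to the narrow result type the intrinsic asked for.)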
3851 if (IsExtLoad) 3852 B.buildTrunc(Dst, LoadDstReg); 3853 else { 3854 // Repack to original 16-bit vector result 3855 // FIXME: G_TRUNC should work, but legalization currently fails 3856 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3857 SmallVector<Register, 4> Repack; 3858 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3859 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3860 B.buildMerge(Dst, Repack); 3861 } 3862 } 3863 3864 MI.eraseFromParent(); 3865 return true; 3866 } 3867 3868 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3869 MachineIRBuilder &B, 3870 bool IsInc) const { 3871 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3872 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3873 B.buildInstr(Opc) 3874 .addDef(MI.getOperand(0).getReg()) 3875 .addUse(MI.getOperand(2).getReg()) 3876 .addUse(MI.getOperand(3).getReg()) 3877 .cloneMemRefs(MI); 3878 MI.eraseFromParent(); 3879 return true; 3880 } 3881 3882 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3883 switch (IntrID) { 3884 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3885 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3886 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3887 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3888 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3889 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3890 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3891 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3893 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3894 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3896 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3897 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3899 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3900 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3902 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3903 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3905 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3906 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3908 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3909 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3911 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3912 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3913 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3914 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3915 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3916 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3917 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3918 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3920 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3921 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3923 case Intrinsic::amdgcn_buffer_atomic_fadd: 3924 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3925 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3926 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3927 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3928 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3930 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3931 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3932 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3933 default: 3934 llvm_unreachable("unhandled atomic opcode"); 3935 } 3936 } 3937 3938 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3939 MachineIRBuilder &B, 3940 Intrinsic::ID IID) const { 3941 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3942 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3943 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3944 3945 Register Dst; 3946 3947 int OpOffset = 0; 3948 if (HasReturn) { 3949 // A few FP atomics do not support return values. 3950 Dst = MI.getOperand(0).getReg(); 3951 } else { 3952 OpOffset = -1; 3953 } 3954 3955 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3956 Register CmpVal; 3957 3958 if (IsCmpSwap) { 3959 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3960 ++OpOffset; 3961 } 3962 3963 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3964 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3965 3966 // The struct intrinsic variants add one additional operand over raw. 3967 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3968 Register VIndex; 3969 if (HasVIndex) { 3970 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3971 ++OpOffset; 3972 } 3973 3974 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3975 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3976 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3977 3978 MachineMemOperand *MMO = *MI.memoperands_begin(); 3979 3980 unsigned ImmOffset; 3981 unsigned TotalOffset; 3982 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3983 if (TotalOffset != 0) 3984 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3985 3986 if (!VIndex) 3987 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3988 3989 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3990 3991 if (HasReturn) 3992 MIB.addDef(Dst); 3993 3994 MIB.addUse(VData); // vdata 3995 3996 if (IsCmpSwap) 3997 MIB.addReg(CmpVal); 3998 3999 MIB.addUse(RSrc) // rsrc 4000 .addUse(VIndex) // vindex 4001 .addUse(VOffset) // voffset 4002 .addUse(SOffset) // soffset 4003 .addImm(ImmOffset) // offset(imm) 4004 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4005 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4006 .addMemOperand(MMO); 4007 4008 MI.eraseFromParent(); 4009 return true; 4010 } 4011 4012 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4013 /// vector with s16 typed elements. 4014 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4015 SmallVectorImpl<Register> &PackedAddrs, 4016 unsigned ArgOffset, 4017 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4018 bool IsA16, bool IsG16) { 4019 const LLT S16 = LLT::scalar(16); 4020 const LLT V2S16 = LLT::vector(2, 16); 4021 auto EndIdx = Intr->VAddrEnd; 4022 4023 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4024 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4025 if (!SrcOp.isReg()) 4026 continue; // _L to _LZ may have eliminated this. 
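    // From here on the operand is a live address component: operands before
    // the gradients are bitcast to v2s16 and passed through, gradient or
    // coordinate operands that stay 32-bit are pushed unchanged, and the
    // remaining 16-bit operands are packed in pairs below, padding the last
    // odd element with undef.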
4027 4028 Register AddrReg = SrcOp.getReg(); 4029 4030 if (I < Intr->GradientStart) { 4031 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4032 PackedAddrs.push_back(AddrReg); 4033 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4034 (I >= Intr->CoordStart && !IsA16)) { 4035 // Handle any gradient or coordinate operands that should not be packed 4036 PackedAddrs.push_back(AddrReg); 4037 } else { 4038 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4039 // derivatives dx/dh and dx/dv are packed with undef. 4040 if (((I + 1) >= EndIdx) || 4041 ((Intr->NumGradients / 2) % 2 == 1 && 4042 (I == static_cast<unsigned>(Intr->GradientStart + 4043 (Intr->NumGradients / 2) - 1) || 4044 I == static_cast<unsigned>(Intr->GradientStart + 4045 Intr->NumGradients - 1))) || 4046 // Check for _L to _LZ optimization 4047 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4048 PackedAddrs.push_back( 4049 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4050 .getReg(0)); 4051 } else { 4052 PackedAddrs.push_back( 4053 B.buildBuildVector( 4054 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4055 .getReg(0)); 4056 ++I; 4057 } 4058 } 4059 } 4060 } 4061 4062 /// Convert from separate vaddr components to a single vector address register, 4063 /// and replace the remaining operands with $noreg. 4064 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4065 int DimIdx, int NumVAddrs) { 4066 const LLT S32 = LLT::scalar(32); 4067 4068 SmallVector<Register, 8> AddrRegs; 4069 for (int I = 0; I != NumVAddrs; ++I) { 4070 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4071 if (SrcOp.isReg()) { 4072 AddrRegs.push_back(SrcOp.getReg()); 4073 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4074 } 4075 } 4076 4077 int NumAddrRegs = AddrRegs.size(); 4078 if (NumAddrRegs != 1) { 4079 // Round up to 8 elements for v5-v7 4080 // FIXME: Missing intermediate sized register classes and instructions. 4081 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4082 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4083 auto Undef = B.buildUndef(S32); 4084 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4085 NumAddrRegs = RoundedNumRegs; 4086 } 4087 4088 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4089 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4090 } 4091 4092 for (int I = 1; I != NumVAddrs; ++I) { 4093 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4094 if (SrcOp.isReg()) 4095 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4096 } 4097 } 4098 4099 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4100 /// 4101 /// Depending on the subtarget, load/store with 16-bit element data need to be 4102 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4103 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4104 /// registers. 4105 /// 4106 /// We don't want to directly select image instructions just yet, but also want 4107 /// to exposes all register repacking to the legalizer/combiners. We also don't 4108 /// want a selected instrution entering RegBankSelect. In order to avoid 4109 /// defining a multitude of intermediate image instructions, directly hack on 4110 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4111 /// now unnecessary arguments with $noreg. 
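/// In practice the rewrite below switches the instruction to
/// G_AMDGPU_INTRIN_IMAGE_LOAD / G_AMDGPU_INTRIN_IMAGE_STORE, packs 16-bit
/// address and gradient operands into v2s16 pieces when A16/G16 are in use,
/// folds constant-zero LOD forms (_l -> _lz, _mip with lod 0), and widens or
/// repacks D16 and TFE results into the register layout selection expects.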
4112 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4113 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4114 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4115 4116 const unsigned NumDefs = MI.getNumExplicitDefs(); 4117 const unsigned ArgOffset = NumDefs + 1; 4118 bool IsTFE = NumDefs == 2; 4119 // We are only processing the operands of d16 image operations on subtargets 4120 // that use the unpacked register layout, or need to repack the TFE result. 4121 4122 // TODO: Do we need to guard against already legalized intrinsics? 4123 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4124 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4125 4126 MachineRegisterInfo *MRI = B.getMRI(); 4127 const LLT S32 = LLT::scalar(32); 4128 const LLT S16 = LLT::scalar(16); 4129 const LLT V2S16 = LLT::vector(2, 16); 4130 4131 unsigned DMask = 0; 4132 4133 // Check for 16 bit addresses and pack if true. 4134 LLT GradTy = 4135 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4136 LLT AddrTy = 4137 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4138 const bool IsG16 = GradTy == S16; 4139 const bool IsA16 = AddrTy == S16; 4140 4141 int DMaskLanes = 0; 4142 if (!BaseOpcode->Atomic) { 4143 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4144 if (BaseOpcode->Gather4) { 4145 DMaskLanes = 4; 4146 } else if (DMask != 0) { 4147 DMaskLanes = countPopulation(DMask); 4148 } else if (!IsTFE && !BaseOpcode->Store) { 4149 // If dmask is 0, this is a no-op load. This can be eliminated. 4150 B.buildUndef(MI.getOperand(0)); 4151 MI.eraseFromParent(); 4152 return true; 4153 } 4154 } 4155 4156 Observer.changingInstr(MI); 4157 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4158 4159 unsigned NewOpcode = NumDefs == 0 ? 4160 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4161 4162 // Track that we legalized this 4163 MI.setDesc(B.getTII().get(NewOpcode)); 4164 4165 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4166 // dmask to be at least 1 otherwise the instruction will fail 4167 if (IsTFE && DMask == 0) { 4168 DMask = 0x1; 4169 DMaskLanes = 1; 4170 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4171 } 4172 4173 if (BaseOpcode->Atomic) { 4174 Register VData0 = MI.getOperand(2).getReg(); 4175 LLT Ty = MRI->getType(VData0); 4176 4177 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4178 if (Ty.isVector()) 4179 return false; 4180 4181 if (BaseOpcode->AtomicX2) { 4182 Register VData1 = MI.getOperand(3).getReg(); 4183 // The two values are packed in one register. 4184 LLT PackedTy = LLT::vector(2, Ty); 4185 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4186 MI.getOperand(2).setReg(Concat.getReg(0)); 4187 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4188 } 4189 } 4190 4191 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4192 4193 // Optimize _L to _LZ when _L is zero 4194 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4195 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4196 const ConstantFP *ConstantLod; 4197 4198 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4199 m_GFCst(ConstantLod))) { 4200 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4201 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
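        // e.g. image.sample.l with a known lod of 0.0 becomes image.sample.lz
        // and the now-redundant LOD operand is dropped.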
4202 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4203 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4204 Intr->Dim); 4205 4206 // The starting indexes should remain in the same place. 4207 --CorrectedNumVAddrs; 4208 4209 MI.getOperand(MI.getNumExplicitDefs()) 4210 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4211 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4212 Intr = NewImageDimIntr; 4213 } 4214 } 4215 } 4216 4217 // Optimize _mip away, when 'lod' is zero 4218 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4219 int64_t ConstantLod; 4220 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4221 m_ICst(ConstantLod))) { 4222 if (ConstantLod == 0) { 4223 // TODO: Change intrinsic opcode and remove operand instead or replacing 4224 // it with 0, as the _L to _LZ handling is done above. 4225 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4226 --CorrectedNumVAddrs; 4227 } 4228 } 4229 } 4230 4231 // Rewrite the addressing register layout before doing anything else. 4232 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4233 // 16 bit gradients are supported, but are tied to the A16 control 4234 // so both gradients and addresses must be 16 bit 4235 return false; 4236 } 4237 4238 if (IsA16 && !ST.hasA16()) { 4239 // A16 not supported 4240 return false; 4241 } 4242 4243 if (IsA16 || IsG16) { 4244 if (Intr->NumVAddrs > 1) { 4245 SmallVector<Register, 4> PackedRegs; 4246 4247 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4248 IsG16); 4249 4250 // See also below in the non-a16 branch 4251 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4252 4253 if (!UseNSA && PackedRegs.size() > 1) { 4254 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4255 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4256 PackedRegs[0] = Concat.getReg(0); 4257 PackedRegs.resize(1); 4258 } 4259 4260 const unsigned NumPacked = PackedRegs.size(); 4261 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4262 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4263 if (!SrcOp.isReg()) { 4264 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4265 continue; 4266 } 4267 4268 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4269 4270 if (I - Intr->VAddrStart < NumPacked) 4271 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4272 else 4273 SrcOp.setReg(AMDGPU::NoRegister); 4274 } 4275 } 4276 } else { 4277 // If the register allocator cannot place the address registers contiguously 4278 // without introducing moves, then using the non-sequential address encoding 4279 // is always preferable, since it saves VALU instructions and is usually a 4280 // wash in terms of code size or even better. 4281 // 4282 // However, we currently have no way of hinting to the register allocator 4283 // that MIMG addresses should be placed contiguously when it is possible to 4284 // do so, so force non-NSA for the common 2-address case as a heuristic. 4285 // 4286 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4287 // allocation when possible. 4288 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4289 4290 if (!UseNSA && Intr->NumVAddrs > 1) 4291 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4292 Intr->NumVAddrs); 4293 } 4294 4295 int Flags = 0; 4296 if (IsA16) 4297 Flags |= 1; 4298 if (IsG16) 4299 Flags |= 2; 4300 MI.addOperand(MachineOperand::CreateImm(Flags)); 4301 4302 if (BaseOpcode->Store) { // No TFE for stores? 
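    // Store path: only D16 (16-bit element) payloads may need repacking here,
    // via handleD16VData below; anything else is already in its final form.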
4303 // TODO: Handle dmask trim 4304 Register VData = MI.getOperand(1).getReg(); 4305 LLT Ty = MRI->getType(VData); 4306 if (!Ty.isVector() || Ty.getElementType() != S16) 4307 return true; 4308 4309 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4310 if (RepackedReg != VData) { 4311 MI.getOperand(1).setReg(RepackedReg); 4312 } 4313 4314 return true; 4315 } 4316 4317 Register DstReg = MI.getOperand(0).getReg(); 4318 LLT Ty = MRI->getType(DstReg); 4319 const LLT EltTy = Ty.getScalarType(); 4320 const bool IsD16 = Ty.getScalarType() == S16; 4321 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4322 4323 // Confirm that the return type is large enough for the dmask specified 4324 if (NumElts < DMaskLanes) 4325 return false; 4326 4327 if (NumElts > 4 || DMaskLanes > 4) 4328 return false; 4329 4330 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4331 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4332 4333 // The raw dword aligned data component of the load. The only legal cases 4334 // where this matters should be when using the packed D16 format, for 4335 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4336 LLT RoundedTy; 4337 4338 // S32 vector to to cover all data, plus TFE result element. 4339 LLT TFETy; 4340 4341 // Register type to use for each loaded component. Will be S32 or V2S16. 4342 LLT RegTy; 4343 4344 if (IsD16 && ST.hasUnpackedD16VMem()) { 4345 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4346 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4347 RegTy = S32; 4348 } else { 4349 unsigned EltSize = EltTy.getSizeInBits(); 4350 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4351 unsigned RoundedSize = 32 * RoundedElts; 4352 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4353 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4354 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4355 } 4356 4357 // The return type does not need adjustment. 4358 // TODO: Should we change s16 case to s32 or <2 x s16>? 4359 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4360 return true; 4361 4362 Register Dst1Reg; 4363 4364 // Insert after the instruction. 4365 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4366 4367 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4368 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4369 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4370 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4371 4372 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4373 4374 MI.getOperand(0).setReg(NewResultReg); 4375 4376 // In the IR, TFE is supposed to be used with a 2 element struct return 4377 // type. The intruction really returns these two values in one contiguous 4378 // register, with one additional dword beyond the loaded data. Rewrite the 4379 // return type to use a single register result. 4380 4381 if (IsTFE) { 4382 Dst1Reg = MI.getOperand(1).getReg(); 4383 if (MRI->getType(Dst1Reg) != S32) 4384 return false; 4385 4386 // TODO: Make sure the TFE operand bit is set. 4387 MI.RemoveOperand(1); 4388 4389 // Handle the easy case that requires no repack instructions. 4390 if (Ty == S32) { 4391 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4392 return true; 4393 } 4394 } 4395 4396 // Now figure out how to copy the new result register back into the old 4397 // result. 4398 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4399 4400 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4401 4402 if (ResultNumRegs == 1) { 4403 assert(!IsTFE); 4404 ResultRegs[0] = NewResultReg; 4405 } else { 4406 // We have to repack into a new vector of some kind. 4407 for (int I = 0; I != NumDataRegs; ++I) 4408 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4409 B.buildUnmerge(ResultRegs, NewResultReg); 4410 4411 // Drop the final TFE element to get the data part. The TFE result is 4412 // directly written to the right place already. 4413 if (IsTFE) 4414 ResultRegs.resize(NumDataRegs); 4415 } 4416 4417 // For an s16 scalar result, we form an s32 result with a truncate regardless 4418 // of packed vs. unpacked. 4419 if (IsD16 && !Ty.isVector()) { 4420 B.buildTrunc(DstReg, ResultRegs[0]); 4421 return true; 4422 } 4423 4424 // Avoid a build/concat_vector of 1 entry. 4425 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4426 B.buildBitcast(DstReg, ResultRegs[0]); 4427 return true; 4428 } 4429 4430 assert(Ty.isVector()); 4431 4432 if (IsD16) { 4433 // For packed D16 results with TFE enabled, all the data components are 4434 // S32. Cast back to the expected type. 4435 // 4436 // TODO: We don't really need to use load s32 elements. We would only need one 4437 // cast for the TFE result if a multiple of v2s16 was used. 4438 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4439 for (Register &Reg : ResultRegs) 4440 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4441 } else if (ST.hasUnpackedD16VMem()) { 4442 for (Register &Reg : ResultRegs) 4443 Reg = B.buildTrunc(S16, Reg).getReg(0); 4444 } 4445 } 4446 4447 auto padWithUndef = [&](LLT Ty, int NumElts) { 4448 if (NumElts == 0) 4449 return; 4450 Register Undef = B.buildUndef(Ty).getReg(0); 4451 for (int I = 0; I != NumElts; ++I) 4452 ResultRegs.push_back(Undef); 4453 }; 4454 4455 // Pad out any elements eliminated due to the dmask. 4456 LLT ResTy = MRI->getType(ResultRegs[0]); 4457 if (!ResTy.isVector()) { 4458 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4459 B.buildBuildVector(DstReg, ResultRegs); 4460 return true; 4461 } 4462 4463 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4464 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4465 4466 // Deal with the one annoying legal case. 4467 const LLT V3S16 = LLT::vector(3, 16); 4468 if (Ty == V3S16) { 4469 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4470 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4471 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4472 return true; 4473 } 4474 4475 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4476 B.buildConcatVectors(DstReg, ResultRegs); 4477 return true; 4478 } 4479 4480 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4481 LegalizerHelper &Helper, MachineInstr &MI) const { 4482 MachineIRBuilder &B = Helper.MIRBuilder; 4483 GISelChangeObserver &Observer = Helper.Observer; 4484 4485 Register Dst = MI.getOperand(0).getReg(); 4486 LLT Ty = B.getMRI()->getType(Dst); 4487 unsigned Size = Ty.getSizeInBits(); 4488 MachineFunction &MF = B.getMF(); 4489 4490 Observer.changingInstr(MI); 4491 4492 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4493 Ty = getBitcastRegisterType(Ty); 4494 Helper.bitcastDst(MI, Ty, 0); 4495 Dst = MI.getOperand(0).getReg(); 4496 B.setInsertPt(B.getMBB(), MI); 4497 } 4498 4499 // FIXME: We don't really need this intermediate instruction. The intrinsic 4500 // should be fixed to have a memory operand. Since it's readnone, we're not 4501 // allowed to add one. 
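  // Instead, the intrinsic is rewritten to G_AMDGPU_S_BUFFER_LOAD below and an
  // invariant, dereferenceable load MemOperand is attached to that pseudo by
  // hand so downstream passes still see the memory access.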
4502 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4503 MI.RemoveOperand(1); // Remove intrinsic ID 4504 4505 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4506 // TODO: Should this use datalayout alignment? 4507 const unsigned MemSize = (Size + 7) / 8; 4508 const Align MemAlign(4); 4509 MachineMemOperand *MMO = MF.getMachineMemOperand( 4510 MachinePointerInfo(), 4511 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4512 MachineMemOperand::MOInvariant, 4513 MemSize, MemAlign); 4514 MI.addMemOperand(MF, MMO); 4515 4516 // There are no 96-bit result scalar loads, but widening to 128-bit should 4517 // always be legal. We may need to restore this to a 96-bit result if it turns 4518 // out this needs to be converted to a vector load during RegBankSelect. 4519 if (!isPowerOf2_32(Size)) { 4520 if (Ty.isVector()) 4521 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4522 else 4523 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4524 } 4525 4526 Observer.changedInstr(MI); 4527 return true; 4528 } 4529 4530 // TODO: Move to selection 4531 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4532 MachineRegisterInfo &MRI, 4533 MachineIRBuilder &B) const { 4534 if (!ST.isTrapHandlerEnabled() || 4535 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4536 return legalizeTrapEndpgm(MI, MRI, B); 4537 4538 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4539 switch (*HsaAbiVer) { 4540 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4541 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4542 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4543 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4544 return ST.supportsGetDoorbellID() ? 4545 legalizeTrapHsa(MI, MRI, B) : 4546 legalizeTrapHsaQueuePtr(MI, MRI, B); 4547 } 4548 } 4549 4550 llvm_unreachable("Unknown trap handler"); 4551 } 4552 4553 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4554 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4555 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4556 MI.eraseFromParent(); 4557 return true; 4558 } 4559 4560 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4561 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4562 // Pass queue pointer to trap handler as input, and insert trap instruction 4563 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4564 Register LiveIn = 4565 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4566 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4567 return false; 4568 4569 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4570 B.buildCopy(SGPR01, LiveIn); 4571 B.buildInstr(AMDGPU::S_TRAP) 4572 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4573 .addReg(SGPR01, RegState::Implicit); 4574 4575 MI.eraseFromParent(); 4576 return true; 4577 } 4578 4579 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4580 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4581 B.buildInstr(AMDGPU::S_TRAP) 4582 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4583 MI.eraseFromParent(); 4584 return true; 4585 } 4586 4587 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4588 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4589 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4590 // accordingly 4591 if (!ST.isTrapHandlerEnabled() || 4592 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4593 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4594 "debugtrap handler not supported", 4595 MI.getDebugLoc(), DS_Warning); 4596 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4597 Ctx.diagnose(NoTrap); 4598 } else { 4599 // Insert debug-trap instruction 4600 B.buildInstr(AMDGPU::S_TRAP) 4601 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4602 } 4603 4604 MI.eraseFromParent(); 4605 return true; 4606 } 4607 4608 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4609 MachineIRBuilder &B) const { 4610 MachineRegisterInfo &MRI = *B.getMRI(); 4611 const LLT S16 = LLT::scalar(16); 4612 const LLT S32 = LLT::scalar(32); 4613 4614 Register DstReg = MI.getOperand(0).getReg(); 4615 Register NodePtr = MI.getOperand(2).getReg(); 4616 Register RayExtent = MI.getOperand(3).getReg(); 4617 Register RayOrigin = MI.getOperand(4).getReg(); 4618 Register RayDir = MI.getOperand(5).getReg(); 4619 Register RayInvDir = MI.getOperand(6).getReg(); 4620 Register TDescr = MI.getOperand(7).getReg(); 4621 4622 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4623 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4624 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4625 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4626 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4627 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4628 4629 SmallVector<Register, 12> Ops; 4630 if (Is64) { 4631 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4632 Ops.push_back(Unmerge.getReg(0)); 4633 Ops.push_back(Unmerge.getReg(1)); 4634 } else { 4635 Ops.push_back(NodePtr); 4636 } 4637 Ops.push_back(RayExtent); 4638 4639 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4640 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4641 Ops.push_back(Unmerge.getReg(0)); 4642 Ops.push_back(Unmerge.getReg(1)); 4643 Ops.push_back(Unmerge.getReg(2)); 4644 }; 4645 4646 packLanes(RayOrigin); 4647 if (IsA16) { 4648 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4649 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4650 Register R1 = MRI.createGenericVirtualRegister(S32); 4651 Register R2 = MRI.createGenericVirtualRegister(S32); 4652 Register R3 = MRI.createGenericVirtualRegister(S32); 4653 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4654 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4655 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4656 Ops.push_back(R1); 4657 Ops.push_back(R2); 4658 Ops.push_back(R3); 4659 } else { 4660 packLanes(RayDir); 4661 packLanes(RayInvDir); 4662 } 4663 4664 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4665 .addDef(DstReg) 4666 .addImm(Opcode); 4667 4668 for (Register R : Ops) { 4669 MIB.addUse(R); 4670 } 4671 4672 MIB.addUse(TDescr) 4673 .addImm(IsA16 ? 1 : 0) 4674 .cloneMemRefs(MI); 4675 4676 MI.eraseFromParent(); 4677 return true; 4678 } 4679 4680 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4681 MachineInstr &MI) const { 4682 MachineIRBuilder &B = Helper.MIRBuilder; 4683 MachineRegisterInfo &MRI = *B.getMRI(); 4684 4685 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
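  // For amdgcn.if / amdgcn.else / amdgcn.loop this means locating the G_BRCOND
  // fed by the intrinsic and rewriting it into SI_IF / SI_ELSE / SI_LOOP so the
  // explicit exec-mask manipulation survives until selection.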
4686 auto IntrID = MI.getIntrinsicID(); 4687 switch (IntrID) { 4688 case Intrinsic::amdgcn_if: 4689 case Intrinsic::amdgcn_else: { 4690 MachineInstr *Br = nullptr; 4691 MachineBasicBlock *UncondBrTarget = nullptr; 4692 bool Negated = false; 4693 if (MachineInstr *BrCond = 4694 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4695 const SIRegisterInfo *TRI 4696 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4697 4698 Register Def = MI.getOperand(1).getReg(); 4699 Register Use = MI.getOperand(3).getReg(); 4700 4701 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4702 4703 if (Negated) 4704 std::swap(CondBrTarget, UncondBrTarget); 4705 4706 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4707 if (IntrID == Intrinsic::amdgcn_if) { 4708 B.buildInstr(AMDGPU::SI_IF) 4709 .addDef(Def) 4710 .addUse(Use) 4711 .addMBB(UncondBrTarget); 4712 } else { 4713 B.buildInstr(AMDGPU::SI_ELSE) 4714 .addDef(Def) 4715 .addUse(Use) 4716 .addMBB(UncondBrTarget); 4717 } 4718 4719 if (Br) { 4720 Br->getOperand(0).setMBB(CondBrTarget); 4721 } else { 4722 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4723 // since we're swapping branch targets it needs to be reinserted. 4724 // FIXME: IRTranslator should probably not do this 4725 B.buildBr(*CondBrTarget); 4726 } 4727 4728 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4729 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4730 MI.eraseFromParent(); 4731 BrCond->eraseFromParent(); 4732 return true; 4733 } 4734 4735 return false; 4736 } 4737 case Intrinsic::amdgcn_loop: { 4738 MachineInstr *Br = nullptr; 4739 MachineBasicBlock *UncondBrTarget = nullptr; 4740 bool Negated = false; 4741 if (MachineInstr *BrCond = 4742 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4743 const SIRegisterInfo *TRI 4744 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4745 4746 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4747 Register Reg = MI.getOperand(2).getReg(); 4748 4749 if (Negated) 4750 std::swap(CondBrTarget, UncondBrTarget); 4751 4752 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4753 B.buildInstr(AMDGPU::SI_LOOP) 4754 .addUse(Reg) 4755 .addMBB(UncondBrTarget); 4756 4757 if (Br) 4758 Br->getOperand(0).setMBB(CondBrTarget); 4759 else 4760 B.buildBr(*CondBrTarget); 4761 4762 MI.eraseFromParent(); 4763 BrCond->eraseFromParent(); 4764 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4765 return true; 4766 } 4767 4768 return false; 4769 } 4770 case Intrinsic::amdgcn_kernarg_segment_ptr: 4771 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4772 // This only makes sense to call in a kernel, so just lower to null. 
4773 B.buildConstant(MI.getOperand(0).getReg(), 0); 4774 MI.eraseFromParent(); 4775 return true; 4776 } 4777 4778 return legalizePreloadedArgIntrin( 4779 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4780 case Intrinsic::amdgcn_implicitarg_ptr: 4781 return legalizeImplicitArgPtr(MI, MRI, B); 4782 case Intrinsic::amdgcn_workitem_id_x: 4783 return legalizePreloadedArgIntrin(MI, MRI, B, 4784 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4785 case Intrinsic::amdgcn_workitem_id_y: 4786 return legalizePreloadedArgIntrin(MI, MRI, B, 4787 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4788 case Intrinsic::amdgcn_workitem_id_z: 4789 return legalizePreloadedArgIntrin(MI, MRI, B, 4790 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4791 case Intrinsic::amdgcn_workgroup_id_x: 4792 return legalizePreloadedArgIntrin(MI, MRI, B, 4793 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4794 case Intrinsic::amdgcn_workgroup_id_y: 4795 return legalizePreloadedArgIntrin(MI, MRI, B, 4796 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4797 case Intrinsic::amdgcn_workgroup_id_z: 4798 return legalizePreloadedArgIntrin(MI, MRI, B, 4799 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4800 case Intrinsic::amdgcn_dispatch_ptr: 4801 return legalizePreloadedArgIntrin(MI, MRI, B, 4802 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4803 case Intrinsic::amdgcn_queue_ptr: 4804 return legalizePreloadedArgIntrin(MI, MRI, B, 4805 AMDGPUFunctionArgInfo::QUEUE_PTR); 4806 case Intrinsic::amdgcn_implicit_buffer_ptr: 4807 return legalizePreloadedArgIntrin( 4808 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4809 case Intrinsic::amdgcn_dispatch_id: 4810 return legalizePreloadedArgIntrin(MI, MRI, B, 4811 AMDGPUFunctionArgInfo::DISPATCH_ID); 4812 case Intrinsic::amdgcn_fdiv_fast: 4813 return legalizeFDIVFastIntrin(MI, MRI, B); 4814 case Intrinsic::amdgcn_is_shared: 4815 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4816 case Intrinsic::amdgcn_is_private: 4817 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4818 case Intrinsic::amdgcn_wavefrontsize: { 4819 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4820 MI.eraseFromParent(); 4821 return true; 4822 } 4823 case Intrinsic::amdgcn_s_buffer_load: 4824 return legalizeSBufferLoad(Helper, MI); 4825 case Intrinsic::amdgcn_raw_buffer_store: 4826 case Intrinsic::amdgcn_struct_buffer_store: 4827 return legalizeBufferStore(MI, MRI, B, false, false); 4828 case Intrinsic::amdgcn_raw_buffer_store_format: 4829 case Intrinsic::amdgcn_struct_buffer_store_format: 4830 return legalizeBufferStore(MI, MRI, B, false, true); 4831 case Intrinsic::amdgcn_raw_tbuffer_store: 4832 case Intrinsic::amdgcn_struct_tbuffer_store: 4833 return legalizeBufferStore(MI, MRI, B, true, true); 4834 case Intrinsic::amdgcn_raw_buffer_load: 4835 case Intrinsic::amdgcn_struct_buffer_load: 4836 return legalizeBufferLoad(MI, MRI, B, false, false); 4837 case Intrinsic::amdgcn_raw_buffer_load_format: 4838 case Intrinsic::amdgcn_struct_buffer_load_format: 4839 return legalizeBufferLoad(MI, MRI, B, true, false); 4840 case Intrinsic::amdgcn_raw_tbuffer_load: 4841 case Intrinsic::amdgcn_struct_tbuffer_load: 4842 return legalizeBufferLoad(MI, MRI, B, true, true); 4843 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4844 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4845 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4846 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4847 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4848 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4849 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4850 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4851 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4852 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4853 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4854 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4855 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4856 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4857 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4858 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4859 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4860 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4861 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4862 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4863 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4864 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4865 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4866 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4867 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4868 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4869 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4870 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4871 case Intrinsic::amdgcn_buffer_atomic_fadd: 4872 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4873 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4874 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4875 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4876 return legalizeBufferAtomic(MI, B, IntrID); 4877 case Intrinsic::amdgcn_atomic_inc: 4878 return legalizeAtomicIncDec(MI, B, true); 4879 case Intrinsic::amdgcn_atomic_dec: 4880 return legalizeAtomicIncDec(MI, B, false); 4881 case Intrinsic::trap: 4882 return legalizeTrapIntrinsic(MI, MRI, B); 4883 case Intrinsic::debugtrap: 4884 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4885 case Intrinsic::amdgcn_rsq_clamp: 4886 return legalizeRsqClampIntrinsic(MI, MRI, B); 4887 case Intrinsic::amdgcn_ds_fadd: 4888 case Intrinsic::amdgcn_ds_fmin: 4889 case Intrinsic::amdgcn_ds_fmax: 4890 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4891 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4892 return legalizeBVHIntrinsic(MI, B); 4893 default: { 4894 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4895 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4896 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4897 return true; 4898 } 4899 } 4900 4901 return true; 4902 } 4903