//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
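/// For example, an s96 access with 128-bit alignment may be widened to a
/// 128-bit access on subtargets without 96-bit loads.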
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 371 unsigned AlignInBits, unsigned AddrSpace, 372 unsigned Opcode) { 373 // We don't want to widen cases that are naturally legal. 374 if (isPowerOf2_32(SizeInBits)) 375 return false; 376 377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 378 // end up widening these for a scalar load during RegBankSelect, since there 379 // aren't 96-bit scalar loads. 380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 381 return false; 382 383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 384 return false; 385 386 // A load is known dereferenceable up to the alignment, so it's legal to widen 387 // to it. 388 // 389 // TODO: Could check dereferenceable for less aligned cases. 390 unsigned RoundedSize = NextPowerOf2(SizeInBits); 391 if (AlignInBits < RoundedSize) 392 return false; 393 394 // Do not widen if it would introduce a slow unaligned load. 395 const SITargetLowering *TLI = ST.getTargetLowering(); 396 bool Fast = false; 397 return TLI->allowsMisalignedMemoryAccessesImpl( 398 RoundedSize, AddrSpace, Align(AlignInBits / 8), 399 MachineMemOperand::MOLoad, &Fast) && 400 Fast; 401 } 402 403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 404 unsigned Opcode) { 405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 406 return false; 407 408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 409 Query.MMODescrs[0].AlignInBits, 410 Query.Types[1].getAddressSpace(), Opcode); 411 } 412 413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 414 const GCNTargetMachine &TM) 415 : ST(ST_) { 416 using namespace TargetOpcode; 417 418 auto GetAddrSpacePtr = [&TM](unsigned AS) { 419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 420 }; 421 422 const LLT S1 = LLT::scalar(1); 423 const LLT S8 = LLT::scalar(8); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S8 = LLT::vector(2, 8); 433 const LLT V2S16 = LLT::vector(2, 16); 434 const LLT V4S16 = LLT::vector(4, 16); 435 436 const LLT V2S32 = LLT::vector(2, 32); 437 const LLT V3S32 = LLT::vector(3, 32); 438 const LLT V4S32 = LLT::vector(4, 32); 439 const LLT V5S32 = LLT::vector(5, 32); 440 const LLT V6S32 = LLT::vector(6, 32); 441 const LLT V7S32 = LLT::vector(7, 32); 442 const LLT V8S32 = LLT::vector(8, 32); 443 const LLT V9S32 = LLT::vector(9, 32); 444 const LLT V10S32 = LLT::vector(10, 32); 445 const LLT V11S32 = LLT::vector(11, 32); 446 const LLT V12S32 = LLT::vector(12, 32); 447 const LLT V13S32 = LLT::vector(13, 32); 448 const LLT V14S32 = LLT::vector(14, 32); 449 const LLT V15S32 = LLT::vector(15, 32); 450 const LLT V16S32 = LLT::vector(16, 32); 451 const LLT V32S32 = LLT::vector(32, 32); 452 453 const LLT V2S64 = LLT::vector(2, 64); 454 const LLT V3S64 = LLT::vector(3, 64); 455 const LLT V4S64 = LLT::vector(4, 64); 456 const LLT V5S64 = LLT::vector(5, 64); 457 const LLT V6S64 = LLT::vector(6, 64); 458 const LLT V7S64 = LLT::vector(7, 64); 459 const LLT V8S64 = LLT::vector(8, 64); 460 const LLT V16S64 = LLT::vector(16, 64); 461 462 std::initializer_list<LLT> AllS32Vectors = 463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 465 
std::initializer_list<LLT> AllS64Vectors = 466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 467 468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 475 476 const LLT CodePtr = FlatPtr; 477 478 const std::initializer_list<LLT> AddrSpaces64 = { 479 GlobalPtr, ConstantPtr, FlatPtr 480 }; 481 482 const std::initializer_list<LLT> AddrSpaces32 = { 483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 484 }; 485 486 const std::initializer_list<LLT> FPTypesBase = { 487 S32, S64 488 }; 489 490 const std::initializer_list<LLT> FPTypes16 = { 491 S32, S64, S16 492 }; 493 494 const std::initializer_list<LLT> FPTypesPK16 = { 495 S32, S64, S16, V2S16 496 }; 497 498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 499 500 setAction({G_BRCOND, S1}, Legal); // VCC branches 501 setAction({G_BRCOND, S32}, Legal); // SCC branches 502 503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 504 // elements for v3s16 505 getActionDefinitionsBuilder(G_PHI) 506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 507 .legalFor(AllS32Vectors) 508 .legalFor(AllS64Vectors) 509 .legalFor(AddrSpaces64) 510 .legalFor(AddrSpaces32) 511 .legalIf(isPointer(0)) 512 .clampScalar(0, S16, S256) 513 .widenScalarToNextPow2(0, 32) 514 .clampMaxNumElements(0, S32, 16) 515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 516 .scalarize(0); 517 518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 519 // Full set of gfx9 features. 520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 521 .legalFor({S32, S16, V2S16}) 522 .clampScalar(0, S16, S32) 523 .clampMaxNumElements(0, S16, 2) 524 .scalarize(0) 525 .widenScalarToNextPow2(0, 32); 526 527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 528 .legalFor({S32, S16, V2S16}) // Clamp modifier 529 .minScalarOrElt(0, S16) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32) 533 .lower(); 534 } else if (ST.has16BitInsts()) { 535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 536 .legalFor({S32, S16}) 537 .clampScalar(0, S16, S32) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 540 541 // Technically the saturating operations require clamp bit support, but this 542 // was introduced at the same time as 16-bit operations. 543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 544 .legalFor({S32, S16}) // Clamp modifier 545 .minScalar(0, S16) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 16) 548 .lower(); 549 550 // We're just lowering this, but it helps get a better result to try to 551 // coerce to the desired type first. 552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 553 .minScalar(0, S16) 554 .scalarize(0) 555 .lower(); 556 } else { 557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.hasIntClamp()) { 563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 564 .legalFor({S32}) // Clamp modifier. 
565 .scalarize(0) 566 .minScalarOrElt(0, S32) 567 .lower(); 568 } else { 569 // Clamp bit support was added in VI, along with 16-bit operations. 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .minScalar(0, S32) 572 .scalarize(0) 573 .lower(); 574 } 575 576 // FIXME: DAG expansion gets better results. The widening uses the smaller 577 // range values and goes for the min/max lowering directly. 578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 getActionDefinitionsBuilder( 585 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 586 .customFor({S32, S64}) 587 .clampScalar(0, S32, S64) 588 .widenScalarToNextPow2(0, 32) 589 .scalarize(0); 590 591 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 592 .legalFor({S32}) 593 .maxScalarOrElt(0, S32); 594 595 if (ST.hasVOP3PInsts()) { 596 Mulh 597 .clampMaxNumElements(0, S8, 2) 598 .lowerFor({V2S8}); 599 } 600 601 Mulh 602 .scalarize(0) 603 .lower(); 604 605 // Report legal for any types we can handle anywhere. For the cases only legal 606 // on the SALU, RegBankSelect will be able to re-legalize. 607 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 608 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 609 .clampScalar(0, S32, S64) 610 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 611 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 612 .widenScalarToNextPow2(0) 613 .scalarize(0); 614 615 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 616 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 617 .legalFor({{S32, S1}, {S32, S32}}) 618 .minScalar(0, S32) 619 // TODO: .scalarize(0) 620 .lower(); 621 622 getActionDefinitionsBuilder(G_BITCAST) 623 // Don't worry about the size constraint. 624 .legalIf(all(isRegisterType(0), isRegisterType(1))) 625 .lower(); 626 627 628 getActionDefinitionsBuilder(G_CONSTANT) 629 .legalFor({S1, S32, S64, S16, GlobalPtr, 630 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 631 .legalIf(isPointer(0)) 632 .clampScalar(0, S32, S64) 633 .widenScalarToNextPow2(0); 634 635 getActionDefinitionsBuilder(G_FCONSTANT) 636 .legalFor({S32, S64, S16}) 637 .clampScalar(0, S16, S64); 638 639 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 640 .legalIf(isRegisterType(0)) 641 // s1 and s16 are special cases because they have legal operations on 642 // them, but don't really occupy registers in the normal way. 643 .legalFor({S1, S16}) 644 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 645 .clampScalarOrElt(0, S32, MaxScalar) 646 .widenScalarToNextPow2(0, 32) 647 .clampMaxNumElements(0, S32, 16); 648 649 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 650 651 // If the amount is divergent, we have to do a wave reduction to get the 652 // maximum value, so this is expanded during RegBankSelect. 
653 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 654 .legalFor({{PrivatePtr, S32}}); 655 656 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 657 .customIf(typeIsNot(0, PrivatePtr)); 658 659 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 660 661 auto &FPOpActions = getActionDefinitionsBuilder( 662 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 663 .legalFor({S32, S64}); 664 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 665 .customFor({S32, S64}); 666 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 667 .customFor({S32, S64}); 668 669 if (ST.has16BitInsts()) { 670 if (ST.hasVOP3PInsts()) 671 FPOpActions.legalFor({S16, V2S16}); 672 else 673 FPOpActions.legalFor({S16}); 674 675 TrigActions.customFor({S16}); 676 FDIVActions.customFor({S16}); 677 } 678 679 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 680 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 681 682 if (ST.hasVOP3PInsts()) { 683 MinNumMaxNum.customFor(FPTypesPK16) 684 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 685 .clampMaxNumElements(0, S16, 2) 686 .clampScalar(0, S16, S64) 687 .scalarize(0); 688 } else if (ST.has16BitInsts()) { 689 MinNumMaxNum.customFor(FPTypes16) 690 .clampScalar(0, S16, S64) 691 .scalarize(0); 692 } else { 693 MinNumMaxNum.customFor(FPTypesBase) 694 .clampScalar(0, S32, S64) 695 .scalarize(0); 696 } 697 698 if (ST.hasVOP3PInsts()) 699 FPOpActions.clampMaxNumElements(0, S16, 2); 700 701 FPOpActions 702 .scalarize(0) 703 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 704 705 TrigActions 706 .scalarize(0) 707 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 708 709 FDIVActions 710 .scalarize(0) 711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 712 713 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 714 .legalFor(FPTypesPK16) 715 .clampMaxNumElements(0, S16, 2) 716 .scalarize(0) 717 .clampScalar(0, S16, S64); 718 719 if (ST.has16BitInsts()) { 720 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 721 .legalFor({S32, S64, S16}) 722 .scalarize(0) 723 .clampScalar(0, S16, S64); 724 } else { 725 getActionDefinitionsBuilder(G_FSQRT) 726 .legalFor({S32, S64}) 727 .scalarize(0) 728 .clampScalar(0, S32, S64); 729 730 if (ST.hasFractBug()) { 731 getActionDefinitionsBuilder(G_FFLOOR) 732 .customFor({S64}) 733 .legalFor({S32, S64}) 734 .scalarize(0) 735 .clampScalar(0, S32, S64); 736 } else { 737 getActionDefinitionsBuilder(G_FFLOOR) 738 .legalFor({S32, S64}) 739 .scalarize(0) 740 .clampScalar(0, S32, S64); 741 } 742 } 743 744 getActionDefinitionsBuilder(G_FPTRUNC) 745 .legalFor({{S32, S64}, {S16, S32}}) 746 .scalarize(0) 747 .lower(); 748 749 getActionDefinitionsBuilder(G_FPEXT) 750 .legalFor({{S64, S32}, {S32, S16}}) 751 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 752 .scalarize(0); 753 754 getActionDefinitionsBuilder(G_FSUB) 755 // Use actual fsub instruction 756 .legalFor({S32}) 757 // Must use fadd + fneg 758 .lowerFor({S64, S16, V2S16}) 759 .scalarize(0) 760 .clampScalar(0, S32, S64); 761 762 // Whether this is legal depends on the floating point mode for the function. 
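  // The custom action (legalizeFMad) keeps G_FMAD only when denormals are
  // flushed for the type; otherwise it is expanded to fmul + fadd.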
763 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 764 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 765 FMad.customFor({S32, S16}); 766 else if (ST.hasMadMacF32Insts()) 767 FMad.customFor({S32}); 768 else if (ST.hasMadF16()) 769 FMad.customFor({S16}); 770 FMad.scalarize(0) 771 .lower(); 772 773 auto &FRem = getActionDefinitionsBuilder(G_FREM); 774 if (ST.has16BitInsts()) { 775 FRem.customFor({S16, S32, S64}); 776 } else { 777 FRem.minScalar(0, S32) 778 .customFor({S32, S64}); 779 } 780 FRem.scalarize(0); 781 782 // TODO: Do we need to clamp maximum bitwidth? 783 getActionDefinitionsBuilder(G_TRUNC) 784 .legalIf(isScalar(0)) 785 .legalFor({{V2S16, V2S32}}) 786 .clampMaxNumElements(0, S16, 2) 787 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 788 // situations (like an invalid implicit use), we don't want to infinite loop 789 // in the legalizer. 790 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 791 .alwaysLegal(); 792 793 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 794 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 795 {S32, S1}, {S64, S1}, {S16, S1}}) 796 .scalarize(0) 797 .clampScalar(0, S32, S64) 798 .widenScalarToNextPow2(1, 32); 799 800 // TODO: Split s1->s64 during regbankselect for VALU. 801 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 802 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 803 .lowerFor({{S32, S64}}) 804 .lowerIf(typeIs(1, S1)) 805 .customFor({{S64, S64}}); 806 if (ST.has16BitInsts()) 807 IToFP.legalFor({{S16, S16}}); 808 IToFP.clampScalar(1, S32, S64) 809 .minScalar(0, S32) 810 .scalarize(0) 811 .widenScalarToNextPow2(1); 812 813 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 814 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 815 .customFor({{S64, S64}}) 816 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 817 if (ST.has16BitInsts()) 818 FPToI.legalFor({{S16, S16}}); 819 else 820 FPToI.minScalar(1, S32); 821 822 FPToI.minScalar(0, S32) 823 .widenScalarToNextPow2(0, 32) 824 .scalarize(0) 825 .lower(); 826 827 // Lower roundeven into G_FRINT 828 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 829 .scalarize(0) 830 .lower(); 831 832 if (ST.has16BitInsts()) { 833 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 834 .legalFor({S16, S32, S64}) 835 .clampScalar(0, S16, S64) 836 .scalarize(0); 837 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 838 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 839 .legalFor({S32, S64}) 840 .clampScalar(0, S32, S64) 841 .scalarize(0); 842 } else { 843 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 844 .legalFor({S32}) 845 .customFor({S64}) 846 .clampScalar(0, S32, S64) 847 .scalarize(0); 848 } 849 850 getActionDefinitionsBuilder(G_PTR_ADD) 851 .legalIf(all(isPointer(0), sameSize(0, 1))) 852 .scalarize(0) 853 .scalarSameSizeAs(1, 0); 854 855 getActionDefinitionsBuilder(G_PTRMASK) 856 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 857 .scalarSameSizeAs(1, 0) 858 .scalarize(0); 859 860 auto &CmpBuilder = 861 getActionDefinitionsBuilder(G_ICMP) 862 // The compare output type differs based on the register bank of the output, 863 // so make both s1 and s32 legal. 864 // 865 // Scalar compares producing output in scc will be promoted to s32, as that 866 // is the allocatable register type that will be needed for the copy from 867 // scc. 
This will be promoted during RegBankSelect, and we assume something 868 // before that won't try to use s32 result types. 869 // 870 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 871 // bank. 872 .legalForCartesianProduct( 873 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 874 .legalForCartesianProduct( 875 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 876 if (ST.has16BitInsts()) { 877 CmpBuilder.legalFor({{S1, S16}}); 878 } 879 880 CmpBuilder 881 .widenScalarToNextPow2(1) 882 .clampScalar(1, S32, S64) 883 .scalarize(0) 884 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 885 886 getActionDefinitionsBuilder(G_FCMP) 887 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 888 .widenScalarToNextPow2(1) 889 .clampScalar(1, S32, S64) 890 .scalarize(0); 891 892 // FIXME: fpow has a selection pattern that should move to custom lowering. 893 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 894 if (ST.has16BitInsts()) 895 Exp2Ops.legalFor({S32, S16}); 896 else 897 Exp2Ops.legalFor({S32}); 898 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 899 Exp2Ops.scalarize(0); 900 901 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 902 if (ST.has16BitInsts()) 903 ExpOps.customFor({{S32}, {S16}}); 904 else 905 ExpOps.customFor({S32}); 906 ExpOps.clampScalar(0, MinScalarFPTy, S32) 907 .scalarize(0); 908 909 getActionDefinitionsBuilder(G_FPOWI) 910 .clampScalar(0, MinScalarFPTy, S32) 911 .lower(); 912 913 // The 64-bit versions produce 32-bit results, but only on the SALU. 914 getActionDefinitionsBuilder(G_CTPOP) 915 .legalFor({{S32, S32}, {S32, S64}}) 916 .clampScalar(0, S32, S32) 917 .clampScalar(1, S32, S64) 918 .scalarize(0) 919 .widenScalarToNextPow2(0, 32) 920 .widenScalarToNextPow2(1, 32); 921 922 // The hardware instructions return a different result on 0 than the generic 923 // instructions expect. The hardware produces -1, but these produce the 924 // bitwidth. 925 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 926 .scalarize(0) 927 .clampScalar(0, S32, S32) 928 .clampScalar(1, S32, S64) 929 .widenScalarToNextPow2(0, 32) 930 .widenScalarToNextPow2(1, 32) 931 .lower(); 932 933 // The 64-bit versions produce 32-bit results, but only on the SALU. 934 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 935 .legalFor({{S32, S32}, {S32, S64}}) 936 .clampScalar(0, S32, S32) 937 .clampScalar(1, S32, S64) 938 .scalarize(0) 939 .widenScalarToNextPow2(0, 32) 940 .widenScalarToNextPow2(1, 32); 941 942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 943 // RegBankSelect. 944 getActionDefinitionsBuilder(G_BITREVERSE) 945 .legalFor({S32, S64}) 946 .clampScalar(0, S32, S64) 947 .scalarize(0) 948 .widenScalarToNextPow2(0); 949 950 if (ST.has16BitInsts()) { 951 getActionDefinitionsBuilder(G_BSWAP) 952 .legalFor({S16, S32, V2S16}) 953 .clampMaxNumElements(0, S16, 2) 954 // FIXME: Fixing non-power-of-2 before clamp is workaround for 955 // narrowScalar limitation. 
956 .widenScalarToNextPow2(0) 957 .clampScalar(0, S16, S32) 958 .scalarize(0); 959 960 if (ST.hasVOP3PInsts()) { 961 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 962 .legalFor({S32, S16, V2S16}) 963 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 964 .clampMaxNumElements(0, S16, 2) 965 .minScalar(0, S16) 966 .widenScalarToNextPow2(0) 967 .scalarize(0) 968 .lower(); 969 } else { 970 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 971 .legalFor({S32, S16}) 972 .widenScalarToNextPow2(0) 973 .minScalar(0, S16) 974 .scalarize(0) 975 .lower(); 976 } 977 } else { 978 // TODO: Should have same legality without v_perm_b32 979 getActionDefinitionsBuilder(G_BSWAP) 980 .legalFor({S32}) 981 .lowerIf(scalarNarrowerThan(0, 32)) 982 // FIXME: Fixing non-power-of-2 before clamp is workaround for 983 // narrowScalar limitation. 984 .widenScalarToNextPow2(0) 985 .maxScalar(0, S32) 986 .scalarize(0) 987 .lower(); 988 989 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 990 .legalFor({S32}) 991 .minScalar(0, S32) 992 .widenScalarToNextPow2(0) 993 .scalarize(0) 994 .lower(); 995 } 996 997 getActionDefinitionsBuilder(G_INTTOPTR) 998 // List the common cases 999 .legalForCartesianProduct(AddrSpaces64, {S64}) 1000 .legalForCartesianProduct(AddrSpaces32, {S32}) 1001 .scalarize(0) 1002 // Accept any address space as long as the size matches 1003 .legalIf(sameSize(0, 1)) 1004 .widenScalarIf(smallerThan(1, 0), 1005 [](const LegalityQuery &Query) { 1006 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1007 }) 1008 .narrowScalarIf(largerThan(1, 0), 1009 [](const LegalityQuery &Query) { 1010 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1011 }); 1012 1013 getActionDefinitionsBuilder(G_PTRTOINT) 1014 // List the common cases 1015 .legalForCartesianProduct(AddrSpaces64, {S64}) 1016 .legalForCartesianProduct(AddrSpaces32, {S32}) 1017 .scalarize(0) 1018 // Accept any address space as long as the size matches 1019 .legalIf(sameSize(0, 1)) 1020 .widenScalarIf(smallerThan(0, 1), 1021 [](const LegalityQuery &Query) { 1022 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1023 }) 1024 .narrowScalarIf( 1025 largerThan(0, 1), 1026 [](const LegalityQuery &Query) { 1027 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1028 }); 1029 1030 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1031 .scalarize(0) 1032 .custom(); 1033 1034 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1035 bool IsLoad) -> bool { 1036 const LLT DstTy = Query.Types[0]; 1037 1038 // Split vector extloads. 1039 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1040 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1041 1042 if (MemSize < DstTy.getSizeInBits()) 1043 MemSize = std::max(MemSize, AlignBits); 1044 1045 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1046 return true; 1047 1048 const LLT PtrTy = Query.Types[1]; 1049 unsigned AS = PtrTy.getAddressSpace(); 1050 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1051 return true; 1052 1053 // Catch weird sized loads that don't evenly divide into the access sizes 1054 // TODO: May be able to widen depending on alignment etc. 1055 unsigned NumRegs = (MemSize + 31) / 32; 1056 if (NumRegs == 3) { 1057 if (!ST.hasDwordx3LoadStores()) 1058 return true; 1059 } else { 1060 // If the alignment allows, these should have been widened. 
1061 if (!isPowerOf2_32(NumRegs)) 1062 return true; 1063 } 1064 1065 if (AlignBits < MemSize) { 1066 const SITargetLowering *TLI = ST.getTargetLowering(); 1067 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1068 Align(AlignBits / 8)); 1069 } 1070 1071 return false; 1072 }; 1073 1074 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1075 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1076 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1077 1078 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1079 // LDS 1080 // TODO: Unsupported flat for SI. 1081 1082 for (unsigned Op : {G_LOAD, G_STORE}) { 1083 const bool IsStore = Op == G_STORE; 1084 1085 auto &Actions = getActionDefinitionsBuilder(Op); 1086 // Explicitly list some common cases. 1087 // TODO: Does this help compile time at all? 1088 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1089 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1090 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1091 {S64, GlobalPtr, 64, GlobalAlign32}, 1092 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1093 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1094 {S32, GlobalPtr, 8, GlobalAlign8}, 1095 {S32, GlobalPtr, 16, GlobalAlign16}, 1096 1097 {S32, LocalPtr, 32, 32}, 1098 {S64, LocalPtr, 64, 32}, 1099 {V2S32, LocalPtr, 64, 32}, 1100 {S32, LocalPtr, 8, 8}, 1101 {S32, LocalPtr, 16, 16}, 1102 {V2S16, LocalPtr, 32, 32}, 1103 1104 {S32, PrivatePtr, 32, 32}, 1105 {S32, PrivatePtr, 8, 8}, 1106 {S32, PrivatePtr, 16, 16}, 1107 {V2S16, PrivatePtr, 32, 32}, 1108 1109 {S32, ConstantPtr, 32, GlobalAlign32}, 1110 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1111 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1112 {S64, ConstantPtr, 64, GlobalAlign32}, 1113 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1114 Actions.legalIf( 1115 [=](const LegalityQuery &Query) -> bool { 1116 return isLoadStoreLegal(ST, Query, Op); 1117 }); 1118 1119 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1120 // 64-bits. 1121 // 1122 // TODO: Should generalize bitcast action into coerce, which will also cover 1123 // inserting addrspacecasts. 1124 Actions.customIf(typeIs(1, Constant32Ptr)); 1125 1126 // Turn any illegal element vectors into something easier to deal 1127 // with. These will ultimately produce 32-bit scalar shifts to extract the 1128 // parts anyway. 1129 // 1130 // For odd 16-bit element vectors, prefer to split those into pieces with 1131 // 16-bit vector parts. 1132 Actions.bitcastIf( 1133 [=](const LegalityQuery &Query) -> bool { 1134 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1135 Query.MMODescrs[0].SizeInBits); 1136 }, bitcastToRegisterType(0)); 1137 1138 if (!IsStore) { 1139 // Widen suitably aligned loads by loading extra bytes. The standard 1140 // legalization actions can't properly express widening memory operands. 
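      // (The extra loaded bits are simply discarded after the widened load.)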
1141 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1142 return shouldWidenLoad(ST, Query, G_LOAD); 1143 }); 1144 } 1145 1146 // FIXME: load/store narrowing should be moved to lower action 1147 Actions 1148 .narrowScalarIf( 1149 [=](const LegalityQuery &Query) -> bool { 1150 return !Query.Types[0].isVector() && 1151 needToSplitMemOp(Query, Op == G_LOAD); 1152 }, 1153 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1154 const LLT DstTy = Query.Types[0]; 1155 const LLT PtrTy = Query.Types[1]; 1156 1157 const unsigned DstSize = DstTy.getSizeInBits(); 1158 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1159 1160 // Split extloads. 1161 if (DstSize > MemSize) 1162 return std::make_pair(0, LLT::scalar(MemSize)); 1163 1164 if (!isPowerOf2_32(DstSize)) { 1165 // We're probably decomposing an odd sized store. Try to split 1166 // to the widest type. TODO: Account for alignment. As-is it 1167 // should be OK, since the new parts will be further legalized. 1168 unsigned FloorSize = PowerOf2Floor(DstSize); 1169 return std::make_pair(0, LLT::scalar(FloorSize)); 1170 } 1171 1172 if (DstSize > 32 && (DstSize % 32 != 0)) { 1173 // FIXME: Need a way to specify non-extload of larger size if 1174 // suitably aligned. 1175 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1176 } 1177 1178 unsigned MaxSize = maxSizeForAddrSpace(ST, 1179 PtrTy.getAddressSpace(), 1180 Op == G_LOAD); 1181 if (MemSize > MaxSize) 1182 return std::make_pair(0, LLT::scalar(MaxSize)); 1183 1184 unsigned Align = Query.MMODescrs[0].AlignInBits; 1185 return std::make_pair(0, LLT::scalar(Align)); 1186 }) 1187 .fewerElementsIf( 1188 [=](const LegalityQuery &Query) -> bool { 1189 return Query.Types[0].isVector() && 1190 needToSplitMemOp(Query, Op == G_LOAD); 1191 }, 1192 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1193 const LLT DstTy = Query.Types[0]; 1194 const LLT PtrTy = Query.Types[1]; 1195 1196 LLT EltTy = DstTy.getElementType(); 1197 unsigned MaxSize = maxSizeForAddrSpace(ST, 1198 PtrTy.getAddressSpace(), 1199 Op == G_LOAD); 1200 1201 // FIXME: Handle widened to power of 2 results better. This ends 1202 // up scalarizing. 1203 // FIXME: 3 element stores scalarized on SI 1204 1205 // Split if it's too large for the address space. 1206 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1207 unsigned NumElts = DstTy.getNumElements(); 1208 unsigned EltSize = EltTy.getSizeInBits(); 1209 1210 if (MaxSize % EltSize == 0) { 1211 return std::make_pair( 1212 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1213 } 1214 1215 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1216 1217 // FIXME: Refine when odd breakdowns handled 1218 // The scalars will need to be re-legalized. 1219 if (NumPieces == 1 || NumPieces >= NumElts || 1220 NumElts % NumPieces != 0) 1221 return std::make_pair(0, EltTy); 1222 1223 return std::make_pair(0, 1224 LLT::vector(NumElts / NumPieces, EltTy)); 1225 } 1226 1227 // FIXME: We could probably handle weird extending loads better. 1228 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1229 if (DstTy.getSizeInBits() > MemSize) 1230 return std::make_pair(0, EltTy); 1231 1232 unsigned EltSize = EltTy.getSizeInBits(); 1233 unsigned DstSize = DstTy.getSizeInBits(); 1234 if (!isPowerOf2_32(DstSize)) { 1235 // We're probably decomposing an odd sized store. Try to split 1236 // to the widest type. TODO: Account for alignment. As-is it 1237 // should be OK, since the new parts will be further legalized. 
1238 unsigned FloorSize = PowerOf2Floor(DstSize); 1239 return std::make_pair( 1240 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1241 } 1242 1243 // Need to split because of alignment. 1244 unsigned Align = Query.MMODescrs[0].AlignInBits; 1245 if (EltSize > Align && 1246 (EltSize / Align < DstTy.getNumElements())) { 1247 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1248 } 1249 1250 // May need relegalization for the scalars. 1251 return std::make_pair(0, EltTy); 1252 }) 1253 .lowerIfMemSizeNotPow2() 1254 .minScalar(0, S32); 1255 1256 if (IsStore) 1257 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1258 1259 Actions 1260 .widenScalarToNextPow2(0) 1261 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1262 .lower(); 1263 } 1264 1265 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1266 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1267 {S32, GlobalPtr, 16, 2 * 8}, 1268 {S32, LocalPtr, 8, 8}, 1269 {S32, LocalPtr, 16, 16}, 1270 {S32, PrivatePtr, 8, 8}, 1271 {S32, PrivatePtr, 16, 16}, 1272 {S32, ConstantPtr, 8, 8}, 1273 {S32, ConstantPtr, 16, 2 * 8}}); 1274 if (ST.hasFlatAddressSpace()) { 1275 ExtLoads.legalForTypesWithMemDesc( 1276 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1277 } 1278 1279 ExtLoads.clampScalar(0, S32, S32) 1280 .widenScalarToNextPow2(0) 1281 .unsupportedIfMemSizeNotPow2() 1282 .lower(); 1283 1284 auto &Atomics = getActionDefinitionsBuilder( 1285 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1286 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1287 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1288 G_ATOMICRMW_UMIN}) 1289 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1290 {S64, GlobalPtr}, {S64, LocalPtr}, 1291 {S32, RegionPtr}, {S64, RegionPtr}}); 1292 if (ST.hasFlatAddressSpace()) { 1293 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1294 } 1295 1296 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1297 if (ST.hasLDSFPAtomics()) { 1298 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1299 if (ST.hasGFX90AInsts()) 1300 Atomic.legalFor({{S64, LocalPtr}}); 1301 } 1302 if (ST.hasAtomicFaddInsts()) 1303 Atomic.legalFor({{S32, GlobalPtr}}); 1304 1305 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1306 // demarshalling 1307 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1308 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1309 {S32, FlatPtr}, {S64, FlatPtr}}) 1310 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1311 {S32, RegionPtr}, {S64, RegionPtr}}); 1312 // TODO: Pointer types, any 32-bit or 64-bit vector 1313 1314 // Condition should be s32 for scalar, s1 for vector. 1315 getActionDefinitionsBuilder(G_SELECT) 1316 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1317 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1318 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1319 .clampScalar(0, S16, S64) 1320 .scalarize(1) 1321 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1322 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1323 .clampMaxNumElements(0, S32, 2) 1324 .clampMaxNumElements(0, LocalPtr, 2) 1325 .clampMaxNumElements(0, PrivatePtr, 2) 1326 .scalarize(0) 1327 .widenScalarToNextPow2(0) 1328 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1329 1330 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1331 // be more flexible with the shift amount type. 
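  // (4 bits for 16-bit shifts, 5 for 32-bit, 6 for 64-bit.)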
1332 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1333 .legalFor({{S32, S32}, {S64, S32}}); 1334 if (ST.has16BitInsts()) { 1335 if (ST.hasVOP3PInsts()) { 1336 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1337 .clampMaxNumElements(0, S16, 2); 1338 } else 1339 Shifts.legalFor({{S16, S16}}); 1340 1341 // TODO: Support 16-bit shift amounts for all types 1342 Shifts.widenScalarIf( 1343 [=](const LegalityQuery &Query) { 1344 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1345 // 32-bit amount. 1346 const LLT ValTy = Query.Types[0]; 1347 const LLT AmountTy = Query.Types[1]; 1348 return ValTy.getSizeInBits() <= 16 && 1349 AmountTy.getSizeInBits() < 16; 1350 }, changeTo(1, S16)); 1351 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1352 Shifts.clampScalar(1, S32, S32); 1353 Shifts.clampScalar(0, S16, S64); 1354 Shifts.widenScalarToNextPow2(0, 16); 1355 1356 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1357 .minScalar(0, S16) 1358 .scalarize(0) 1359 .lower(); 1360 } else { 1361 // Make sure we legalize the shift amount type first, as the general 1362 // expansion for the shifted type will produce much worse code if it hasn't 1363 // been truncated already. 1364 Shifts.clampScalar(1, S32, S32); 1365 Shifts.clampScalar(0, S32, S64); 1366 Shifts.widenScalarToNextPow2(0, 32); 1367 1368 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1369 .minScalar(0, S32) 1370 .scalarize(0) 1371 .lower(); 1372 } 1373 Shifts.scalarize(0); 1374 1375 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1376 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1377 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1378 unsigned IdxTypeIdx = 2; 1379 1380 getActionDefinitionsBuilder(Op) 1381 .customIf([=](const LegalityQuery &Query) { 1382 const LLT EltTy = Query.Types[EltTypeIdx]; 1383 const LLT VecTy = Query.Types[VecTypeIdx]; 1384 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1385 const unsigned EltSize = EltTy.getSizeInBits(); 1386 return (EltSize == 32 || EltSize == 64) && 1387 VecTy.getSizeInBits() % 32 == 0 && 1388 VecTy.getSizeInBits() <= MaxRegisterSize && 1389 IdxTy.getSizeInBits() == 32; 1390 }) 1391 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1392 bitcastToVectorElement32(VecTypeIdx)) 1393 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1394 .bitcastIf( 1395 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1396 [=](const LegalityQuery &Query) { 1397 // For > 64-bit element types, try to turn this into a 64-bit 1398 // element vector since we may be able to do better indexing 1399 // if this is scalar. If not, fall back to 32. 1400 const LLT EltTy = Query.Types[EltTypeIdx]; 1401 const LLT VecTy = Query.Types[VecTypeIdx]; 1402 const unsigned DstEltSize = EltTy.getSizeInBits(); 1403 const unsigned VecSize = VecTy.getSizeInBits(); 1404 1405 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1406 return std::make_pair( 1407 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1408 }) 1409 .clampScalar(EltTypeIdx, S32, S64) 1410 .clampScalar(VecTypeIdx, S32, S64) 1411 .clampScalar(IdxTypeIdx, S32, S32) 1412 .clampMaxNumElements(VecTypeIdx, S32, 32) 1413 // TODO: Clamp elements for 64-bit vectors? 1414 // It should only be necessary with variable indexes. 
1415 // As a last resort, lower to the stack 1416 .lower(); 1417 } 1418 1419 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1420 .unsupportedIf([=](const LegalityQuery &Query) { 1421 const LLT &EltTy = Query.Types[1].getElementType(); 1422 return Query.Types[0] != EltTy; 1423 }); 1424 1425 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1426 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1427 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1428 1429 // FIXME: Doesn't handle extract of illegal sizes. 1430 getActionDefinitionsBuilder(Op) 1431 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1432 // FIXME: Multiples of 16 should not be legal. 1433 .legalIf([=](const LegalityQuery &Query) { 1434 const LLT BigTy = Query.Types[BigTyIdx]; 1435 const LLT LitTy = Query.Types[LitTyIdx]; 1436 return (BigTy.getSizeInBits() % 32 == 0) && 1437 (LitTy.getSizeInBits() % 16 == 0); 1438 }) 1439 .widenScalarIf( 1440 [=](const LegalityQuery &Query) { 1441 const LLT BigTy = Query.Types[BigTyIdx]; 1442 return (BigTy.getScalarSizeInBits() < 16); 1443 }, 1444 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1445 .widenScalarIf( 1446 [=](const LegalityQuery &Query) { 1447 const LLT LitTy = Query.Types[LitTyIdx]; 1448 return (LitTy.getScalarSizeInBits() < 16); 1449 }, 1450 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1451 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1452 .widenScalarToNextPow2(BigTyIdx, 32); 1453 1454 } 1455 1456 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1457 .legalForCartesianProduct(AllS32Vectors, {S32}) 1458 .legalForCartesianProduct(AllS64Vectors, {S64}) 1459 .clampNumElements(0, V16S32, V32S32) 1460 .clampNumElements(0, V2S64, V16S64) 1461 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1462 1463 if (ST.hasScalarPackInsts()) { 1464 BuildVector 1465 // FIXME: Should probably widen s1 vectors straight to s32 1466 .minScalarOrElt(0, S16) 1467 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1468 .minScalar(1, S32); 1469 1470 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1471 .legalFor({V2S16, S32}) 1472 .lower(); 1473 BuildVector.minScalarOrElt(0, S32); 1474 } else { 1475 BuildVector.customFor({V2S16, S16}); 1476 BuildVector.minScalarOrElt(0, S32); 1477 1478 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1479 .customFor({V2S16, S32}) 1480 .lower(); 1481 } 1482 1483 BuildVector.legalIf(isRegisterType(0)); 1484 1485 // FIXME: Clamp maximum size 1486 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1487 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1488 .clampMaxNumElements(0, S32, 32) 1489 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1490 .clampMaxNumElements(0, S16, 64); 1491 1492 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1493 // pre-legalize. 1494 if (ST.hasVOP3PInsts()) { 1495 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1496 .customFor({V2S16, V2S16}) 1497 .lower(); 1498 } else 1499 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1500 1501 // Merge/Unmerge 1502 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1503 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1504 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; 1505 1506 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1507 const LLT Ty = Query.Types[TypeIdx]; 1508 if (Ty.isVector()) { 1509 const LLT &EltTy = Ty.getElementType(); 1510 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1511 return true; 1512 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1513 return true; 1514 } 1515 return false; 1516 }; 1517 1518 auto &Builder = getActionDefinitionsBuilder(Op) 1519 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1520 .lowerFor({{S16, V2S16}}) 1521 .lowerIf([=](const LegalityQuery &Query) { 1522 const LLT BigTy = Query.Types[BigTyIdx]; 1523 return BigTy.getSizeInBits() == 32; 1524 }) 1525 // Try to widen to s16 first for small types. 1526 // TODO: Only do this on targets with legal s16 shifts 1527 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1528 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1529 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1530 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1531 elementTypeIs(1, S16)), 1532 changeTo(1, V2S16)) 1533 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1534 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1535 // valid. 1536 .clampScalar(LitTyIdx, S32, S512) 1537 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1538 // Break up vectors with weird elements into scalars 1539 .fewerElementsIf( 1540 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1541 scalarize(0)) 1542 .fewerElementsIf( 1543 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1544 scalarize(1)) 1545 .clampScalar(BigTyIdx, S32, MaxScalar); 1546 1547 if (Op == G_MERGE_VALUES) { 1548 Builder.widenScalarIf( 1549 // TODO: Use 16-bit shifts if legal for 8-bit values? 1550 [=](const LegalityQuery &Query) { 1551 const LLT Ty = Query.Types[LitTyIdx]; 1552 return Ty.getSizeInBits() < 32; 1553 }, 1554 changeTo(LitTyIdx, S32)); 1555 } 1556 1557 Builder.widenScalarIf( 1558 [=](const LegalityQuery &Query) { 1559 const LLT Ty = Query.Types[BigTyIdx]; 1560 return !isPowerOf2_32(Ty.getSizeInBits()) && 1561 Ty.getSizeInBits() % 16 != 0; 1562 }, 1563 [=](const LegalityQuery &Query) { 1564 // Pick the next power of 2, or a multiple of 64 over 128. 1565 // Whichever is smaller. 1566 const LLT &Ty = Query.Types[BigTyIdx]; 1567 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1568 if (NewSizeInBits >= 256) { 1569 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1570 if (RoundedTo < NewSizeInBits) 1571 NewSizeInBits = RoundedTo; 1572 } 1573 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1574 }) 1575 // Any vectors left are the wrong size. Scalarize them. 1576 .scalarize(0) 1577 .scalarize(1); 1578 } 1579 1580 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1581 // RegBankSelect. 1582 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1583 .legalFor({{S32}, {S64}}); 1584 1585 if (ST.hasVOP3PInsts()) { 1586 SextInReg.lowerFor({{V2S16}}) 1587 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1588 // get more vector shift opportunities, since we'll get those when 1589 // expanded. 1590 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1591 } else if (ST.has16BitInsts()) { 1592 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1593 } else { 1594 // Prefer to promote to s32 before lowering if we don't have 16-bit 1595 // shifts. 
This avoid a lot of intermediate truncate and extend operations. 1596 SextInReg.lowerFor({{S32}, {S64}}); 1597 } 1598 1599 SextInReg 1600 .scalarize(0) 1601 .clampScalar(0, S32, S64) 1602 .lower(); 1603 1604 // TODO: Only Try to form v2s16 with legal packed instructions. 1605 getActionDefinitionsBuilder(G_FSHR) 1606 .legalFor({{S32, S32}}) 1607 .lowerFor({{V2S16, V2S16}}) 1608 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1609 .scalarize(0) 1610 .lower(); 1611 1612 if (ST.hasVOP3PInsts()) { 1613 getActionDefinitionsBuilder(G_FSHL) 1614 .lowerFor({{V2S16, V2S16}}) 1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1616 .scalarize(0) 1617 .lower(); 1618 } else { 1619 getActionDefinitionsBuilder(G_FSHL) 1620 .scalarize(0) 1621 .lower(); 1622 } 1623 1624 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1625 .legalFor({S64}); 1626 1627 getActionDefinitionsBuilder(G_FENCE) 1628 .alwaysLegal(); 1629 1630 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1631 .scalarize(0) 1632 .minScalar(0, S32) 1633 .lower(); 1634 1635 getActionDefinitionsBuilder({ 1636 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1637 G_FCOPYSIGN, 1638 1639 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1640 G_ATOMICRMW_NAND, 1641 G_ATOMICRMW_FSUB, 1642 G_READ_REGISTER, 1643 G_WRITE_REGISTER, 1644 1645 G_SADDO, G_SSUBO, 1646 1647 // TODO: Implement 1648 G_FMINIMUM, G_FMAXIMUM}).lower(); 1649 1650 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1651 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1652 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1653 .unsupported(); 1654 1655 computeTables(); 1656 verify(*ST.getInstrInfo()); 1657 } 1658 1659 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1660 MachineInstr &MI) const { 1661 MachineIRBuilder &B = Helper.MIRBuilder; 1662 MachineRegisterInfo &MRI = *B.getMRI(); 1663 1664 switch (MI.getOpcode()) { 1665 case TargetOpcode::G_ADDRSPACE_CAST: 1666 return legalizeAddrSpaceCast(MI, MRI, B); 1667 case TargetOpcode::G_FRINT: 1668 return legalizeFrint(MI, MRI, B); 1669 case TargetOpcode::G_FCEIL: 1670 return legalizeFceil(MI, MRI, B); 1671 case TargetOpcode::G_FREM: 1672 return legalizeFrem(MI, MRI, B); 1673 case TargetOpcode::G_INTRINSIC_TRUNC: 1674 return legalizeIntrinsicTrunc(MI, MRI, B); 1675 case TargetOpcode::G_SITOFP: 1676 return legalizeITOFP(MI, MRI, B, true); 1677 case TargetOpcode::G_UITOFP: 1678 return legalizeITOFP(MI, MRI, B, false); 1679 case TargetOpcode::G_FPTOSI: 1680 return legalizeFPTOI(MI, MRI, B, true); 1681 case TargetOpcode::G_FPTOUI: 1682 return legalizeFPTOI(MI, MRI, B, false); 1683 case TargetOpcode::G_FMINNUM: 1684 case TargetOpcode::G_FMAXNUM: 1685 case TargetOpcode::G_FMINNUM_IEEE: 1686 case TargetOpcode::G_FMAXNUM_IEEE: 1687 return legalizeMinNumMaxNum(Helper, MI); 1688 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1689 return legalizeExtractVectorElt(MI, MRI, B); 1690 case TargetOpcode::G_INSERT_VECTOR_ELT: 1691 return legalizeInsertVectorElt(MI, MRI, B); 1692 case TargetOpcode::G_SHUFFLE_VECTOR: 1693 return legalizeShuffleVector(MI, MRI, B); 1694 case TargetOpcode::G_FSIN: 1695 case TargetOpcode::G_FCOS: 1696 return legalizeSinCos(MI, MRI, B); 1697 case TargetOpcode::G_GLOBAL_VALUE: 1698 return legalizeGlobalValue(MI, MRI, B); 1699 case TargetOpcode::G_LOAD: 1700 return legalizeLoad(Helper, MI); 1701 case TargetOpcode::G_FMAD: 1702 return legalizeFMad(MI, MRI, B); 1703 case TargetOpcode::G_FDIV: 1704 return legalizeFDIV(MI, MRI, B); 1705 case TargetOpcode::G_UDIV: 1706 case TargetOpcode::G_UREM: 1707 case 
TargetOpcode::G_UDIVREM: 1708 return legalizeUnsignedDIV_REM(MI, MRI, B); 1709 case TargetOpcode::G_SDIV: 1710 case TargetOpcode::G_SREM: 1711 case TargetOpcode::G_SDIVREM: 1712 return legalizeSignedDIV_REM(MI, MRI, B); 1713 case TargetOpcode::G_ATOMIC_CMPXCHG: 1714 return legalizeAtomicCmpXChg(MI, MRI, B); 1715 case TargetOpcode::G_FLOG: 1716 return legalizeFlog(MI, B, numbers::ln2f); 1717 case TargetOpcode::G_FLOG10: 1718 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1719 case TargetOpcode::G_FEXP: 1720 return legalizeFExp(MI, B); 1721 case TargetOpcode::G_FPOW: 1722 return legalizeFPow(MI, B); 1723 case TargetOpcode::G_FFLOOR: 1724 return legalizeFFloor(MI, MRI, B); 1725 case TargetOpcode::G_BUILD_VECTOR: 1726 return legalizeBuildVector(MI, MRI, B); 1727 default: 1728 return false; 1729 } 1730 1731 llvm_unreachable("expected switch to return"); 1732 } 1733 1734 Register AMDGPULegalizerInfo::getSegmentAperture( 1735 unsigned AS, 1736 MachineRegisterInfo &MRI, 1737 MachineIRBuilder &B) const { 1738 MachineFunction &MF = B.getMF(); 1739 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1740 const LLT S32 = LLT::scalar(32); 1741 1742 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1743 1744 if (ST.hasApertureRegs()) { 1745 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1746 // getreg. 1747 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1748 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1749 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1750 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1751 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1752 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1753 unsigned Encoding = 1754 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1755 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1756 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1757 1758 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1759 1760 B.buildInstr(AMDGPU::S_GETREG_B32) 1761 .addDef(GetReg) 1762 .addImm(Encoding); 1763 MRI.setType(GetReg, S32); 1764 1765 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1766 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1767 } 1768 1769 Register QueuePtr = MRI.createGenericVirtualRegister( 1770 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1771 1772 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1773 return Register(); 1774 1775 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1776 // private_segment_aperture_base_hi. 1777 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1778 1779 // TODO: can we be smarter about machine pointer info? 
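  // Load the 32-bit aperture base from amd_queue_t as a dereferenceable,
  // invariant constant-address load off the queue pointer.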
1780 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1781 MachineMemOperand *MMO = MF.getMachineMemOperand( 1782 PtrInfo, 1783 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1784 MachineMemOperand::MOInvariant, 1785 4, commonAlignment(Align(64), StructOffset)); 1786 1787 Register LoadAddr; 1788 1789 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1790 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1791 } 1792 1793 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1794 MachineInstr &MI, MachineRegisterInfo &MRI, 1795 MachineIRBuilder &B) const { 1796 MachineFunction &MF = B.getMF(); 1797 1798 const LLT S32 = LLT::scalar(32); 1799 Register Dst = MI.getOperand(0).getReg(); 1800 Register Src = MI.getOperand(1).getReg(); 1801 1802 LLT DstTy = MRI.getType(Dst); 1803 LLT SrcTy = MRI.getType(Src); 1804 unsigned DestAS = DstTy.getAddressSpace(); 1805 unsigned SrcAS = SrcTy.getAddressSpace(); 1806 1807 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1808 // vector element. 1809 assert(!DstTy.isVector()); 1810 1811 const AMDGPUTargetMachine &TM 1812 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1813 1814 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1815 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1816 return true; 1817 } 1818 1819 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1820 // Truncate. 1821 B.buildExtract(Dst, Src, 0); 1822 MI.eraseFromParent(); 1823 return true; 1824 } 1825 1826 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1827 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1828 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1829 1830 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1831 // another. Merge operands are required to be the same type, but creating an 1832 // extra ptrtoint would be kind of pointless. 1833 auto HighAddr = B.buildConstant( 1834 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1835 B.buildMerge(Dst, {Src, HighAddr}); 1836 MI.eraseFromParent(); 1837 return true; 1838 } 1839 1840 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1841 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1842 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1843 unsigned NullVal = TM.getNullPointerValue(DestAS); 1844 1845 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1846 auto FlatNull = B.buildConstant(SrcTy, 0); 1847 1848 // Extract low 32-bits of the pointer. 1849 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1850 1851 auto CmpRes = 1852 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1853 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1854 1855 MI.eraseFromParent(); 1856 return true; 1857 } 1858 1859 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1860 return false; 1861 1862 if (!ST.hasFlatAddressSpace()) 1863 return false; 1864 1865 auto SegmentNull = 1866 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1867 auto FlatNull = 1868 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1869 1870 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1871 if (!ApertureReg.isValid()) 1872 return false; 1873 1874 auto CmpRes = 1875 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1876 1877 // Coerce the type of the low half of the result so we can use merge_values. 
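  // The segment pointer provides the low 32 bits of the flat address and the
  // aperture register provides the high 32 bits.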
1878 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1879 1880 // TODO: Should we allow mismatched types but matching sizes in merges to 1881 // avoid the ptrtoint? 1882 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1883 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1884 1885 MI.eraseFromParent(); 1886 return true; 1887 } 1888 1889 bool AMDGPULegalizerInfo::legalizeFrint( 1890 MachineInstr &MI, MachineRegisterInfo &MRI, 1891 MachineIRBuilder &B) const { 1892 Register Src = MI.getOperand(1).getReg(); 1893 LLT Ty = MRI.getType(Src); 1894 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1895 1896 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1897 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1898 1899 auto C1 = B.buildFConstant(Ty, C1Val); 1900 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1901 1902 // TODO: Should this propagate fast-math-flags? 1903 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1904 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1905 1906 auto C2 = B.buildFConstant(Ty, C2Val); 1907 auto Fabs = B.buildFAbs(Ty, Src); 1908 1909 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1910 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1911 MI.eraseFromParent(); 1912 return true; 1913 } 1914 1915 bool AMDGPULegalizerInfo::legalizeFceil( 1916 MachineInstr &MI, MachineRegisterInfo &MRI, 1917 MachineIRBuilder &B) const { 1918 1919 const LLT S1 = LLT::scalar(1); 1920 const LLT S64 = LLT::scalar(64); 1921 1922 Register Src = MI.getOperand(1).getReg(); 1923 assert(MRI.getType(Src) == S64); 1924 1925 // result = trunc(src) 1926 // if (src > 0.0 && src != result) 1927 // result += 1.0 1928 1929 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1930 1931 const auto Zero = B.buildFConstant(S64, 0.0); 1932 const auto One = B.buildFConstant(S64, 1.0); 1933 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1934 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1935 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1936 auto Add = B.buildSelect(S64, And, One, Zero); 1937 1938 // TODO: Should this propagate fast-math-flags? 
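  // e.g. src = 2.3: trunc = 2.0, src > 0 and src != trunc, so 1.0 is added and
  // the result is 3.0; for src = -2.3 nothing is added and the result is -2.0.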
1939 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1940 return true; 1941 } 1942 1943 bool AMDGPULegalizerInfo::legalizeFrem( 1944 MachineInstr &MI, MachineRegisterInfo &MRI, 1945 MachineIRBuilder &B) const { 1946 Register DstReg = MI.getOperand(0).getReg(); 1947 Register Src0Reg = MI.getOperand(1).getReg(); 1948 Register Src1Reg = MI.getOperand(2).getReg(); 1949 auto Flags = MI.getFlags(); 1950 LLT Ty = MRI.getType(DstReg); 1951 1952 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1953 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1954 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1955 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1956 MI.eraseFromParent(); 1957 return true; 1958 } 1959 1960 static MachineInstrBuilder extractF64Exponent(Register Hi, 1961 MachineIRBuilder &B) { 1962 const unsigned FractBits = 52; 1963 const unsigned ExpBits = 11; 1964 LLT S32 = LLT::scalar(32); 1965 1966 auto Const0 = B.buildConstant(S32, FractBits - 32); 1967 auto Const1 = B.buildConstant(S32, ExpBits); 1968 1969 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1970 .addUse(Hi) 1971 .addUse(Const0.getReg(0)) 1972 .addUse(Const1.getReg(0)); 1973 1974 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1975 } 1976 1977 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1978 MachineInstr &MI, MachineRegisterInfo &MRI, 1979 MachineIRBuilder &B) const { 1980 const LLT S1 = LLT::scalar(1); 1981 const LLT S32 = LLT::scalar(32); 1982 const LLT S64 = LLT::scalar(64); 1983 1984 Register Src = MI.getOperand(1).getReg(); 1985 assert(MRI.getType(Src) == S64); 1986 1987 // TODO: Should this use extract since the low half is unused? 1988 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1989 Register Hi = Unmerge.getReg(1); 1990 1991 // Extract the upper half, since this is where we will find the sign and 1992 // exponent. 1993 auto Exp = extractF64Exponent(Hi, B); 1994 1995 const unsigned FractBits = 52; 1996 1997 // Extract the sign bit. 1998 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1999 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2000 2001 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2002 2003 const auto Zero32 = B.buildConstant(S32, 0); 2004 2005 // Extend back to 64-bits. 2006 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2007 2008 auto Shr = B.buildAShr(S64, FractMask, Exp); 2009 auto Not = B.buildNot(S64, Shr); 2010 auto Tmp0 = B.buildAnd(S64, Src, Not); 2011 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2012 2013 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2014 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2015 2016 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2017 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2018 MI.eraseFromParent(); 2019 return true; 2020 } 2021 2022 bool AMDGPULegalizerInfo::legalizeITOFP( 2023 MachineInstr &MI, MachineRegisterInfo &MRI, 2024 MachineIRBuilder &B, bool Signed) const { 2025 2026 Register Dst = MI.getOperand(0).getReg(); 2027 Register Src = MI.getOperand(1).getReg(); 2028 2029 const LLT S64 = LLT::scalar(64); 2030 const LLT S32 = LLT::scalar(32); 2031 2032 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2033 2034 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2035 2036 auto CvtHi = Signed ? 
2037 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2038 B.buildUITOFP(S64, Unmerge.getReg(1)); 2039 2040 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2041 2042 auto ThirtyTwo = B.buildConstant(S32, 32); 2043 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2044 .addUse(CvtHi.getReg(0)) 2045 .addUse(ThirtyTwo.getReg(0)); 2046 2047 // TODO: Should this propagate fast-math-flags? 2048 B.buildFAdd(Dst, LdExp, CvtLo); 2049 MI.eraseFromParent(); 2050 return true; 2051 } 2052 2053 // TODO: Copied from DAG implementation. Verify logic and document how this 2054 // actually works. 2055 bool AMDGPULegalizerInfo::legalizeFPTOI( 2056 MachineInstr &MI, MachineRegisterInfo &MRI, 2057 MachineIRBuilder &B, bool Signed) const { 2058 2059 Register Dst = MI.getOperand(0).getReg(); 2060 Register Src = MI.getOperand(1).getReg(); 2061 2062 const LLT S64 = LLT::scalar(64); 2063 const LLT S32 = LLT::scalar(32); 2064 2065 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2066 2067 unsigned Flags = MI.getFlags(); 2068 2069 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2070 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2071 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2072 2073 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2074 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2075 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2076 2077 auto Hi = Signed ? 2078 B.buildFPTOSI(S32, FloorMul) : 2079 B.buildFPTOUI(S32, FloorMul); 2080 auto Lo = B.buildFPTOUI(S32, Fma); 2081 2082 B.buildMerge(Dst, { Lo, Hi }); 2083 MI.eraseFromParent(); 2084 2085 return true; 2086 } 2087 2088 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2089 MachineInstr &MI) const { 2090 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2091 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2092 2093 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2094 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2095 2096 // With ieee_mode disabled, the instructions have the correct behavior 2097 // already for G_FMINNUM/G_FMAXNUM 2098 if (!MFI->getMode().IEEE) 2099 return !IsIEEEOp; 2100 2101 if (IsIEEEOp) 2102 return true; 2103 2104 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2105 } 2106 2107 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2108 MachineInstr &MI, MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B) const { 2110 // TODO: Should move some of this into LegalizerHelper. 2111 2112 // TODO: Promote dynamic indexing of s16 to s32 2113 2114 // FIXME: Artifact combiner probably should have replaced the truncated 2115 // constant before this, so we shouldn't need 2116 // getConstantVRegValWithLookThrough. 2117 Optional<ValueAndVReg> MaybeIdxVal = 2118 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2119 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2120 return true; 2121 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2122 2123 Register Dst = MI.getOperand(0).getReg(); 2124 Register Vec = MI.getOperand(1).getReg(); 2125 2126 LLT VecTy = MRI.getType(Vec); 2127 LLT EltTy = VecTy.getElementType(); 2128 assert(EltTy == MRI.getType(Dst)); 2129 2130 if (IdxVal < VecTy.getNumElements()) 2131 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2132 else 2133 B.buildUndef(Dst); 2134 2135 MI.eraseFromParent(); 2136 return true; 2137 } 2138 2139 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2140 MachineInstr &MI, MachineRegisterInfo &MRI, 2141 MachineIRBuilder &B) const { 2142 // TODO: Should move some of this into LegalizerHelper. 2143 2144 // TODO: Promote dynamic indexing of s16 to s32 2145 2146 // FIXME: Artifact combiner probably should have replaced the truncated 2147 // constant before this, so we shouldn't need 2148 // getConstantVRegValWithLookThrough. 2149 Optional<ValueAndVReg> MaybeIdxVal = 2150 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2151 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2152 return true; 2153 2154 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2155 Register Dst = MI.getOperand(0).getReg(); 2156 Register Vec = MI.getOperand(1).getReg(); 2157 Register Ins = MI.getOperand(2).getReg(); 2158 2159 LLT VecTy = MRI.getType(Vec); 2160 LLT EltTy = VecTy.getElementType(); 2161 assert(EltTy == MRI.getType(Ins)); 2162 2163 if (IdxVal < VecTy.getNumElements()) 2164 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2165 else 2166 B.buildUndef(Dst); 2167 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2173 MachineInstr &MI, MachineRegisterInfo &MRI, 2174 MachineIRBuilder &B) const { 2175 const LLT V2S16 = LLT::vector(2, 16); 2176 2177 Register Dst = MI.getOperand(0).getReg(); 2178 Register Src0 = MI.getOperand(1).getReg(); 2179 LLT DstTy = MRI.getType(Dst); 2180 LLT SrcTy = MRI.getType(Src0); 2181 2182 if (SrcTy == V2S16 && DstTy == V2S16 && 2183 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2184 return true; 2185 2186 MachineIRBuilder HelperBuilder(MI); 2187 GISelObserverWrapper DummyObserver; 2188 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2189 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2190 } 2191 2192 bool AMDGPULegalizerInfo::legalizeSinCos( 2193 MachineInstr &MI, MachineRegisterInfo &MRI, 2194 MachineIRBuilder &B) const { 2195 2196 Register DstReg = MI.getOperand(0).getReg(); 2197 Register SrcReg = MI.getOperand(1).getReg(); 2198 LLT Ty = MRI.getType(DstReg); 2199 unsigned Flags = MI.getFlags(); 2200 2201 Register TrigVal; 2202 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2203 if (ST.hasTrigReducedRange()) { 2204 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2205 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2206 .addUse(MulVal.getReg(0)) 2207 .setMIFlags(Flags).getReg(0); 2208 } else 2209 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2210 2211 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2212 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2213 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2214 .addUse(TrigVal) 2215 .setMIFlags(Flags); 2216 MI.eraseFromParent(); 2217 return true; 2218 } 2219 2220 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2221 MachineIRBuilder &B, 2222 const GlobalValue *GV, 2223 int64_t Offset, 2224 unsigned GAFlags) const { 2225 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2226 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2227 // to the following code sequence: 2228 // 2229 // For constant address space: 2230 // s_getpc_b64 s[0:1] 2231 // s_add_u32 s0, s0, $symbol 2232 // s_addc_u32 s1, s1, 0 2233 // 2234 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2235 // a fixup or relocation is emitted to replace $symbol with a literal 2236 // constant, which is a pc-relative offset from the encoding of the $symbol 2237 // operand to the global variable. 2238 // 2239 // For global address space: 2240 // s_getpc_b64 s[0:1] 2241 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2242 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2243 // 2244 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2245 // fixups or relocations are emitted to replace $symbol@*@lo and 2246 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2247 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2248 // operand to the global variable. 2249 // 2250 // What we want here is an offset from the value returned by s_getpc 2251 // (which is the address of the s_add_u32 instruction) to the global 2252 // variable, but since the encoding of $symbol starts 4 bytes after the start 2253 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2254 // small. This requires us to add 4 to the global variable offset in order to 2255 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2256 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2257 // instruction. 2258 2259 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2260 2261 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
    DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(), DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared
        // memory whose size is not known at compile time. It is allocated
        // by the runtime and placed directly after the statically allocated
        // ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
        DstReg,
        MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const unsigned MemSize = 8 * MMO->getSize();
  const Align MemAlign = MMO->getAlign();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed.
  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
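    // e.g. an extending load of 3 bytes into an s32 result: round the memory
    // size up to 4 bytes and leave the result register type alone.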
2428 if (WideMemSize == ValSize) { 2429 MachineFunction &MF = B.getMF(); 2430 2431 MachineMemOperand *WideMMO = 2432 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2433 Observer.changingInstr(MI); 2434 MI.setMemRefs(MF, {WideMMO}); 2435 Observer.changedInstr(MI); 2436 return true; 2437 } 2438 2439 // Don't bother handling edge case that should probably never be produced. 2440 if (ValSize > WideMemSize) 2441 return false; 2442 2443 LLT WideTy = widenToNextPowerOf2(ValTy); 2444 2445 Register WideLoad; 2446 if (!WideTy.isVector()) { 2447 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2448 B.buildTrunc(ValReg, WideLoad).getReg(0); 2449 } else { 2450 // Extract the subvector. 2451 2452 if (isRegisterType(ValTy)) { 2453 // If this a case where G_EXTRACT is legal, use it. 2454 // (e.g. <3 x s32> -> <4 x s32>) 2455 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2456 B.buildExtract(ValReg, WideLoad, 0); 2457 } else { 2458 // For cases where the widened type isn't a nice register value, unmerge 2459 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2460 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2461 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2462 B.setInsertPt(B.getMBB(), MI.getIterator()); 2463 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2464 } 2465 } 2466 2467 MI.eraseFromParent(); 2468 return true; 2469 } 2470 2471 return false; 2472 } 2473 2474 bool AMDGPULegalizerInfo::legalizeFMad( 2475 MachineInstr &MI, MachineRegisterInfo &MRI, 2476 MachineIRBuilder &B) const { 2477 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2478 assert(Ty.isScalar()); 2479 2480 MachineFunction &MF = B.getMF(); 2481 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2482 2483 // TODO: Always legal with future ftz flag. 2484 // FIXME: Do we need just output? 
2485 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2486 return true; 2487 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2488 return true; 2489 2490 MachineIRBuilder HelperBuilder(MI); 2491 GISelObserverWrapper DummyObserver; 2492 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2493 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2494 } 2495 2496 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2497 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2498 Register DstReg = MI.getOperand(0).getReg(); 2499 Register PtrReg = MI.getOperand(1).getReg(); 2500 Register CmpVal = MI.getOperand(2).getReg(); 2501 Register NewVal = MI.getOperand(3).getReg(); 2502 2503 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2504 "this should not have been custom lowered"); 2505 2506 LLT ValTy = MRI.getType(CmpVal); 2507 LLT VecTy = LLT::vector(2, ValTy); 2508 2509 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2510 2511 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2512 .addDef(DstReg) 2513 .addUse(PtrReg) 2514 .addUse(PackedVal) 2515 .setMemRefs(MI.memoperands()); 2516 2517 MI.eraseFromParent(); 2518 return true; 2519 } 2520 2521 bool AMDGPULegalizerInfo::legalizeFlog( 2522 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2523 Register Dst = MI.getOperand(0).getReg(); 2524 Register Src = MI.getOperand(1).getReg(); 2525 LLT Ty = B.getMRI()->getType(Dst); 2526 unsigned Flags = MI.getFlags(); 2527 2528 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2529 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2530 2531 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2532 MI.eraseFromParent(); 2533 return true; 2534 } 2535 2536 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2537 MachineIRBuilder &B) const { 2538 Register Dst = MI.getOperand(0).getReg(); 2539 Register Src = MI.getOperand(1).getReg(); 2540 unsigned Flags = MI.getFlags(); 2541 LLT Ty = B.getMRI()->getType(Dst); 2542 2543 auto K = B.buildFConstant(Ty, numbers::log2e); 2544 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2545 B.buildFExp2(Dst, Mul, Flags); 2546 MI.eraseFromParent(); 2547 return true; 2548 } 2549 2550 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2551 MachineIRBuilder &B) const { 2552 Register Dst = MI.getOperand(0).getReg(); 2553 Register Src0 = MI.getOperand(1).getReg(); 2554 Register Src1 = MI.getOperand(2).getReg(); 2555 unsigned Flags = MI.getFlags(); 2556 LLT Ty = B.getMRI()->getType(Dst); 2557 const LLT S16 = LLT::scalar(16); 2558 const LLT S32 = LLT::scalar(32); 2559 2560 if (Ty == S32) { 2561 auto Log = B.buildFLog2(S32, Src0, Flags); 2562 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2563 .addUse(Log.getReg(0)) 2564 .addUse(Src1) 2565 .setMIFlags(Flags); 2566 B.buildFExp2(Dst, Mul, Flags); 2567 } else if (Ty == S16) { 2568 // There's no f16 fmul_legacy, so we need to convert for it. 
2569 auto Log = B.buildFLog2(S16, Src0, Flags); 2570 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2571 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2572 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2573 .addUse(Ext0.getReg(0)) 2574 .addUse(Ext1.getReg(0)) 2575 .setMIFlags(Flags); 2576 2577 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2578 } else 2579 return false; 2580 2581 MI.eraseFromParent(); 2582 return true; 2583 } 2584 2585 // Find a source register, ignoring any possible source modifiers. 2586 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2587 Register ModSrc = OrigSrc; 2588 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2589 ModSrc = SrcFNeg->getOperand(1).getReg(); 2590 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2591 ModSrc = SrcFAbs->getOperand(1).getReg(); 2592 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2593 ModSrc = SrcFAbs->getOperand(1).getReg(); 2594 return ModSrc; 2595 } 2596 2597 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2598 MachineRegisterInfo &MRI, 2599 MachineIRBuilder &B) const { 2600 2601 const LLT S1 = LLT::scalar(1); 2602 const LLT S64 = LLT::scalar(64); 2603 Register Dst = MI.getOperand(0).getReg(); 2604 Register OrigSrc = MI.getOperand(1).getReg(); 2605 unsigned Flags = MI.getFlags(); 2606 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2607 "this should not have been custom lowered"); 2608 2609 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2610 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2611 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2612 // V_FRACT bug is: 2613 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2614 // 2615 // Convert floor(x) to (x - fract(x)) 2616 2617 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2618 .addUse(OrigSrc) 2619 .setMIFlags(Flags); 2620 2621 // Give source modifier matching some assistance before obscuring a foldable 2622 // pattern. 2623 2624 // TODO: We can avoid the neg on the fract? The input sign to fract 2625 // shouldn't matter? 2626 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2627 2628 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2629 2630 Register Min = MRI.createGenericVirtualRegister(S64); 2631 2632 // We don't need to concern ourselves with the snan handling difference, so 2633 // use the one which will directly select. 2634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2635 if (MFI->getMode().IEEE) 2636 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2637 else 2638 B.buildFMinNum(Min, Fract, Const, Flags); 2639 2640 Register CorrectedFract = Min; 2641 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2642 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2643 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2644 } 2645 2646 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2647 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2648 2649 MI.eraseFromParent(); 2650 return true; 2651 } 2652 2653 // Turn an illegal packed v2s16 build vector into bit operations. 2654 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2655 bool AMDGPULegalizerInfo::legalizeBuildVector( 2656 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2657 Register Dst = MI.getOperand(0).getReg(); 2658 const LLT S32 = LLT::scalar(32); 2659 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2660 2661 Register Src0 = MI.getOperand(1).getReg(); 2662 Register Src1 = MI.getOperand(2).getReg(); 2663 assert(MRI.getType(Src0) == LLT::scalar(16)); 2664 2665 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2666 B.buildBitcast(Dst, Merge); 2667 2668 MI.eraseFromParent(); 2669 return true; 2670 } 2671 2672 // Check that this is a G_XOR x, -1 2673 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2674 if (MI.getOpcode() != TargetOpcode::G_XOR) 2675 return false; 2676 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2677 return ConstVal && *ConstVal == -1; 2678 } 2679 2680 // Return the use branch instruction, otherwise null if the usage is invalid. 2681 static MachineInstr * 2682 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2683 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2684 Register CondDef = MI.getOperand(0).getReg(); 2685 if (!MRI.hasOneNonDBGUse(CondDef)) 2686 return nullptr; 2687 2688 MachineBasicBlock *Parent = MI.getParent(); 2689 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2690 2691 if (isNot(MRI, *UseMI)) { 2692 Register NegatedCond = UseMI->getOperand(0).getReg(); 2693 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2694 return nullptr; 2695 2696 // We're deleting the def of this value, so we need to remove it. 2697 UseMI->eraseFromParent(); 2698 2699 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2700 Negated = true; 2701 } 2702 2703 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2704 return nullptr; 2705 2706 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2707 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2708 if (Next == Parent->end()) { 2709 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2710 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2711 return nullptr; 2712 UncondBrTarget = &*NextMBB; 2713 } else { 2714 if (Next->getOpcode() != AMDGPU::G_BR) 2715 return nullptr; 2716 Br = &*Next; 2717 UncondBrTarget = Br->getOperand(0).getMBB(); 2718 } 2719 2720 return UseMI; 2721 } 2722 2723 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2724 const ArgDescriptor *Arg, 2725 const TargetRegisterClass *ArgRC, 2726 LLT ArgTy) const { 2727 MCRegister SrcReg = Arg->getRegister(); 2728 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2729 assert(DstReg.isVirtual() && "Virtual register expected"); 2730 2731 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2732 ArgTy); 2733 if (Arg->isMasked()) { 2734 // TODO: Should we try to emit this once in the entry block? 
2735 const LLT S32 = LLT::scalar(32); 2736 const unsigned Mask = Arg->getMask(); 2737 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2738 2739 Register AndMaskSrc = LiveIn; 2740 2741 if (Shift != 0) { 2742 auto ShiftAmt = B.buildConstant(S32, Shift); 2743 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2744 } 2745 2746 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2747 } else { 2748 B.buildCopy(DstReg, LiveIn); 2749 } 2750 2751 return true; 2752 } 2753 2754 bool AMDGPULegalizerInfo::loadInputValue( 2755 Register DstReg, MachineIRBuilder &B, 2756 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2757 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2758 const ArgDescriptor *Arg; 2759 const TargetRegisterClass *ArgRC; 2760 LLT ArgTy; 2761 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2762 2763 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2764 return false; // TODO: Handle these 2765 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2766 } 2767 2768 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2769 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2770 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2771 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2772 return false; 2773 2774 MI.eraseFromParent(); 2775 return true; 2776 } 2777 2778 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2779 MachineRegisterInfo &MRI, 2780 MachineIRBuilder &B) const { 2781 Register Dst = MI.getOperand(0).getReg(); 2782 LLT DstTy = MRI.getType(Dst); 2783 LLT S16 = LLT::scalar(16); 2784 LLT S32 = LLT::scalar(32); 2785 LLT S64 = LLT::scalar(64); 2786 2787 if (DstTy == S16) 2788 return legalizeFDIV16(MI, MRI, B); 2789 if (DstTy == S32) 2790 return legalizeFDIV32(MI, MRI, B); 2791 if (DstTy == S64) 2792 return legalizeFDIV64(MI, MRI, B); 2793 2794 return false; 2795 } 2796 2797 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2798 Register DstDivReg, 2799 Register DstRemReg, 2800 Register X, 2801 Register Y) const { 2802 const LLT S1 = LLT::scalar(1); 2803 const LLT S32 = LLT::scalar(32); 2804 2805 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2806 // algorithm used here. 2807 2808 // Initial estimate of inv(y). 2809 auto FloatY = B.buildUITOFP(S32, Y); 2810 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2811 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2812 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2813 auto Z = B.buildFPTOUI(S32, ScaledY); 2814 2815 // One round of UNR. 2816 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2817 auto NegYZ = B.buildMul(S32, NegY, Z); 2818 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2819 2820 // Quotient/remainder estimate. 2821 auto Q = B.buildUMulH(S32, X, Z); 2822 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2823 2824 // First quotient/remainder refinement. 2825 auto One = B.buildConstant(S32, 1); 2826 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2827 if (DstDivReg) 2828 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2829 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2830 2831 // Second quotient/remainder refinement. 
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
    B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo =
UnmergeNumer.getReg(0); 2927 Register NumerHi = UnmergeNumer.getReg(1); 2928 2929 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2930 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2931 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2932 Register Mul3_Lo = UnmergeMul3.getReg(0); 2933 Register Mul3_Hi = UnmergeMul3.getReg(1); 2934 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2935 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2936 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2937 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2938 2939 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2940 Register DenomLo = UnmergeDenom.getReg(0); 2941 Register DenomHi = UnmergeDenom.getReg(1); 2942 2943 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2944 auto C1 = B.buildSExt(S32, CmpHi); 2945 2946 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2947 auto C2 = B.buildSExt(S32, CmpLo); 2948 2949 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2950 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2951 2952 // TODO: Here and below portions of the code can be enclosed into if/endif. 2953 // Currently control flow is unconditional and we have 4 selects after 2954 // potential endif to substitute PHIs. 2955 2956 // if C3 != 0 ... 2957 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2958 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2959 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2960 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2961 2962 auto One64 = B.buildConstant(S64, 1); 2963 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2964 2965 auto C4 = 2966 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2967 auto C5 = 2968 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2969 auto C6 = B.buildSelect( 2970 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2971 2972 // if (C6 != 0) 2973 auto Add4 = B.buildAdd(S64, Add3, One64); 2974 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2975 2976 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2977 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2978 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2979 2980 // endif C6 2981 // endif C3 2982 2983 if (DstDivReg) { 2984 auto Sel1 = B.buildSelect( 2985 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2986 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 2987 Sel1, MulHi3); 2988 } 2989 2990 if (DstRemReg) { 2991 auto Sel2 = B.buildSelect( 2992 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2993 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 2994 Sel2, Sub1); 2995 } 2996 } 2997 2998 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 2999 MachineRegisterInfo &MRI, 3000 MachineIRBuilder &B) const { 3001 Register DstDivReg, DstRemReg; 3002 switch (MI.getOpcode()) { 3003 default: 3004 llvm_unreachable("Unexpected opcode!"); 3005 case AMDGPU::G_UDIV: { 3006 DstDivReg = MI.getOperand(0).getReg(); 3007 break; 3008 } 3009 case AMDGPU::G_UREM: { 3010 DstRemReg = MI.getOperand(0).getReg(); 3011 break; 3012 } 3013 case AMDGPU::G_UDIVREM: { 3014 DstDivReg = MI.getOperand(0).getReg(); 3015 DstRemReg = MI.getOperand(1).getReg(); 3016 break; 3017 } 3018 } 3019 3020 const LLT S64 = LLT::scalar(64); 3021 const LLT S32 = LLT::scalar(32); 3022 const unsigned 
FirstSrcOpIdx = MI.getNumExplicitDefs(); 3023 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3024 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3025 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3026 3027 if (Ty == S32) 3028 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3029 else if (Ty == S64) 3030 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3031 else 3032 return false; 3033 3034 MI.eraseFromParent(); 3035 return true; 3036 } 3037 3038 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3039 MachineRegisterInfo &MRI, 3040 MachineIRBuilder &B) const { 3041 const LLT S64 = LLT::scalar(64); 3042 const LLT S32 = LLT::scalar(32); 3043 3044 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3045 if (Ty != S32 && Ty != S64) 3046 return false; 3047 3048 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3049 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3050 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3051 3052 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3053 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3054 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3055 3056 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3057 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3058 3059 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3060 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3061 3062 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3063 switch (MI.getOpcode()) { 3064 default: 3065 llvm_unreachable("Unexpected opcode!"); 3066 case AMDGPU::G_SDIV: { 3067 DstDivReg = MI.getOperand(0).getReg(); 3068 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3069 break; 3070 } 3071 case AMDGPU::G_SREM: { 3072 DstRemReg = MI.getOperand(0).getReg(); 3073 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3074 break; 3075 } 3076 case AMDGPU::G_SDIVREM: { 3077 DstDivReg = MI.getOperand(0).getReg(); 3078 DstRemReg = MI.getOperand(1).getReg(); 3079 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3080 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3081 break; 3082 } 3083 } 3084 3085 if (Ty == S32) 3086 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3087 else 3088 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3089 3090 if (DstDivReg) { 3091 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3092 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3093 B.buildSub(DstDivReg, SignXor, Sign); 3094 } 3095 3096 if (DstRemReg) { 3097 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3098 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3099 B.buildSub(DstRemReg, SignXor, Sign); 3100 } 3101 3102 MI.eraseFromParent(); 3103 return true; 3104 } 3105 3106 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3107 MachineRegisterInfo &MRI, 3108 MachineIRBuilder &B) const { 3109 Register Res = MI.getOperand(0).getReg(); 3110 Register LHS = MI.getOperand(1).getReg(); 3111 Register RHS = MI.getOperand(2).getReg(); 3112 uint16_t Flags = MI.getFlags(); 3113 LLT ResTy = MRI.getType(Res); 3114 3115 const MachineFunction &MF = B.getMF(); 3116 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3117 MI.getFlag(MachineInstr::FmAfn); 3118 3119 if (!AllowInaccurateRcp) 3120 return false; 3121 3122 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3123 // 1 / x -> RCP(x) 3124 if (CLHS->isExactlyValue(1.0)) { 3125 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3126 .addUse(RHS) 3127 .setMIFlags(Flags); 3128 3129 
MI.eraseFromParent(); 3130 return true; 3131 } 3132 3133 // -1 / x -> RCP( FNEG(x) ) 3134 if (CLHS->isExactlyValue(-1.0)) { 3135 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3136 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3137 .addUse(FNeg.getReg(0)) 3138 .setMIFlags(Flags); 3139 3140 MI.eraseFromParent(); 3141 return true; 3142 } 3143 } 3144 3145 // x / y -> x * (1.0 / y) 3146 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3147 .addUse(RHS) 3148 .setMIFlags(Flags); 3149 B.buildFMul(Res, LHS, RCP, Flags); 3150 3151 MI.eraseFromParent(); 3152 return true; 3153 } 3154 3155 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3156 MachineRegisterInfo &MRI, 3157 MachineIRBuilder &B) const { 3158 Register Res = MI.getOperand(0).getReg(); 3159 Register X = MI.getOperand(1).getReg(); 3160 Register Y = MI.getOperand(2).getReg(); 3161 uint16_t Flags = MI.getFlags(); 3162 LLT ResTy = MRI.getType(Res); 3163 3164 const MachineFunction &MF = B.getMF(); 3165 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3166 MI.getFlag(MachineInstr::FmAfn); 3167 3168 if (!AllowInaccurateRcp) 3169 return false; 3170 3171 auto NegY = B.buildFNeg(ResTy, Y); 3172 auto One = B.buildFConstant(ResTy, 1.0); 3173 3174 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3175 .addUse(Y) 3176 .setMIFlags(Flags); 3177 3178 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3179 R = B.buildFMA(ResTy, Tmp0, R, R); 3180 3181 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3182 R = B.buildFMA(ResTy, Tmp1, R, R); 3183 3184 auto Ret = B.buildFMul(ResTy, X, R); 3185 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3186 3187 B.buildFMA(Res, Tmp2, R, Ret); 3188 MI.eraseFromParent(); 3189 return true; 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3193 MachineRegisterInfo &MRI, 3194 MachineIRBuilder &B) const { 3195 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3196 return true; 3197 3198 Register Res = MI.getOperand(0).getReg(); 3199 Register LHS = MI.getOperand(1).getReg(); 3200 Register RHS = MI.getOperand(2).getReg(); 3201 3202 uint16_t Flags = MI.getFlags(); 3203 3204 LLT S16 = LLT::scalar(16); 3205 LLT S32 = LLT::scalar(32); 3206 3207 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3208 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3209 3210 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3211 .addUse(RHSExt.getReg(0)) 3212 .setMIFlags(Flags); 3213 3214 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3215 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3216 3217 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3218 .addUse(RDst.getReg(0)) 3219 .addUse(RHS) 3220 .addUse(LHS) 3221 .setMIFlags(Flags); 3222 3223 MI.eraseFromParent(); 3224 return true; 3225 } 3226 3227 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3228 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3229 static void toggleSPDenormMode(bool Enable, 3230 MachineIRBuilder &B, 3231 const GCNSubtarget &ST, 3232 AMDGPU::SIModeRegisterDefaults Mode) { 3233 // Set SP denorm mode to this value. 3234 unsigned SPDenormMode = 3235 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3236 3237 if (ST.hasDenormModeInst()) { 3238 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
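    // The immediate packs the FP32 denorm field in its low two bits and the
    // FP64/FP16 field in the next two bits, hence the shift by 2.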
3239 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3240 3241 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3242 B.buildInstr(AMDGPU::S_DENORM_MODE) 3243 .addImm(NewDenormModeValue); 3244 3245 } else { 3246 // Select FP32 bit field in mode register. 3247 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3248 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3249 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3250 3251 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3252 .addImm(SPDenormMode) 3253 .addImm(SPDenormModeBitField); 3254 } 3255 } 3256 3257 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3258 MachineRegisterInfo &MRI, 3259 MachineIRBuilder &B) const { 3260 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3261 return true; 3262 3263 Register Res = MI.getOperand(0).getReg(); 3264 Register LHS = MI.getOperand(1).getReg(); 3265 Register RHS = MI.getOperand(2).getReg(); 3266 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3267 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3268 3269 uint16_t Flags = MI.getFlags(); 3270 3271 LLT S32 = LLT::scalar(32); 3272 LLT S1 = LLT::scalar(1); 3273 3274 auto One = B.buildFConstant(S32, 1.0f); 3275 3276 auto DenominatorScaled = 3277 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3278 .addUse(LHS) 3279 .addUse(RHS) 3280 .addImm(0) 3281 .setMIFlags(Flags); 3282 auto NumeratorScaled = 3283 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3284 .addUse(LHS) 3285 .addUse(RHS) 3286 .addImm(1) 3287 .setMIFlags(Flags); 3288 3289 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3290 .addUse(DenominatorScaled.getReg(0)) 3291 .setMIFlags(Flags); 3292 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3293 3294 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3295 // aren't modeled as reading it. 
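  // FP32 denormals are temporarily enabled around the FMA refinement steps
  // below; the scaled intermediates produced by div_scale can be denormal even
  // when the inputs and the final result are not.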
3296 if (!Mode.allFP32Denormals()) 3297 toggleSPDenormMode(true, B, ST, Mode); 3298 3299 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3300 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3301 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3302 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3303 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3304 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3305 3306 if (!Mode.allFP32Denormals()) 3307 toggleSPDenormMode(false, B, ST, Mode); 3308 3309 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3310 .addUse(Fma4.getReg(0)) 3311 .addUse(Fma1.getReg(0)) 3312 .addUse(Fma3.getReg(0)) 3313 .addUse(NumeratorScaled.getReg(1)) 3314 .setMIFlags(Flags); 3315 3316 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3317 .addUse(Fmas.getReg(0)) 3318 .addUse(RHS) 3319 .addUse(LHS) 3320 .setMIFlags(Flags); 3321 3322 MI.eraseFromParent(); 3323 return true; 3324 } 3325 3326 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3327 MachineRegisterInfo &MRI, 3328 MachineIRBuilder &B) const { 3329 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3330 return true; 3331 3332 Register Res = MI.getOperand(0).getReg(); 3333 Register LHS = MI.getOperand(1).getReg(); 3334 Register RHS = MI.getOperand(2).getReg(); 3335 3336 uint16_t Flags = MI.getFlags(); 3337 3338 LLT S64 = LLT::scalar(64); 3339 LLT S1 = LLT::scalar(1); 3340 3341 auto One = B.buildFConstant(S64, 1.0); 3342 3343 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3344 .addUse(LHS) 3345 .addUse(RHS) 3346 .addImm(0) 3347 .setMIFlags(Flags); 3348 3349 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3350 3351 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3352 .addUse(DivScale0.getReg(0)) 3353 .setMIFlags(Flags); 3354 3355 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3356 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3357 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3358 3359 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3360 .addUse(LHS) 3361 .addUse(RHS) 3362 .addImm(1) 3363 .setMIFlags(Flags); 3364 3365 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3366 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3367 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3368 3369 Register Scale; 3370 if (!ST.hasUsableDivScaleConditionOutput()) { 3371 // Workaround a hardware bug on SI where the condition output from div_scale 3372 // is not usable. 
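    // Instead, compare the high halves of the div_scale results against the
    // original numerator and denominator, and XOR the compares to recover an
    // equivalent condition bit.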
3373 3374 LLT S32 = LLT::scalar(32); 3375 3376 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3377 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3378 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3379 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3380 3381 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3382 Scale1Unmerge.getReg(1)); 3383 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3384 Scale0Unmerge.getReg(1)); 3385 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3386 } else { 3387 Scale = DivScale1.getReg(1); 3388 } 3389 3390 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3391 .addUse(Fma4.getReg(0)) 3392 .addUse(Fma3.getReg(0)) 3393 .addUse(Mul.getReg(0)) 3394 .addUse(Scale) 3395 .setMIFlags(Flags); 3396 3397 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3398 .addUse(Fmas.getReg(0)) 3399 .addUse(RHS) 3400 .addUse(LHS) 3401 .setMIFlags(Flags); 3402 3403 MI.eraseFromParent(); 3404 return true; 3405 } 3406 3407 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3408 MachineRegisterInfo &MRI, 3409 MachineIRBuilder &B) const { 3410 Register Res = MI.getOperand(0).getReg(); 3411 Register LHS = MI.getOperand(2).getReg(); 3412 Register RHS = MI.getOperand(3).getReg(); 3413 uint16_t Flags = MI.getFlags(); 3414 3415 LLT S32 = LLT::scalar(32); 3416 LLT S1 = LLT::scalar(1); 3417 3418 auto Abs = B.buildFAbs(S32, RHS, Flags); 3419 const APFloat C0Val(1.0f); 3420 3421 auto C0 = B.buildConstant(S32, 0x6f800000); 3422 auto C1 = B.buildConstant(S32, 0x2f800000); 3423 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3424 3425 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3426 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3427 3428 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3429 3430 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3431 .addUse(Mul0.getReg(0)) 3432 .setMIFlags(Flags); 3433 3434 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3435 3436 B.buildFMul(Res, Sel, Mul1, Flags); 3437 3438 MI.eraseFromParent(); 3439 return true; 3440 } 3441 3442 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3443 // FIXME: Why do we handle this one but not other removed instructions? 3444 // 3445 // Reciprocal square root. The clamp prevents infinite results, clamping 3446 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3447 // +-max_float. 3448 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3449 MachineRegisterInfo &MRI, 3450 MachineIRBuilder &B) const { 3451 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3452 return true; 3453 3454 Register Dst = MI.getOperand(0).getReg(); 3455 Register Src = MI.getOperand(2).getReg(); 3456 auto Flags = MI.getFlags(); 3457 3458 LLT Ty = MRI.getType(Dst); 3459 3460 const fltSemantics *FltSemantics; 3461 if (Ty == LLT::scalar(32)) 3462 FltSemantics = &APFloat::IEEEsingle(); 3463 else if (Ty == LLT::scalar(64)) 3464 FltSemantics = &APFloat::IEEEdouble(); 3465 else 3466 return false; 3467 3468 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3469 .addUse(Src) 3470 .setMIFlags(Flags); 3471 3472 // We don't need to concern ourselves with the snan handling difference, since 3473 // the rsq quieted (or not) so use the one which will directly select. 
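  // Clamp the result to the largest finite value of the type (roughly 3.4e38
  // for f32, 1.8e308 for f64), using the IEEE min/max variants when the
  // function runs in IEEE mode so the clamp matches the behavior of the
  // min/max instructions that will eventually be selected.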
3474 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3475 const bool UseIEEE = MFI->getMode().IEEE; 3476 3477 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3478 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3479 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3480 3481 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3482 3483 if (UseIEEE) 3484 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3485 else 3486 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3487 MI.eraseFromParent(); 3488 return true; 3489 } 3490 3491 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3492 switch (IID) { 3493 case Intrinsic::amdgcn_ds_fadd: 3494 return AMDGPU::G_ATOMICRMW_FADD; 3495 case Intrinsic::amdgcn_ds_fmin: 3496 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3497 case Intrinsic::amdgcn_ds_fmax: 3498 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3499 default: 3500 llvm_unreachable("not a DS FP intrinsic"); 3501 } 3502 } 3503 3504 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3505 MachineInstr &MI, 3506 Intrinsic::ID IID) const { 3507 GISelChangeObserver &Observer = Helper.Observer; 3508 Observer.changingInstr(MI); 3509 3510 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3511 3512 // The remaining operands were used to set fields in the MemOperand on 3513 // construction. 3514 for (int I = 6; I > 3; --I) 3515 MI.RemoveOperand(I); 3516 3517 MI.RemoveOperand(1); // Remove the intrinsic ID. 3518 Observer.changedInstr(MI); 3519 return true; 3520 } 3521 3522 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3523 MachineRegisterInfo &MRI, 3524 MachineIRBuilder &B) const { 3525 uint64_t Offset = 3526 ST.getTargetLowering()->getImplicitParameterOffset( 3527 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3528 LLT DstTy = MRI.getType(DstReg); 3529 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3530 3531 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3532 if (!loadInputValue(KernargPtrReg, B, 3533 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3534 return false; 3535 3536 // FIXME: This should be nuw 3537 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3538 return true; 3539 } 3540 3541 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3542 MachineRegisterInfo &MRI, 3543 MachineIRBuilder &B) const { 3544 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3545 if (!MFI->isEntryFunction()) { 3546 return legalizePreloadedArgIntrin(MI, MRI, B, 3547 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3548 } 3549 3550 Register DstReg = MI.getOperand(0).getReg(); 3551 if (!getImplicitArgPtr(DstReg, MRI, B)) 3552 return false; 3553 3554 MI.eraseFromParent(); 3555 return true; 3556 } 3557 3558 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3559 MachineRegisterInfo &MRI, 3560 MachineIRBuilder &B, 3561 unsigned AddrSpace) const { 3562 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3563 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3564 Register Hi32 = Unmerge.getReg(1); 3565 3566 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3567 MI.eraseFromParent(); 3568 return true; 3569 } 3570 3571 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3572 // offset (the offset that is included in bounds checking and swizzling, to be 3573 // split between the instruction's voffset and immoffset fields) 
and soffset 3574 // (the offset that is excluded from bounds checking and swizzling, to go in 3575 // the instruction's soffset field). This function takes the first kind of 3576 // offset and figures out how to split it between voffset and immoffset. 3577 std::tuple<Register, unsigned, unsigned> 3578 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3579 Register OrigOffset) const { 3580 const unsigned MaxImm = 4095; 3581 Register BaseReg; 3582 unsigned TotalConstOffset; 3583 const LLT S32 = LLT::scalar(32); 3584 MachineRegisterInfo &MRI = *B.getMRI(); 3585 3586 std::tie(BaseReg, TotalConstOffset) = 3587 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3588 3589 unsigned ImmOffset = TotalConstOffset; 3590 3591 // If BaseReg is a pointer, convert it to int. 3592 if (MRI.getType(BaseReg).isPointer()) 3593 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3594 3595 // If the immediate value is too big for the immoffset field, put the value 3596 // and -4096 into the immoffset field so that the value that is copied/added 3597 // for the voffset field is a multiple of 4096, and it stands more chance 3598 // of being CSEd with the copy/add for another similar load/store. 3599 // However, do not do that rounding down to a multiple of 4096 if that is a 3600 // negative number, as it appears to be illegal to have a negative offset 3601 // in the vgpr, even if adding the immediate offset makes it positive. 3602 unsigned Overflow = ImmOffset & ~MaxImm; 3603 ImmOffset -= Overflow; 3604 if ((int32_t)Overflow < 0) { 3605 Overflow += ImmOffset; 3606 ImmOffset = 0; 3607 } 3608 3609 if (Overflow != 0) { 3610 if (!BaseReg) { 3611 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3612 } else { 3613 auto OverflowVal = B.buildConstant(S32, Overflow); 3614 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3615 } 3616 } 3617 3618 if (!BaseReg) 3619 BaseReg = B.buildConstant(S32, 0).getReg(0); 3620 3621 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3622 } 3623 3624 /// Handle register layout difference for f16 images for some subtargets. 
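///
/// On subtargets with unpacked D16 memory instructions, each s16 element is
/// widened into its own s32 register. On subtargets with the image-store D16
/// bug, the data is instead padded out to a dword-aligned vector; for example,
/// a <3 x s16> value is padded with undef to <6 x s16> and bitcast to
/// <3 x s32>. Otherwise the value is returned unchanged.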
3625 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3626 MachineRegisterInfo &MRI, 3627 Register Reg, 3628 bool ImageStore) const { 3629 const LLT S16 = LLT::scalar(16); 3630 const LLT S32 = LLT::scalar(32); 3631 LLT StoreVT = MRI.getType(Reg); 3632 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3633 3634 if (ST.hasUnpackedD16VMem()) { 3635 auto Unmerge = B.buildUnmerge(S16, Reg); 3636 3637 SmallVector<Register, 4> WideRegs; 3638 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3639 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3640 3641 int NumElts = StoreVT.getNumElements(); 3642 3643 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3644 } 3645 3646 if (ImageStore && ST.hasImageStoreD16Bug()) { 3647 if (StoreVT.getNumElements() == 2) { 3648 SmallVector<Register, 4> PackedRegs; 3649 Reg = B.buildBitcast(S32, Reg).getReg(0); 3650 PackedRegs.push_back(Reg); 3651 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3652 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3653 } 3654 3655 if (StoreVT.getNumElements() == 3) { 3656 SmallVector<Register, 4> PackedRegs; 3657 auto Unmerge = B.buildUnmerge(S16, Reg); 3658 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3659 PackedRegs.push_back(Unmerge.getReg(I)); 3660 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3661 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3662 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3663 } 3664 3665 if (StoreVT.getNumElements() == 4) { 3666 SmallVector<Register, 4> PackedRegs; 3667 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3668 auto Unmerge = B.buildUnmerge(S32, Reg); 3669 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3670 PackedRegs.push_back(Unmerge.getReg(I)); 3671 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3672 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3673 } 3674 3675 llvm_unreachable("invalid data type"); 3676 } 3677 3678 return Reg; 3679 } 3680 3681 Register AMDGPULegalizerInfo::fixStoreSourceType( 3682 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3683 MachineRegisterInfo *MRI = B.getMRI(); 3684 LLT Ty = MRI->getType(VData); 3685 3686 const LLT S16 = LLT::scalar(16); 3687 3688 // Fixup illegal register types for i8 stores. 3689 if (Ty == LLT::scalar(8) || Ty == S16) { 3690 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3691 return AnyExt; 3692 } 3693 3694 if (Ty.isVector()) { 3695 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3696 if (IsFormat) 3697 return handleD16VData(B, *MRI, VData); 3698 } 3699 } 3700 3701 return VData; 3702 } 3703 3704 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3705 MachineRegisterInfo &MRI, 3706 MachineIRBuilder &B, 3707 bool IsTyped, 3708 bool IsFormat) const { 3709 Register VData = MI.getOperand(1).getReg(); 3710 LLT Ty = MRI.getType(VData); 3711 LLT EltTy = Ty.getScalarType(); 3712 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3713 const LLT S32 = LLT::scalar(32); 3714 3715 VData = fixStoreSourceType(B, VData, IsFormat); 3716 Register RSrc = MI.getOperand(2).getReg(); 3717 3718 MachineMemOperand *MMO = *MI.memoperands_begin(); 3719 const int MemSize = MMO->getSize(); 3720 3721 unsigned ImmOffset; 3722 unsigned TotalOffset; 3723 3724 // The typed intrinsics add an immediate after the registers. 3725 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3726 3727 // The struct intrinsic variants add one additional operand over raw. 3728 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3729 Register VIndex; 3730 int OpOffset = 0; 3731 if (HasVIndex) { 3732 VIndex = MI.getOperand(3).getReg(); 3733 OpOffset = 1; 3734 } 3735 3736 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3737 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3738 3739 unsigned Format = 0; 3740 if (IsTyped) { 3741 Format = MI.getOperand(5 + OpOffset).getImm(); 3742 ++OpOffset; 3743 } 3744 3745 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3746 3747 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3748 if (TotalOffset != 0) 3749 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3750 3751 unsigned Opc; 3752 if (IsTyped) { 3753 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3754 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3755 } else if (IsFormat) { 3756 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3757 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3758 } else { 3759 switch (MemSize) { 3760 case 1: 3761 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3762 break; 3763 case 2: 3764 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3765 break; 3766 default: 3767 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3768 break; 3769 } 3770 } 3771 3772 if (!VIndex) 3773 VIndex = B.buildConstant(S32, 0).getReg(0); 3774 3775 auto MIB = B.buildInstr(Opc) 3776 .addUse(VData) // vdata 3777 .addUse(RSrc) // rsrc 3778 .addUse(VIndex) // vindex 3779 .addUse(VOffset) // voffset 3780 .addUse(SOffset) // soffset 3781 .addImm(ImmOffset); // offset(imm) 3782 3783 if (IsTyped) 3784 MIB.addImm(Format); 3785 3786 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3787 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3788 .addMemOperand(MMO); 3789 3790 MI.eraseFromParent(); 3791 return true; 3792 } 3793 3794 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3795 MachineRegisterInfo &MRI, 3796 MachineIRBuilder &B, 3797 bool IsFormat, 3798 bool IsTyped) const { 3799 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3800 MachineMemOperand *MMO = *MI.memoperands_begin(); 3801 const int MemSize = MMO->getSize(); 3802 const LLT S32 = LLT::scalar(32); 3803 3804 Register Dst = MI.getOperand(0).getReg(); 3805 Register RSrc = MI.getOperand(2).getReg(); 3806 3807 // The typed intrinsics add an immediate after the registers. 3808 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3809 3810 // The struct intrinsic variants add one additional operand over raw. 
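  // For example, a non-typed raw load is (dst, id, rsrc, voffset, soffset,
  // aux), six operands in total, while the struct form inserts vindex after
  // rsrc for seven; the typed variants carry one more for the format
  // immediate. The operand count check below is what distinguishes the two.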
3811 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3812 Register VIndex; 3813 int OpOffset = 0; 3814 if (HasVIndex) { 3815 VIndex = MI.getOperand(3).getReg(); 3816 OpOffset = 1; 3817 } 3818 3819 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3820 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3821 3822 unsigned Format = 0; 3823 if (IsTyped) { 3824 Format = MI.getOperand(5 + OpOffset).getImm(); 3825 ++OpOffset; 3826 } 3827 3828 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3829 unsigned ImmOffset; 3830 unsigned TotalOffset; 3831 3832 LLT Ty = MRI.getType(Dst); 3833 LLT EltTy = Ty.getScalarType(); 3834 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3835 const bool Unpacked = ST.hasUnpackedD16VMem(); 3836 3837 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3838 if (TotalOffset != 0) 3839 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3840 3841 unsigned Opc; 3842 3843 if (IsTyped) { 3844 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3845 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3846 } else if (IsFormat) { 3847 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3848 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3849 } else { 3850 switch (MemSize) { 3851 case 1: 3852 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3853 break; 3854 case 2: 3855 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3856 break; 3857 default: 3858 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3859 break; 3860 } 3861 } 3862 3863 Register LoadDstReg; 3864 3865 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3866 LLT UnpackedTy = Ty.changeElementSize(32); 3867 3868 if (IsExtLoad) 3869 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3870 else if (Unpacked && IsD16 && Ty.isVector()) 3871 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3872 else 3873 LoadDstReg = Dst; 3874 3875 if (!VIndex) 3876 VIndex = B.buildConstant(S32, 0).getReg(0); 3877 3878 auto MIB = B.buildInstr(Opc) 3879 .addDef(LoadDstReg) // vdata 3880 .addUse(RSrc) // rsrc 3881 .addUse(VIndex) // vindex 3882 .addUse(VOffset) // voffset 3883 .addUse(SOffset) // soffset 3884 .addImm(ImmOffset); // offset(imm) 3885 3886 if (IsTyped) 3887 MIB.addImm(Format); 3888 3889 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3890 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3891 .addMemOperand(MMO); 3892 3893 if (LoadDstReg != Dst) { 3894 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3895 3896 // Widen result for extending loads was widened. 3897 if (IsExtLoad) 3898 B.buildTrunc(Dst, LoadDstReg); 3899 else { 3900 // Repack to original 16-bit vector result 3901 // FIXME: G_TRUNC should work, but legalization currently fails 3902 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3903 SmallVector<Register, 4> Repack; 3904 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3905 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3906 B.buildMerge(Dst, Repack); 3907 } 3908 } 3909 3910 MI.eraseFromParent(); 3911 return true; 3912 } 3913 3914 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3915 MachineIRBuilder &B, 3916 bool IsInc) const { 3917 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 3918 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3919 B.buildInstr(Opc) 3920 .addDef(MI.getOperand(0).getReg()) 3921 .addUse(MI.getOperand(2).getReg()) 3922 .addUse(MI.getOperand(3).getReg()) 3923 .cloneMemRefs(MI); 3924 MI.eraseFromParent(); 3925 return true; 3926 } 3927 3928 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3929 switch (IntrID) { 3930 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3931 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3933 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3934 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3936 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3937 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3938 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3939 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3940 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3941 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3942 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3943 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3945 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3946 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3947 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3948 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3949 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3951 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3952 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3953 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3954 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3955 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3956 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3957 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3958 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3960 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3961 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3962 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3963 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3964 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3966 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3967 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3968 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3969 case Intrinsic::amdgcn_buffer_atomic_fadd: 3970 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3971 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3972 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3973 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3974 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3976 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3977 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3979 default: 3980 llvm_unreachable("unhandled atomic opcode"); 3981 } 3982 } 3983 3984 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3985 MachineIRBuilder &B, 3986 Intrinsic::ID IID) const { 3987 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3988 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3989 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3990 3991 Register Dst; 3992 3993 int OpOffset = 0; 3994 if (HasReturn) { 3995 // A few FP atomics do not support return values. 
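    // (e.g. the no-return form of buffer_atomic_fadd has no def, so OpOffset
    // shifts every subsequent operand index down by one.)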
3996 Dst = MI.getOperand(0).getReg(); 3997 } else { 3998 OpOffset = -1; 3999 } 4000 4001 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4002 Register CmpVal; 4003 4004 if (IsCmpSwap) { 4005 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4006 ++OpOffset; 4007 } 4008 4009 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4010 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4011 4012 // The struct intrinsic variants add one additional operand over raw. 4013 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4014 Register VIndex; 4015 if (HasVIndex) { 4016 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4017 ++OpOffset; 4018 } 4019 4020 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4021 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4022 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4023 4024 MachineMemOperand *MMO = *MI.memoperands_begin(); 4025 4026 unsigned ImmOffset; 4027 unsigned TotalOffset; 4028 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4029 if (TotalOffset != 0) 4030 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4031 4032 if (!VIndex) 4033 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4034 4035 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4036 4037 if (HasReturn) 4038 MIB.addDef(Dst); 4039 4040 MIB.addUse(VData); // vdata 4041 4042 if (IsCmpSwap) 4043 MIB.addReg(CmpVal); 4044 4045 MIB.addUse(RSrc) // rsrc 4046 .addUse(VIndex) // vindex 4047 .addUse(VOffset) // voffset 4048 .addUse(SOffset) // soffset 4049 .addImm(ImmOffset) // offset(imm) 4050 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4051 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4052 .addMemOperand(MMO); 4053 4054 MI.eraseFromParent(); 4055 return true; 4056 } 4057 4058 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4059 /// vector with s16 typed elements. 4060 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4061 SmallVectorImpl<Register> &PackedAddrs, 4062 unsigned ArgOffset, 4063 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4064 bool IsA16, bool IsG16) { 4065 const LLT S16 = LLT::scalar(16); 4066 const LLT V2S16 = LLT::vector(2, 16); 4067 auto EndIdx = Intr->VAddrEnd; 4068 4069 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4070 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4071 if (!SrcOp.isReg()) 4072 continue; // _L to _LZ may have eliminated this. 4073 4074 Register AddrReg = SrcOp.getReg(); 4075 4076 if (I < Intr->GradientStart) { 4077 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4078 PackedAddrs.push_back(AddrReg); 4079 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4080 (I >= Intr->CoordStart && !IsA16)) { 4081 // Handle any gradient or coordinate operands that should not be packed 4082 PackedAddrs.push_back(AddrReg); 4083 } else { 4084 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4085 // derivatives dx/dh and dx/dv are packed with undef. 
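      // For example, a 2D sample has NumGradients == 4, so (dx/dh, dy/dh) and
      // (dx/dv, dy/dv) each form one packed register, while a 1D sample
      // (NumGradients == 2) pairs dx/dh and dx/dv each with an undef high half.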
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4169 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4170 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4171 4172 MachineRegisterInfo *MRI = B.getMRI(); 4173 const LLT S32 = LLT::scalar(32); 4174 const LLT S16 = LLT::scalar(16); 4175 const LLT V2S16 = LLT::vector(2, 16); 4176 4177 unsigned DMask = 0; 4178 4179 // Check for 16 bit addresses and pack if true. 4180 LLT GradTy = 4181 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4182 LLT AddrTy = 4183 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4184 const bool IsG16 = GradTy == S16; 4185 const bool IsA16 = AddrTy == S16; 4186 4187 int DMaskLanes = 0; 4188 if (!BaseOpcode->Atomic) { 4189 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4190 if (BaseOpcode->Gather4) { 4191 DMaskLanes = 4; 4192 } else if (DMask != 0) { 4193 DMaskLanes = countPopulation(DMask); 4194 } else if (!IsTFE && !BaseOpcode->Store) { 4195 // If dmask is 0, this is a no-op load. This can be eliminated. 4196 B.buildUndef(MI.getOperand(0)); 4197 MI.eraseFromParent(); 4198 return true; 4199 } 4200 } 4201 4202 Observer.changingInstr(MI); 4203 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4204 4205 unsigned NewOpcode = NumDefs == 0 ? 4206 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4207 4208 // Track that we legalized this 4209 MI.setDesc(B.getTII().get(NewOpcode)); 4210 4211 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4212 // dmask to be at least 1 otherwise the instruction will fail 4213 if (IsTFE && DMask == 0) { 4214 DMask = 0x1; 4215 DMaskLanes = 1; 4216 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4217 } 4218 4219 if (BaseOpcode->Atomic) { 4220 Register VData0 = MI.getOperand(2).getReg(); 4221 LLT Ty = MRI->getType(VData0); 4222 4223 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4224 if (Ty.isVector()) 4225 return false; 4226 4227 if (BaseOpcode->AtomicX2) { 4228 Register VData1 = MI.getOperand(3).getReg(); 4229 // The two values are packed in one register. 4230 LLT PackedTy = LLT::vector(2, Ty); 4231 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4232 MI.getOperand(2).setReg(Concat.getReg(0)); 4233 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4234 } 4235 } 4236 4237 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4238 4239 // Optimize _L to _LZ when _L is zero 4240 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4241 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4242 const ConstantFP *ConstantLod; 4243 4244 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4245 m_GFCst(ConstantLod))) { 4246 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4247 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4248 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4249 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4250 Intr->Dim); 4251 4252 // The starting indexes should remain in the same place. 
4253 --CorrectedNumVAddrs; 4254 4255 MI.getOperand(MI.getNumExplicitDefs()) 4256 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4257 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4258 Intr = NewImageDimIntr; 4259 } 4260 } 4261 } 4262 4263 // Optimize _mip away, when 'lod' is zero 4264 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4265 int64_t ConstantLod; 4266 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4267 m_ICst(ConstantLod))) { 4268 if (ConstantLod == 0) { 4269 // TODO: Change intrinsic opcode and remove operand instead or replacing 4270 // it with 0, as the _L to _LZ handling is done above. 4271 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4272 --CorrectedNumVAddrs; 4273 } 4274 } 4275 } 4276 4277 // Rewrite the addressing register layout before doing anything else. 4278 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4279 // 16 bit gradients are supported, but are tied to the A16 control 4280 // so both gradients and addresses must be 16 bit 4281 return false; 4282 } 4283 4284 if (IsA16 && !ST.hasA16()) { 4285 // A16 not supported 4286 return false; 4287 } 4288 4289 if (IsA16 || IsG16) { 4290 if (Intr->NumVAddrs > 1) { 4291 SmallVector<Register, 4> PackedRegs; 4292 4293 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4294 IsG16); 4295 4296 // See also below in the non-a16 branch 4297 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4298 4299 if (!UseNSA && PackedRegs.size() > 1) { 4300 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4301 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4302 PackedRegs[0] = Concat.getReg(0); 4303 PackedRegs.resize(1); 4304 } 4305 4306 const unsigned NumPacked = PackedRegs.size(); 4307 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4308 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4309 if (!SrcOp.isReg()) { 4310 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4311 continue; 4312 } 4313 4314 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4315 4316 if (I - Intr->VAddrStart < NumPacked) 4317 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4318 else 4319 SrcOp.setReg(AMDGPU::NoRegister); 4320 } 4321 } 4322 } else { 4323 // If the register allocator cannot place the address registers contiguously 4324 // without introducing moves, then using the non-sequential address encoding 4325 // is always preferable, since it saves VALU instructions and is usually a 4326 // wash in terms of code size or even better. 4327 // 4328 // However, we currently have no way of hinting to the register allocator 4329 // that MIMG addresses should be placed contiguously when it is possible to 4330 // do so, so force non-NSA for the common 2-address case as a heuristic. 4331 // 4332 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4333 // allocation when possible. 4334 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4335 4336 if (!UseNSA && Intr->NumVAddrs > 1) 4337 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4338 Intr->NumVAddrs); 4339 } 4340 4341 int Flags = 0; 4342 if (IsA16) 4343 Flags |= 1; 4344 if (IsG16) 4345 Flags |= 2; 4346 MI.addOperand(MachineOperand::CreateImm(Flags)); 4347 4348 if (BaseOpcode->Store) { // No TFE for stores? 
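    // Stores only need the D16 data repacking below; the result widening and
    // TFE handling further down in this function apply to loads only.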
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4447 4448 if (ResultNumRegs == 1) { 4449 assert(!IsTFE); 4450 ResultRegs[0] = NewResultReg; 4451 } else { 4452 // We have to repack into a new vector of some kind. 4453 for (int I = 0; I != NumDataRegs; ++I) 4454 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4455 B.buildUnmerge(ResultRegs, NewResultReg); 4456 4457 // Drop the final TFE element to get the data part. The TFE result is 4458 // directly written to the right place already. 4459 if (IsTFE) 4460 ResultRegs.resize(NumDataRegs); 4461 } 4462 4463 // For an s16 scalar result, we form an s32 result with a truncate regardless 4464 // of packed vs. unpacked. 4465 if (IsD16 && !Ty.isVector()) { 4466 B.buildTrunc(DstReg, ResultRegs[0]); 4467 return true; 4468 } 4469 4470 // Avoid a build/concat_vector of 1 entry. 4471 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4472 B.buildBitcast(DstReg, ResultRegs[0]); 4473 return true; 4474 } 4475 4476 assert(Ty.isVector()); 4477 4478 if (IsD16) { 4479 // For packed D16 results with TFE enabled, all the data components are 4480 // S32. Cast back to the expected type. 4481 // 4482 // TODO: We don't really need to use load s32 elements. We would only need one 4483 // cast for the TFE result if a multiple of v2s16 was used. 4484 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4485 for (Register &Reg : ResultRegs) 4486 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4487 } else if (ST.hasUnpackedD16VMem()) { 4488 for (Register &Reg : ResultRegs) 4489 Reg = B.buildTrunc(S16, Reg).getReg(0); 4490 } 4491 } 4492 4493 auto padWithUndef = [&](LLT Ty, int NumElts) { 4494 if (NumElts == 0) 4495 return; 4496 Register Undef = B.buildUndef(Ty).getReg(0); 4497 for (int I = 0; I != NumElts; ++I) 4498 ResultRegs.push_back(Undef); 4499 }; 4500 4501 // Pad out any elements eliminated due to the dmask. 4502 LLT ResTy = MRI->getType(ResultRegs[0]); 4503 if (!ResTy.isVector()) { 4504 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4505 B.buildBuildVector(DstReg, ResultRegs); 4506 return true; 4507 } 4508 4509 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4510 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4511 4512 // Deal with the one annoying legal case. 4513 const LLT V3S16 = LLT::vector(3, 16); 4514 if (Ty == V3S16) { 4515 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4516 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4517 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4518 return true; 4519 } 4520 4521 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4522 B.buildConcatVectors(DstReg, ResultRegs); 4523 return true; 4524 } 4525 4526 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4527 LegalizerHelper &Helper, MachineInstr &MI) const { 4528 MachineIRBuilder &B = Helper.MIRBuilder; 4529 GISelChangeObserver &Observer = Helper.Observer; 4530 4531 Register Dst = MI.getOperand(0).getReg(); 4532 LLT Ty = B.getMRI()->getType(Dst); 4533 unsigned Size = Ty.getSizeInBits(); 4534 MachineFunction &MF = B.getMF(); 4535 4536 Observer.changingInstr(MI); 4537 4538 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4539 Ty = getBitcastRegisterType(Ty); 4540 Helper.bitcastDst(MI, Ty, 0); 4541 Dst = MI.getOperand(0).getReg(); 4542 B.setInsertPt(B.getMBB(), MI); 4543 } 4544 4545 // FIXME: We don't really need this intermediate instruction. The intrinsic 4546 // should be fixed to have a memory operand. Since it's readnone, we're not 4547 // allowed to add one. 
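  // The rewrite below switches to the buffer-load pseudo, drops the intrinsic
  // ID, attaches a hand-built invariant MMO, and widens any non-power-of-2
  // result to the next power of two (e.g. s96 -> s128, <3 x s32> -> <4 x s32>).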
4548 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4549 MI.RemoveOperand(1); // Remove intrinsic ID 4550 4551 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4552 // TODO: Should this use datalayout alignment? 4553 const unsigned MemSize = (Size + 7) / 8; 4554 const Align MemAlign(4); 4555 MachineMemOperand *MMO = MF.getMachineMemOperand( 4556 MachinePointerInfo(), 4557 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4558 MachineMemOperand::MOInvariant, 4559 MemSize, MemAlign); 4560 MI.addMemOperand(MF, MMO); 4561 4562 // There are no 96-bit result scalar loads, but widening to 128-bit should 4563 // always be legal. We may need to restore this to a 96-bit result if it turns 4564 // out this needs to be converted to a vector load during RegBankSelect. 4565 if (!isPowerOf2_32(Size)) { 4566 if (Ty.isVector()) 4567 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4568 else 4569 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4570 } 4571 4572 Observer.changedInstr(MI); 4573 return true; 4574 } 4575 4576 // TODO: Move to selection 4577 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4578 MachineRegisterInfo &MRI, 4579 MachineIRBuilder &B) const { 4580 if (!ST.isTrapHandlerEnabled() || 4581 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4582 return legalizeTrapEndpgm(MI, MRI, B); 4583 4584 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4585 switch (*HsaAbiVer) { 4586 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4587 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4588 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4589 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4590 return ST.supportsGetDoorbellID() ? 4591 legalizeTrapHsa(MI, MRI, B) : 4592 legalizeTrapHsaQueuePtr(MI, MRI, B); 4593 } 4594 } 4595 4596 llvm_unreachable("Unknown trap handler"); 4597 } 4598 4599 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4600 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4601 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4602 MI.eraseFromParent(); 4603 return true; 4604 } 4605 4606 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4607 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4608 // Pass queue pointer to trap handler as input, and insert trap instruction 4609 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4610 Register LiveIn = 4611 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4612 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4613 return false; 4614 4615 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4616 B.buildCopy(SGPR01, LiveIn); 4617 B.buildInstr(AMDGPU::S_TRAP) 4618 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4619 .addReg(SGPR01, RegState::Implicit); 4620 4621 MI.eraseFromParent(); 4622 return true; 4623 } 4624 4625 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4626 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4627 B.buildInstr(AMDGPU::S_TRAP) 4628 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4629 MI.eraseFromParent(); 4630 return true; 4631 } 4632 4633 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4634 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4635 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4636 // accordingly 4637 if (!ST.isTrapHandlerEnabled() || 4638 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4639 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4640 "debugtrap handler not supported", 4641 MI.getDebugLoc(), DS_Warning); 4642 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4643 Ctx.diagnose(NoTrap); 4644 } else { 4645 // Insert debug-trap instruction 4646 B.buildInstr(AMDGPU::S_TRAP) 4647 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4648 } 4649 4650 MI.eraseFromParent(); 4651 return true; 4652 } 4653 4654 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4655 MachineIRBuilder &B) const { 4656 MachineRegisterInfo &MRI = *B.getMRI(); 4657 const LLT S16 = LLT::scalar(16); 4658 const LLT S32 = LLT::scalar(32); 4659 4660 Register DstReg = MI.getOperand(0).getReg(); 4661 Register NodePtr = MI.getOperand(2).getReg(); 4662 Register RayExtent = MI.getOperand(3).getReg(); 4663 Register RayOrigin = MI.getOperand(4).getReg(); 4664 Register RayDir = MI.getOperand(5).getReg(); 4665 Register RayInvDir = MI.getOperand(6).getReg(); 4666 Register TDescr = MI.getOperand(7).getReg(); 4667 4668 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4669 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4670 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4671 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4672 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4673 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4674 4675 SmallVector<Register, 12> Ops; 4676 if (Is64) { 4677 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4678 Ops.push_back(Unmerge.getReg(0)); 4679 Ops.push_back(Unmerge.getReg(1)); 4680 } else { 4681 Ops.push_back(NodePtr); 4682 } 4683 Ops.push_back(RayExtent); 4684 4685 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4686 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4687 Ops.push_back(Unmerge.getReg(0)); 4688 Ops.push_back(Unmerge.getReg(1)); 4689 Ops.push_back(Unmerge.getReg(2)); 4690 }; 4691 4692 packLanes(RayOrigin); 4693 if (IsA16) { 4694 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4695 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4696 Register R1 = MRI.createGenericVirtualRegister(S32); 4697 Register R2 = MRI.createGenericVirtualRegister(S32); 4698 Register R3 = MRI.createGenericVirtualRegister(S32); 4699 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4700 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4701 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4702 Ops.push_back(R1); 4703 Ops.push_back(R2); 4704 Ops.push_back(R3); 4705 } else { 4706 packLanes(RayDir); 4707 packLanes(RayInvDir); 4708 } 4709 4710 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4711 .addDef(DstReg) 4712 .addImm(Opcode); 4713 4714 for (Register R : Ops) { 4715 MIB.addUse(R); 4716 } 4717 4718 MIB.addUse(TDescr) 4719 .addImm(IsA16 ? 1 : 0) 4720 .cloneMemRefs(MI); 4721 4722 MI.eraseFromParent(); 4723 return true; 4724 } 4725 4726 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4727 MachineInstr &MI) const { 4728 MachineIRBuilder &B = Helper.MIRBuilder; 4729 MachineRegisterInfo &MRI = *B.getMRI(); 4730 4731 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
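  // The IRTranslator pairs amdgcn.if/else/loop with a G_BRCOND on the result;
  // the cases below rewrite that pair into SI_IF / SI_ELSE / SI_LOOP and
  // retarget (or rebuild) the unconditional branch, swapping the branch
  // targets if the condition was negated.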
4732 auto IntrID = MI.getIntrinsicID(); 4733 switch (IntrID) { 4734 case Intrinsic::amdgcn_if: 4735 case Intrinsic::amdgcn_else: { 4736 MachineInstr *Br = nullptr; 4737 MachineBasicBlock *UncondBrTarget = nullptr; 4738 bool Negated = false; 4739 if (MachineInstr *BrCond = 4740 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4741 const SIRegisterInfo *TRI 4742 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4743 4744 Register Def = MI.getOperand(1).getReg(); 4745 Register Use = MI.getOperand(3).getReg(); 4746 4747 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4748 4749 if (Negated) 4750 std::swap(CondBrTarget, UncondBrTarget); 4751 4752 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4753 if (IntrID == Intrinsic::amdgcn_if) { 4754 B.buildInstr(AMDGPU::SI_IF) 4755 .addDef(Def) 4756 .addUse(Use) 4757 .addMBB(UncondBrTarget); 4758 } else { 4759 B.buildInstr(AMDGPU::SI_ELSE) 4760 .addDef(Def) 4761 .addUse(Use) 4762 .addMBB(UncondBrTarget); 4763 } 4764 4765 if (Br) { 4766 Br->getOperand(0).setMBB(CondBrTarget); 4767 } else { 4768 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4769 // since we're swapping branch targets it needs to be reinserted. 4770 // FIXME: IRTranslator should probably not do this 4771 B.buildBr(*CondBrTarget); 4772 } 4773 4774 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4775 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4776 MI.eraseFromParent(); 4777 BrCond->eraseFromParent(); 4778 return true; 4779 } 4780 4781 return false; 4782 } 4783 case Intrinsic::amdgcn_loop: { 4784 MachineInstr *Br = nullptr; 4785 MachineBasicBlock *UncondBrTarget = nullptr; 4786 bool Negated = false; 4787 if (MachineInstr *BrCond = 4788 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4789 const SIRegisterInfo *TRI 4790 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4791 4792 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4793 Register Reg = MI.getOperand(2).getReg(); 4794 4795 if (Negated) 4796 std::swap(CondBrTarget, UncondBrTarget); 4797 4798 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4799 B.buildInstr(AMDGPU::SI_LOOP) 4800 .addUse(Reg) 4801 .addMBB(UncondBrTarget); 4802 4803 if (Br) 4804 Br->getOperand(0).setMBB(CondBrTarget); 4805 else 4806 B.buildBr(*CondBrTarget); 4807 4808 MI.eraseFromParent(); 4809 BrCond->eraseFromParent(); 4810 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4811 return true; 4812 } 4813 4814 return false; 4815 } 4816 case Intrinsic::amdgcn_kernarg_segment_ptr: 4817 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4818 // This only makes sense to call in a kernel, so just lower to null. 
4819 B.buildConstant(MI.getOperand(0).getReg(), 0); 4820 MI.eraseFromParent(); 4821 return true; 4822 } 4823 4824 return legalizePreloadedArgIntrin( 4825 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4826 case Intrinsic::amdgcn_implicitarg_ptr: 4827 return legalizeImplicitArgPtr(MI, MRI, B); 4828 case Intrinsic::amdgcn_workitem_id_x: 4829 return legalizePreloadedArgIntrin(MI, MRI, B, 4830 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4831 case Intrinsic::amdgcn_workitem_id_y: 4832 return legalizePreloadedArgIntrin(MI, MRI, B, 4833 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4834 case Intrinsic::amdgcn_workitem_id_z: 4835 return legalizePreloadedArgIntrin(MI, MRI, B, 4836 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4837 case Intrinsic::amdgcn_workgroup_id_x: 4838 return legalizePreloadedArgIntrin(MI, MRI, B, 4839 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4840 case Intrinsic::amdgcn_workgroup_id_y: 4841 return legalizePreloadedArgIntrin(MI, MRI, B, 4842 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4843 case Intrinsic::amdgcn_workgroup_id_z: 4844 return legalizePreloadedArgIntrin(MI, MRI, B, 4845 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4846 case Intrinsic::amdgcn_dispatch_ptr: 4847 return legalizePreloadedArgIntrin(MI, MRI, B, 4848 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4849 case Intrinsic::amdgcn_queue_ptr: 4850 return legalizePreloadedArgIntrin(MI, MRI, B, 4851 AMDGPUFunctionArgInfo::QUEUE_PTR); 4852 case Intrinsic::amdgcn_implicit_buffer_ptr: 4853 return legalizePreloadedArgIntrin( 4854 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4855 case Intrinsic::amdgcn_dispatch_id: 4856 return legalizePreloadedArgIntrin(MI, MRI, B, 4857 AMDGPUFunctionArgInfo::DISPATCH_ID); 4858 case Intrinsic::amdgcn_fdiv_fast: 4859 return legalizeFDIVFastIntrin(MI, MRI, B); 4860 case Intrinsic::amdgcn_is_shared: 4861 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4862 case Intrinsic::amdgcn_is_private: 4863 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4864 case Intrinsic::amdgcn_wavefrontsize: { 4865 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4866 MI.eraseFromParent(); 4867 return true; 4868 } 4869 case Intrinsic::amdgcn_s_buffer_load: 4870 return legalizeSBufferLoad(Helper, MI); 4871 case Intrinsic::amdgcn_raw_buffer_store: 4872 case Intrinsic::amdgcn_struct_buffer_store: 4873 return legalizeBufferStore(MI, MRI, B, false, false); 4874 case Intrinsic::amdgcn_raw_buffer_store_format: 4875 case Intrinsic::amdgcn_struct_buffer_store_format: 4876 return legalizeBufferStore(MI, MRI, B, false, true); 4877 case Intrinsic::amdgcn_raw_tbuffer_store: 4878 case Intrinsic::amdgcn_struct_tbuffer_store: 4879 return legalizeBufferStore(MI, MRI, B, true, true); 4880 case Intrinsic::amdgcn_raw_buffer_load: 4881 case Intrinsic::amdgcn_struct_buffer_load: 4882 return legalizeBufferLoad(MI, MRI, B, false, false); 4883 case Intrinsic::amdgcn_raw_buffer_load_format: 4884 case Intrinsic::amdgcn_struct_buffer_load_format: 4885 return legalizeBufferLoad(MI, MRI, B, true, false); 4886 case Intrinsic::amdgcn_raw_tbuffer_load: 4887 case Intrinsic::amdgcn_struct_tbuffer_load: 4888 return legalizeBufferLoad(MI, MRI, B, true, true); 4889 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4890 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4891 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4892 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4893 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4894 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4895 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4896 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4897 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4898 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4899 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4900 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4901 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4902 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4903 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4904 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4905 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4906 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4907 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4908 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4909 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4910 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4911 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4912 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4913 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4914 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4915 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4916 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4917 case Intrinsic::amdgcn_buffer_atomic_fadd: 4918 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4919 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4920 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4921 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4922 return legalizeBufferAtomic(MI, B, IntrID); 4923 case Intrinsic::amdgcn_atomic_inc: 4924 return legalizeAtomicIncDec(MI, B, true); 4925 case Intrinsic::amdgcn_atomic_dec: 4926 return legalizeAtomicIncDec(MI, B, false); 4927 case Intrinsic::trap: 4928 return legalizeTrapIntrinsic(MI, MRI, B); 4929 case Intrinsic::debugtrap: 4930 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4931 case Intrinsic::amdgcn_rsq_clamp: 4932 return legalizeRsqClampIntrinsic(MI, MRI, B); 4933 case Intrinsic::amdgcn_ds_fadd: 4934 case Intrinsic::amdgcn_ds_fmin: 4935 case Intrinsic::amdgcn_ds_fmax: 4936 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4937 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4938 return legalizeBVHIntrinsic(MI, B); 4939 default: { 4940 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4941 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4942 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4943 return true; 4944 } 4945 } 4946 4947 return true; 4948 } 4949