//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
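/// For example, this matches <5 x s16> (80 bits), which oneMoreElement below
/// widens to <6 x s16>; vectors of 32-bit or wider elements are excluded.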
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
172 } 173 174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 178 }; 179 } 180 181 static bool isRegisterSize(unsigned Size) { 182 return Size % 32 == 0 && Size <= MaxRegisterSize; 183 } 184 185 static bool isRegisterVectorElementType(LLT EltTy) { 186 const int EltSize = EltTy.getSizeInBits(); 187 return EltSize == 16 || EltSize % 32 == 0; 188 } 189 190 static bool isRegisterVectorType(LLT Ty) { 191 const int EltSize = Ty.getElementType().getSizeInBits(); 192 return EltSize == 32 || EltSize == 64 || 193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 194 EltSize == 128 || EltSize == 256; 195 } 196 197 static bool isRegisterType(LLT Ty) { 198 if (!isRegisterSize(Ty.getSizeInBits())) 199 return false; 200 201 if (Ty.isVector()) 202 return isRegisterVectorType(Ty); 203 204 return true; 205 } 206 207 // Any combination of 32 or 64-bit elements up the maximum register size, and 208 // multiples of v2s16. 209 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 210 return [=](const LegalityQuery &Query) { 211 return isRegisterType(Query.Types[TypeIdx]); 212 }; 213 } 214 215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 216 return [=](const LegalityQuery &Query) { 217 const LLT QueryTy = Query.Types[TypeIdx]; 218 if (!QueryTy.isVector()) 219 return false; 220 const LLT EltTy = QueryTy.getElementType(); 221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 222 }; 223 } 224 225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 226 return [=](const LegalityQuery &Query) { 227 const LLT Ty = Query.Types[TypeIdx]; 228 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 230 }; 231 } 232 233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 234 // handle some operations by just promoting the register during 235 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 237 bool IsLoad) { 238 switch (AS) { 239 case AMDGPUAS::PRIVATE_ADDRESS: 240 // FIXME: Private element size. 241 return ST.enableFlatScratch() ? 128 : 32; 242 case AMDGPUAS::LOCAL_ADDRESS: 243 return ST.useDS128() ? 128 : 64; 244 case AMDGPUAS::GLOBAL_ADDRESS: 245 case AMDGPUAS::CONSTANT_ADDRESS: 246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 247 // Treat constant and global as identical. SMRD loads are sometimes usable for 248 // global loads (ideally constant address space should be eliminated) 249 // depending on the context. Legality cannot be context dependent, but 250 // RegBankSelect can split the load as necessary depending on the pointer 251 // register bank/uniformity and if the memory is invariant or not written in a 252 // kernel. 253 return IsLoad ? 512 : 128; 254 default: 255 // Flat addresses may contextually need to be split to 32-bit parts if they 256 // may alias scratch depending on the subtarget. 
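    // Note: 128 bits here corresponds to a dwordx4 access, the widest single
    // flat load/store available.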
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
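/// For example, a 24-bit (3-byte) load with at least 32-bit alignment can
/// typically be widened to a 32-bit load, since the alignment guarantees the
/// extra byte is dereferenceable.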
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 371 unsigned AlignInBits, unsigned AddrSpace, 372 unsigned Opcode) { 373 // We don't want to widen cases that are naturally legal. 374 if (isPowerOf2_32(SizeInBits)) 375 return false; 376 377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 378 // end up widening these for a scalar load during RegBankSelect, since there 379 // aren't 96-bit scalar loads. 380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 381 return false; 382 383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 384 return false; 385 386 // A load is known dereferenceable up to the alignment, so it's legal to widen 387 // to it. 388 // 389 // TODO: Could check dereferenceable for less aligned cases. 390 unsigned RoundedSize = NextPowerOf2(SizeInBits); 391 if (AlignInBits < RoundedSize) 392 return false; 393 394 // Do not widen if it would introduce a slow unaligned load. 395 const SITargetLowering *TLI = ST.getTargetLowering(); 396 bool Fast = false; 397 return TLI->allowsMisalignedMemoryAccessesImpl( 398 RoundedSize, AddrSpace, Align(AlignInBits / 8), 399 MachineMemOperand::MOLoad, &Fast) && 400 Fast; 401 } 402 403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 404 unsigned Opcode) { 405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 406 return false; 407 408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 409 Query.MMODescrs[0].AlignInBits, 410 Query.Types[1].getAddressSpace(), Opcode); 411 } 412 413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 414 const GCNTargetMachine &TM) 415 : ST(ST_) { 416 using namespace TargetOpcode; 417 418 auto GetAddrSpacePtr = [&TM](unsigned AS) { 419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 420 }; 421 422 const LLT S1 = LLT::scalar(1); 423 const LLT S8 = LLT::scalar(8); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S8 = LLT::vector(2, 8); 433 const LLT V2S16 = LLT::vector(2, 16); 434 const LLT V4S16 = LLT::vector(4, 16); 435 436 const LLT V2S32 = LLT::vector(2, 32); 437 const LLT V3S32 = LLT::vector(3, 32); 438 const LLT V4S32 = LLT::vector(4, 32); 439 const LLT V5S32 = LLT::vector(5, 32); 440 const LLT V6S32 = LLT::vector(6, 32); 441 const LLT V7S32 = LLT::vector(7, 32); 442 const LLT V8S32 = LLT::vector(8, 32); 443 const LLT V9S32 = LLT::vector(9, 32); 444 const LLT V10S32 = LLT::vector(10, 32); 445 const LLT V11S32 = LLT::vector(11, 32); 446 const LLT V12S32 = LLT::vector(12, 32); 447 const LLT V13S32 = LLT::vector(13, 32); 448 const LLT V14S32 = LLT::vector(14, 32); 449 const LLT V15S32 = LLT::vector(15, 32); 450 const LLT V16S32 = LLT::vector(16, 32); 451 const LLT V32S32 = LLT::vector(32, 32); 452 453 const LLT V2S64 = LLT::vector(2, 64); 454 const LLT V3S64 = LLT::vector(3, 64); 455 const LLT V4S64 = LLT::vector(4, 64); 456 const LLT V5S64 = LLT::vector(5, 64); 457 const LLT V6S64 = LLT::vector(6, 64); 458 const LLT V7S64 = LLT::vector(7, 64); 459 const LLT V8S64 = LLT::vector(8, 64); 460 const LLT V16S64 = LLT::vector(16, 64); 461 462 std::initializer_list<LLT> AllS32Vectors = 463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 465 
std::initializer_list<LLT> AllS64Vectors = 466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 467 468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 475 476 const LLT CodePtr = FlatPtr; 477 478 const std::initializer_list<LLT> AddrSpaces64 = { 479 GlobalPtr, ConstantPtr, FlatPtr 480 }; 481 482 const std::initializer_list<LLT> AddrSpaces32 = { 483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 484 }; 485 486 const std::initializer_list<LLT> FPTypesBase = { 487 S32, S64 488 }; 489 490 const std::initializer_list<LLT> FPTypes16 = { 491 S32, S64, S16 492 }; 493 494 const std::initializer_list<LLT> FPTypesPK16 = { 495 S32, S64, S16, V2S16 496 }; 497 498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 499 500 setAction({G_BRCOND, S1}, Legal); // VCC branches 501 setAction({G_BRCOND, S32}, Legal); // SCC branches 502 503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 504 // elements for v3s16 505 getActionDefinitionsBuilder(G_PHI) 506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 507 .legalFor(AllS32Vectors) 508 .legalFor(AllS64Vectors) 509 .legalFor(AddrSpaces64) 510 .legalFor(AddrSpaces32) 511 .legalIf(isPointer(0)) 512 .clampScalar(0, S16, S256) 513 .widenScalarToNextPow2(0, 32) 514 .clampMaxNumElements(0, S32, 16) 515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 516 .scalarize(0); 517 518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 519 // Full set of gfx9 features. 520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 521 .legalFor({S32, S16, V2S16}) 522 .clampScalar(0, S16, S32) 523 .clampMaxNumElements(0, S16, 2) 524 .scalarize(0) 525 .widenScalarToNextPow2(0, 32); 526 527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 528 .legalFor({S32, S16, V2S16}) // Clamp modifier 529 .minScalarOrElt(0, S16) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32) 533 .lower(); 534 } else if (ST.has16BitInsts()) { 535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 536 .legalFor({S32, S16}) 537 .clampScalar(0, S16, S32) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 540 541 // Technically the saturating operations require clamp bit support, but this 542 // was introduced at the same time as 16-bit operations. 543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 544 .legalFor({S32, S16}) // Clamp modifier 545 .minScalar(0, S16) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 16) 548 .lower(); 549 550 // We're just lowering this, but it helps get a better result to try to 551 // coerce to the desired type first. 552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 553 .minScalar(0, S16) 554 .scalarize(0) 555 .lower(); 556 } else { 557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.hasIntClamp()) { 563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 564 .legalFor({S32}) // Clamp modifier. 
565 .scalarize(0) 566 .minScalarOrElt(0, S32) 567 .lower(); 568 } else { 569 // Clamp bit support was added in VI, along with 16-bit operations. 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .minScalar(0, S32) 572 .scalarize(0) 573 .lower(); 574 } 575 576 // FIXME: DAG expansion gets better results. The widening uses the smaller 577 // range values and goes for the min/max lowering directly. 578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 585 .customFor({S32, S64}) 586 .clampScalar(0, S32, S64) 587 .widenScalarToNextPow2(0, 32) 588 .scalarize(0); 589 590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 591 .legalFor({S32}) 592 .maxScalarOrElt(0, S32); 593 594 if (ST.hasVOP3PInsts()) { 595 Mulh 596 .clampMaxNumElements(0, S8, 2) 597 .lowerFor({V2S8}); 598 } 599 600 Mulh 601 .scalarize(0) 602 .lower(); 603 604 // Report legal for any types we can handle anywhere. For the cases only legal 605 // on the SALU, RegBankSelect will be able to re-legalize. 606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 608 .clampScalar(0, S32, S64) 609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 611 .widenScalarToNextPow2(0) 612 .scalarize(0); 613 614 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 616 .legalFor({{S32, S1}, {S32, S32}}) 617 .minScalar(0, S32) 618 // TODO: .scalarize(0) 619 .lower(); 620 621 getActionDefinitionsBuilder(G_BITCAST) 622 // Don't worry about the size constraint. 623 .legalIf(all(isRegisterType(0), isRegisterType(1))) 624 .lower(); 625 626 627 getActionDefinitionsBuilder(G_CONSTANT) 628 .legalFor({S1, S32, S64, S16, GlobalPtr, 629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 630 .legalIf(isPointer(0)) 631 .clampScalar(0, S32, S64) 632 .widenScalarToNextPow2(0); 633 634 getActionDefinitionsBuilder(G_FCONSTANT) 635 .legalFor({S32, S64, S16}) 636 .clampScalar(0, S16, S64); 637 638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 639 .legalIf(isRegisterType(0)) 640 // s1 and s16 are special cases because they have legal operations on 641 // them, but don't really occupy registers in the normal way. 642 .legalFor({S1, S16}) 643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 644 .clampScalarOrElt(0, S32, MaxScalar) 645 .widenScalarToNextPow2(0, 32) 646 .clampMaxNumElements(0, S32, 16); 647 648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 649 650 // If the amount is divergent, we have to do a wave reduction to get the 651 // maximum value, so this is expanded during RegBankSelect. 
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 653 .legalFor({{PrivatePtr, S32}}); 654 655 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 656 .customIf(typeIsNot(0, PrivatePtr)); 657 658 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 659 660 auto &FPOpActions = getActionDefinitionsBuilder( 661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 662 .legalFor({S32, S64}); 663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 664 .customFor({S32, S64}); 665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 666 .customFor({S32, S64}); 667 668 if (ST.has16BitInsts()) { 669 if (ST.hasVOP3PInsts()) 670 FPOpActions.legalFor({S16, V2S16}); 671 else 672 FPOpActions.legalFor({S16}); 673 674 TrigActions.customFor({S16}); 675 FDIVActions.customFor({S16}); 676 } 677 678 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 680 681 if (ST.hasVOP3PInsts()) { 682 MinNumMaxNum.customFor(FPTypesPK16) 683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 684 .clampMaxNumElements(0, S16, 2) 685 .clampScalar(0, S16, S64) 686 .scalarize(0); 687 } else if (ST.has16BitInsts()) { 688 MinNumMaxNum.customFor(FPTypes16) 689 .clampScalar(0, S16, S64) 690 .scalarize(0); 691 } else { 692 MinNumMaxNum.customFor(FPTypesBase) 693 .clampScalar(0, S32, S64) 694 .scalarize(0); 695 } 696 697 if (ST.hasVOP3PInsts()) 698 FPOpActions.clampMaxNumElements(0, S16, 2); 699 700 FPOpActions 701 .scalarize(0) 702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 703 704 TrigActions 705 .scalarize(0) 706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 707 708 FDIVActions 709 .scalarize(0) 710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 711 712 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 713 .legalFor(FPTypesPK16) 714 .clampMaxNumElements(0, S16, 2) 715 .scalarize(0) 716 .clampScalar(0, S16, S64); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 720 .legalFor({S32, S64, S16}) 721 .scalarize(0) 722 .clampScalar(0, S16, S64); 723 } else { 724 getActionDefinitionsBuilder(G_FSQRT) 725 .legalFor({S32, S64}) 726 .scalarize(0) 727 .clampScalar(0, S32, S64); 728 729 if (ST.hasFractBug()) { 730 getActionDefinitionsBuilder(G_FFLOOR) 731 .customFor({S64}) 732 .legalFor({S32, S64}) 733 .scalarize(0) 734 .clampScalar(0, S32, S64); 735 } else { 736 getActionDefinitionsBuilder(G_FFLOOR) 737 .legalFor({S32, S64}) 738 .scalarize(0) 739 .clampScalar(0, S32, S64); 740 } 741 } 742 743 getActionDefinitionsBuilder(G_FPTRUNC) 744 .legalFor({{S32, S64}, {S16, S32}}) 745 .scalarize(0) 746 .lower(); 747 748 getActionDefinitionsBuilder(G_FPEXT) 749 .legalFor({{S64, S32}, {S32, S16}}) 750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 751 .scalarize(0); 752 753 getActionDefinitionsBuilder(G_FSUB) 754 // Use actual fsub instruction 755 .legalFor({S32}) 756 // Must use fadd + fneg 757 .lowerFor({S64, S16, V2S16}) 758 .scalarize(0) 759 .clampScalar(0, S32, S64); 760 761 // Whether this is legal depends on the floating point mode for the function. 
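  // The mad/mac instructions flush denormals, so the custom action only keeps
  // G_FMAD when the function's FP mode already flushes them (see
  // legalizeFMad); otherwise it is lowered to fmul + fadd.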
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 763 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 764 FMad.customFor({S32, S16}); 765 else if (ST.hasMadMacF32Insts()) 766 FMad.customFor({S32}); 767 else if (ST.hasMadF16()) 768 FMad.customFor({S16}); 769 FMad.scalarize(0) 770 .lower(); 771 772 auto &FRem = getActionDefinitionsBuilder(G_FREM); 773 if (ST.has16BitInsts()) { 774 FRem.customFor({S16, S32, S64}); 775 } else { 776 FRem.minScalar(0, S32) 777 .customFor({S32, S64}); 778 } 779 FRem.scalarize(0); 780 781 // TODO: Do we need to clamp maximum bitwidth? 782 getActionDefinitionsBuilder(G_TRUNC) 783 .legalIf(isScalar(0)) 784 .legalFor({{V2S16, V2S32}}) 785 .clampMaxNumElements(0, S16, 2) 786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 787 // situations (like an invalid implicit use), we don't want to infinite loop 788 // in the legalizer. 789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 790 .alwaysLegal(); 791 792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 794 {S32, S1}, {S64, S1}, {S16, S1}}) 795 .scalarize(0) 796 .clampScalar(0, S32, S64) 797 .widenScalarToNextPow2(1, 32); 798 799 // TODO: Split s1->s64 during regbankselect for VALU. 800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 802 .lowerFor({{S32, S64}}) 803 .lowerIf(typeIs(1, S1)) 804 .customFor({{S64, S64}}); 805 if (ST.has16BitInsts()) 806 IToFP.legalFor({{S16, S16}}); 807 IToFP.clampScalar(1, S32, S64) 808 .minScalar(0, S32) 809 .scalarize(0) 810 .widenScalarToNextPow2(1); 811 812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 814 .customFor({{S64, S64}}) 815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 816 if (ST.has16BitInsts()) 817 FPToI.legalFor({{S16, S16}}); 818 else 819 FPToI.minScalar(1, S32); 820 821 FPToI.minScalar(0, S32) 822 .widenScalarToNextPow2(0, 32) 823 .scalarize(0) 824 .lower(); 825 826 // Lower roundeven into G_FRINT 827 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 828 .scalarize(0) 829 .lower(); 830 831 if (ST.has16BitInsts()) { 832 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 833 .legalFor({S16, S32, S64}) 834 .clampScalar(0, S16, S64) 835 .scalarize(0); 836 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 837 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 838 .legalFor({S32, S64}) 839 .clampScalar(0, S32, S64) 840 .scalarize(0); 841 } else { 842 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 843 .legalFor({S32}) 844 .customFor({S64}) 845 .clampScalar(0, S32, S64) 846 .scalarize(0); 847 } 848 849 getActionDefinitionsBuilder(G_PTR_ADD) 850 .legalIf(all(isPointer(0), sameSize(0, 1))) 851 .scalarize(0) 852 .scalarSameSizeAs(1, 0); 853 854 getActionDefinitionsBuilder(G_PTRMASK) 855 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 856 .scalarSameSizeAs(1, 0) 857 .scalarize(0); 858 859 auto &CmpBuilder = 860 getActionDefinitionsBuilder(G_ICMP) 861 // The compare output type differs based on the register bank of the output, 862 // so make both s1 and s32 legal. 863 // 864 // Scalar compares producing output in scc will be promoted to s32, as that 865 // is the allocatable register type that will be needed for the copy from 866 // scc. 
This will be promoted during RegBankSelect, and we assume something 867 // before that won't try to use s32 result types. 868 // 869 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 870 // bank. 871 .legalForCartesianProduct( 872 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 873 .legalForCartesianProduct( 874 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 875 if (ST.has16BitInsts()) { 876 CmpBuilder.legalFor({{S1, S16}}); 877 } 878 879 CmpBuilder 880 .widenScalarToNextPow2(1) 881 .clampScalar(1, S32, S64) 882 .scalarize(0) 883 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 884 885 getActionDefinitionsBuilder(G_FCMP) 886 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 887 .widenScalarToNextPow2(1) 888 .clampScalar(1, S32, S64) 889 .scalarize(0); 890 891 // FIXME: fpow has a selection pattern that should move to custom lowering. 892 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 893 if (ST.has16BitInsts()) 894 Exp2Ops.legalFor({S32, S16}); 895 else 896 Exp2Ops.legalFor({S32}); 897 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 898 Exp2Ops.scalarize(0); 899 900 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 901 if (ST.has16BitInsts()) 902 ExpOps.customFor({{S32}, {S16}}); 903 else 904 ExpOps.customFor({S32}); 905 ExpOps.clampScalar(0, MinScalarFPTy, S32) 906 .scalarize(0); 907 908 getActionDefinitionsBuilder(G_FPOWI) 909 .clampScalar(0, MinScalarFPTy, S32) 910 .lower(); 911 912 // The 64-bit versions produce 32-bit results, but only on the SALU. 913 getActionDefinitionsBuilder(G_CTPOP) 914 .legalFor({{S32, S32}, {S32, S64}}) 915 .clampScalar(0, S32, S32) 916 .clampScalar(1, S32, S64) 917 .scalarize(0) 918 .widenScalarToNextPow2(0, 32) 919 .widenScalarToNextPow2(1, 32); 920 921 // The hardware instructions return a different result on 0 than the generic 922 // instructions expect. The hardware produces -1, but these produce the 923 // bitwidth. 924 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 925 .scalarize(0) 926 .clampScalar(0, S32, S32) 927 .clampScalar(1, S32, S64) 928 .widenScalarToNextPow2(0, 32) 929 .widenScalarToNextPow2(1, 32) 930 .lower(); 931 932 // The 64-bit versions produce 32-bit results, but only on the SALU. 933 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 934 .legalFor({{S32, S32}, {S32, S64}}) 935 .clampScalar(0, S32, S32) 936 .clampScalar(1, S32, S64) 937 .scalarize(0) 938 .widenScalarToNextPow2(0, 32) 939 .widenScalarToNextPow2(1, 32); 940 941 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 942 // RegBankSelect. 943 getActionDefinitionsBuilder(G_BITREVERSE) 944 .legalFor({S32, S64}) 945 .clampScalar(0, S32, S64) 946 .scalarize(0) 947 .widenScalarToNextPow2(0); 948 949 if (ST.has16BitInsts()) { 950 getActionDefinitionsBuilder(G_BSWAP) 951 .legalFor({S16, S32, V2S16}) 952 .clampMaxNumElements(0, S16, 2) 953 // FIXME: Fixing non-power-of-2 before clamp is workaround for 954 // narrowScalar limitation. 
955 .widenScalarToNextPow2(0) 956 .clampScalar(0, S16, S32) 957 .scalarize(0); 958 959 if (ST.hasVOP3PInsts()) { 960 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 961 .legalFor({S32, S16, V2S16}) 962 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 963 .clampMaxNumElements(0, S16, 2) 964 .minScalar(0, S16) 965 .widenScalarToNextPow2(0) 966 .scalarize(0) 967 .lower(); 968 } else { 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 970 .legalFor({S32, S16}) 971 .widenScalarToNextPow2(0) 972 .minScalar(0, S16) 973 .scalarize(0) 974 .lower(); 975 } 976 } else { 977 // TODO: Should have same legality without v_perm_b32 978 getActionDefinitionsBuilder(G_BSWAP) 979 .legalFor({S32}) 980 .lowerIf(scalarNarrowerThan(0, 32)) 981 // FIXME: Fixing non-power-of-2 before clamp is workaround for 982 // narrowScalar limitation. 983 .widenScalarToNextPow2(0) 984 .maxScalar(0, S32) 985 .scalarize(0) 986 .lower(); 987 988 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 989 .legalFor({S32}) 990 .minScalar(0, S32) 991 .widenScalarToNextPow2(0) 992 .scalarize(0) 993 .lower(); 994 } 995 996 getActionDefinitionsBuilder(G_INTTOPTR) 997 // List the common cases 998 .legalForCartesianProduct(AddrSpaces64, {S64}) 999 .legalForCartesianProduct(AddrSpaces32, {S32}) 1000 .scalarize(0) 1001 // Accept any address space as long as the size matches 1002 .legalIf(sameSize(0, 1)) 1003 .widenScalarIf(smallerThan(1, 0), 1004 [](const LegalityQuery &Query) { 1005 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1006 }) 1007 .narrowScalarIf(largerThan(1, 0), 1008 [](const LegalityQuery &Query) { 1009 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1010 }); 1011 1012 getActionDefinitionsBuilder(G_PTRTOINT) 1013 // List the common cases 1014 .legalForCartesianProduct(AddrSpaces64, {S64}) 1015 .legalForCartesianProduct(AddrSpaces32, {S32}) 1016 .scalarize(0) 1017 // Accept any address space as long as the size matches 1018 .legalIf(sameSize(0, 1)) 1019 .widenScalarIf(smallerThan(0, 1), 1020 [](const LegalityQuery &Query) { 1021 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1022 }) 1023 .narrowScalarIf( 1024 largerThan(0, 1), 1025 [](const LegalityQuery &Query) { 1026 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1027 }); 1028 1029 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1030 .scalarize(0) 1031 .custom(); 1032 1033 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1034 bool IsLoad) -> bool { 1035 const LLT DstTy = Query.Types[0]; 1036 1037 // Split vector extloads. 1038 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1039 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1040 1041 if (MemSize < DstTy.getSizeInBits()) 1042 MemSize = std::max(MemSize, AlignBits); 1043 1044 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1045 return true; 1046 1047 const LLT PtrTy = Query.Types[1]; 1048 unsigned AS = PtrTy.getAddressSpace(); 1049 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1050 return true; 1051 1052 // Catch weird sized loads that don't evenly divide into the access sizes 1053 // TODO: May be able to widen depending on alignment etc. 1054 unsigned NumRegs = (MemSize + 31) / 32; 1055 if (NumRegs == 3) { 1056 if (!ST.hasDwordx3LoadStores()) 1057 return true; 1058 } else { 1059 // If the alignment allows, these should have been widened. 
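      // e.g. a 160-bit (5-dword) access has no single instruction and must be
      // split.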
1060 if (!isPowerOf2_32(NumRegs)) 1061 return true; 1062 } 1063 1064 if (AlignBits < MemSize) { 1065 const SITargetLowering *TLI = ST.getTargetLowering(); 1066 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1067 Align(AlignBits / 8)); 1068 } 1069 1070 return false; 1071 }; 1072 1073 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1074 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1075 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1076 1077 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1078 // LDS 1079 // TODO: Unsupported flat for SI. 1080 1081 for (unsigned Op : {G_LOAD, G_STORE}) { 1082 const bool IsStore = Op == G_STORE; 1083 1084 auto &Actions = getActionDefinitionsBuilder(Op); 1085 // Explicitly list some common cases. 1086 // TODO: Does this help compile time at all? 1087 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1088 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1089 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1090 {S64, GlobalPtr, 64, GlobalAlign32}, 1091 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1092 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1093 {S32, GlobalPtr, 8, GlobalAlign8}, 1094 {S32, GlobalPtr, 16, GlobalAlign16}, 1095 1096 {S32, LocalPtr, 32, 32}, 1097 {S64, LocalPtr, 64, 32}, 1098 {V2S32, LocalPtr, 64, 32}, 1099 {S32, LocalPtr, 8, 8}, 1100 {S32, LocalPtr, 16, 16}, 1101 {V2S16, LocalPtr, 32, 32}, 1102 1103 {S32, PrivatePtr, 32, 32}, 1104 {S32, PrivatePtr, 8, 8}, 1105 {S32, PrivatePtr, 16, 16}, 1106 {V2S16, PrivatePtr, 32, 32}, 1107 1108 {S32, ConstantPtr, 32, GlobalAlign32}, 1109 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1110 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1111 {S64, ConstantPtr, 64, GlobalAlign32}, 1112 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1113 Actions.legalIf( 1114 [=](const LegalityQuery &Query) -> bool { 1115 return isLoadStoreLegal(ST, Query, Op); 1116 }); 1117 1118 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1119 // 64-bits. 1120 // 1121 // TODO: Should generalize bitcast action into coerce, which will also cover 1122 // inserting addrspacecasts. 1123 Actions.customIf(typeIs(1, Constant32Ptr)); 1124 1125 // Turn any illegal element vectors into something easier to deal 1126 // with. These will ultimately produce 32-bit scalar shifts to extract the 1127 // parts anyway. 1128 // 1129 // For odd 16-bit element vectors, prefer to split those into pieces with 1130 // 16-bit vector parts. 1131 Actions.bitcastIf( 1132 [=](const LegalityQuery &Query) -> bool { 1133 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1134 Query.MMODescrs[0].SizeInBits); 1135 }, bitcastToRegisterType(0)); 1136 1137 if (!IsStore) { 1138 // Widen suitably aligned loads by loading extra bytes. The standard 1139 // legalization actions can't properly express widening memory operands. 
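    // For example, an s96 global load with 128-bit alignment on a subtarget
    // without dwordx3 loads is widened to a 128-bit load, and the extra bits
    // are discarded in the custom lowering.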
1140 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1141 return shouldWidenLoad(ST, Query, G_LOAD); 1142 }); 1143 } 1144 1145 // FIXME: load/store narrowing should be moved to lower action 1146 Actions 1147 .narrowScalarIf( 1148 [=](const LegalityQuery &Query) -> bool { 1149 return !Query.Types[0].isVector() && 1150 needToSplitMemOp(Query, Op == G_LOAD); 1151 }, 1152 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1153 const LLT DstTy = Query.Types[0]; 1154 const LLT PtrTy = Query.Types[1]; 1155 1156 const unsigned DstSize = DstTy.getSizeInBits(); 1157 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1158 1159 // Split extloads. 1160 if (DstSize > MemSize) 1161 return std::make_pair(0, LLT::scalar(MemSize)); 1162 1163 if (!isPowerOf2_32(DstSize)) { 1164 // We're probably decomposing an odd sized store. Try to split 1165 // to the widest type. TODO: Account for alignment. As-is it 1166 // should be OK, since the new parts will be further legalized. 1167 unsigned FloorSize = PowerOf2Floor(DstSize); 1168 return std::make_pair(0, LLT::scalar(FloorSize)); 1169 } 1170 1171 if (DstSize > 32 && (DstSize % 32 != 0)) { 1172 // FIXME: Need a way to specify non-extload of larger size if 1173 // suitably aligned. 1174 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1175 } 1176 1177 unsigned MaxSize = maxSizeForAddrSpace(ST, 1178 PtrTy.getAddressSpace(), 1179 Op == G_LOAD); 1180 if (MemSize > MaxSize) 1181 return std::make_pair(0, LLT::scalar(MaxSize)); 1182 1183 unsigned Align = Query.MMODescrs[0].AlignInBits; 1184 return std::make_pair(0, LLT::scalar(Align)); 1185 }) 1186 .fewerElementsIf( 1187 [=](const LegalityQuery &Query) -> bool { 1188 return Query.Types[0].isVector() && 1189 needToSplitMemOp(Query, Op == G_LOAD); 1190 }, 1191 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1192 const LLT DstTy = Query.Types[0]; 1193 const LLT PtrTy = Query.Types[1]; 1194 1195 LLT EltTy = DstTy.getElementType(); 1196 unsigned MaxSize = maxSizeForAddrSpace(ST, 1197 PtrTy.getAddressSpace(), 1198 Op == G_LOAD); 1199 1200 // FIXME: Handle widened to power of 2 results better. This ends 1201 // up scalarizing. 1202 // FIXME: 3 element stores scalarized on SI 1203 1204 // Split if it's too large for the address space. 1205 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1206 unsigned NumElts = DstTy.getNumElements(); 1207 unsigned EltSize = EltTy.getSizeInBits(); 1208 1209 if (MaxSize % EltSize == 0) { 1210 return std::make_pair( 1211 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1212 } 1213 1214 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1215 1216 // FIXME: Refine when odd breakdowns handled 1217 // The scalars will need to be re-legalized. 1218 if (NumPieces == 1 || NumPieces >= NumElts || 1219 NumElts % NumPieces != 0) 1220 return std::make_pair(0, EltTy); 1221 1222 return std::make_pair(0, 1223 LLT::vector(NumElts / NumPieces, EltTy)); 1224 } 1225 1226 // FIXME: We could probably handle weird extending loads better. 1227 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1228 if (DstTy.getSizeInBits() > MemSize) 1229 return std::make_pair(0, EltTy); 1230 1231 unsigned EltSize = EltTy.getSizeInBits(); 1232 unsigned DstSize = DstTy.getSizeInBits(); 1233 if (!isPowerOf2_32(DstSize)) { 1234 // We're probably decomposing an odd sized store. Try to split 1235 // to the widest type. TODO: Account for alignment. As-is it 1236 // should be OK, since the new parts will be further legalized. 
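            // e.g. a 192-bit <6 x s32> load reaching this point is first
            // split into a <4 x s32> piece.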
1237 unsigned FloorSize = PowerOf2Floor(DstSize); 1238 return std::make_pair( 1239 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1240 } 1241 1242 // Need to split because of alignment. 1243 unsigned Align = Query.MMODescrs[0].AlignInBits; 1244 if (EltSize > Align && 1245 (EltSize / Align < DstTy.getNumElements())) { 1246 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1247 } 1248 1249 // May need relegalization for the scalars. 1250 return std::make_pair(0, EltTy); 1251 }) 1252 .lowerIfMemSizeNotPow2() 1253 .minScalar(0, S32); 1254 1255 if (IsStore) 1256 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1257 1258 Actions 1259 .widenScalarToNextPow2(0) 1260 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1261 .lower(); 1262 } 1263 1264 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1265 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1266 {S32, GlobalPtr, 16, 2 * 8}, 1267 {S32, LocalPtr, 8, 8}, 1268 {S32, LocalPtr, 16, 16}, 1269 {S32, PrivatePtr, 8, 8}, 1270 {S32, PrivatePtr, 16, 16}, 1271 {S32, ConstantPtr, 8, 8}, 1272 {S32, ConstantPtr, 16, 2 * 8}}); 1273 if (ST.hasFlatAddressSpace()) { 1274 ExtLoads.legalForTypesWithMemDesc( 1275 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1276 } 1277 1278 ExtLoads.clampScalar(0, S32, S32) 1279 .widenScalarToNextPow2(0) 1280 .unsupportedIfMemSizeNotPow2() 1281 .lower(); 1282 1283 auto &Atomics = getActionDefinitionsBuilder( 1284 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1285 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1286 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1287 G_ATOMICRMW_UMIN}) 1288 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1289 {S64, GlobalPtr}, {S64, LocalPtr}, 1290 {S32, RegionPtr}, {S64, RegionPtr}}); 1291 if (ST.hasFlatAddressSpace()) { 1292 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1293 } 1294 1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1296 if (ST.hasLDSFPAtomics()) { 1297 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1298 if (ST.hasGFX90AInsts()) 1299 Atomic.legalFor({{S64, LocalPtr}}); 1300 } 1301 if (ST.hasAtomicFaddInsts()) 1302 Atomic.legalFor({{S32, GlobalPtr}}); 1303 1304 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1305 // demarshalling 1306 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1307 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1308 {S32, FlatPtr}, {S64, FlatPtr}}) 1309 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1310 {S32, RegionPtr}, {S64, RegionPtr}}); 1311 // TODO: Pointer types, any 32-bit or 64-bit vector 1312 1313 // Condition should be s32 for scalar, s1 for vector. 1314 getActionDefinitionsBuilder(G_SELECT) 1315 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1316 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1317 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1318 .clampScalar(0, S16, S64) 1319 .scalarize(1) 1320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1321 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1322 .clampMaxNumElements(0, S32, 2) 1323 .clampMaxNumElements(0, LocalPtr, 2) 1324 .clampMaxNumElements(0, PrivatePtr, 2) 1325 .scalarize(0) 1326 .widenScalarToNextPow2(0) 1327 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1328 1329 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1330 // be more flexible with the shift amount type. 
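  // (4 bits of the amount are read for 16-bit shifts, 5 for 32-bit, 6 for
  // 64-bit.)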
1331 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1332 .legalFor({{S32, S32}, {S64, S32}}); 1333 if (ST.has16BitInsts()) { 1334 if (ST.hasVOP3PInsts()) { 1335 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1336 .clampMaxNumElements(0, S16, 2); 1337 } else 1338 Shifts.legalFor({{S16, S16}}); 1339 1340 // TODO: Support 16-bit shift amounts for all types 1341 Shifts.widenScalarIf( 1342 [=](const LegalityQuery &Query) { 1343 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1344 // 32-bit amount. 1345 const LLT ValTy = Query.Types[0]; 1346 const LLT AmountTy = Query.Types[1]; 1347 return ValTy.getSizeInBits() <= 16 && 1348 AmountTy.getSizeInBits() < 16; 1349 }, changeTo(1, S16)); 1350 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1351 Shifts.clampScalar(1, S32, S32); 1352 Shifts.clampScalar(0, S16, S64); 1353 Shifts.widenScalarToNextPow2(0, 16); 1354 1355 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1356 .minScalar(0, S16) 1357 .scalarize(0) 1358 .lower(); 1359 } else { 1360 // Make sure we legalize the shift amount type first, as the general 1361 // expansion for the shifted type will produce much worse code if it hasn't 1362 // been truncated already. 1363 Shifts.clampScalar(1, S32, S32); 1364 Shifts.clampScalar(0, S32, S64); 1365 Shifts.widenScalarToNextPow2(0, 32); 1366 1367 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1368 .minScalar(0, S32) 1369 .scalarize(0) 1370 .lower(); 1371 } 1372 Shifts.scalarize(0); 1373 1374 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1375 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1376 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1377 unsigned IdxTypeIdx = 2; 1378 1379 getActionDefinitionsBuilder(Op) 1380 .customIf([=](const LegalityQuery &Query) { 1381 const LLT EltTy = Query.Types[EltTypeIdx]; 1382 const LLT VecTy = Query.Types[VecTypeIdx]; 1383 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1384 const unsigned EltSize = EltTy.getSizeInBits(); 1385 return (EltSize == 32 || EltSize == 64) && 1386 VecTy.getSizeInBits() % 32 == 0 && 1387 VecTy.getSizeInBits() <= MaxRegisterSize && 1388 IdxTy.getSizeInBits() == 32; 1389 }) 1390 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1391 bitcastToVectorElement32(VecTypeIdx)) 1392 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1393 .bitcastIf( 1394 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1395 [=](const LegalityQuery &Query) { 1396 // For > 64-bit element types, try to turn this into a 64-bit 1397 // element vector since we may be able to do better indexing 1398 // if this is scalar. If not, fall back to 32. 1399 const LLT EltTy = Query.Types[EltTypeIdx]; 1400 const LLT VecTy = Query.Types[VecTypeIdx]; 1401 const unsigned DstEltSize = EltTy.getSizeInBits(); 1402 const unsigned VecSize = VecTy.getSizeInBits(); 1403 1404 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1405 return std::make_pair( 1406 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1407 }) 1408 .clampScalar(EltTypeIdx, S32, S64) 1409 .clampScalar(VecTypeIdx, S32, S64) 1410 .clampScalar(IdxTypeIdx, S32, S32) 1411 .clampMaxNumElements(VecTypeIdx, S32, 32) 1412 // TODO: Clamp elements for 64-bit vectors? 1413 // It should only be necessary with variable indexes. 
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
1 : 0; 1504 1505 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1506 const LLT Ty = Query.Types[TypeIdx]; 1507 if (Ty.isVector()) { 1508 const LLT &EltTy = Ty.getElementType(); 1509 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1510 return true; 1511 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1512 return true; 1513 } 1514 return false; 1515 }; 1516 1517 auto &Builder = getActionDefinitionsBuilder(Op) 1518 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1519 .lowerFor({{S16, V2S16}}) 1520 .lowerIf([=](const LegalityQuery &Query) { 1521 const LLT BigTy = Query.Types[BigTyIdx]; 1522 return BigTy.getSizeInBits() == 32; 1523 }) 1524 // Try to widen to s16 first for small types. 1525 // TODO: Only do this on targets with legal s16 shifts 1526 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1527 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1528 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1529 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1530 elementTypeIs(1, S16)), 1531 changeTo(1, V2S16)) 1532 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1533 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1534 // valid. 1535 .clampScalar(LitTyIdx, S32, S512) 1536 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1537 // Break up vectors with weird elements into scalars 1538 .fewerElementsIf( 1539 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1540 scalarize(0)) 1541 .fewerElementsIf( 1542 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1543 scalarize(1)) 1544 .clampScalar(BigTyIdx, S32, MaxScalar); 1545 1546 if (Op == G_MERGE_VALUES) { 1547 Builder.widenScalarIf( 1548 // TODO: Use 16-bit shifts if legal for 8-bit values? 1549 [=](const LegalityQuery &Query) { 1550 const LLT Ty = Query.Types[LitTyIdx]; 1551 return Ty.getSizeInBits() < 32; 1552 }, 1553 changeTo(LitTyIdx, S32)); 1554 } 1555 1556 Builder.widenScalarIf( 1557 [=](const LegalityQuery &Query) { 1558 const LLT Ty = Query.Types[BigTyIdx]; 1559 return !isPowerOf2_32(Ty.getSizeInBits()) && 1560 Ty.getSizeInBits() % 16 != 0; 1561 }, 1562 [=](const LegalityQuery &Query) { 1563 // Pick the next power of 2, or a multiple of 64 over 128. 1564 // Whichever is smaller. 1565 const LLT &Ty = Query.Types[BigTyIdx]; 1566 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1567 if (NewSizeInBits >= 256) { 1568 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1569 if (RoundedTo < NewSizeInBits) 1570 NewSizeInBits = RoundedTo; 1571 } 1572 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1573 }) 1574 // Any vectors left are the wrong size. Scalarize them. 1575 .scalarize(0) 1576 .scalarize(1); 1577 } 1578 1579 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1580 // RegBankSelect. 1581 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1582 .legalFor({{S32}, {S64}}); 1583 1584 if (ST.hasVOP3PInsts()) { 1585 SextInReg.lowerFor({{V2S16}}) 1586 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1587 // get more vector shift opportunities, since we'll get those when 1588 // expanded. 1589 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1590 } else if (ST.has16BitInsts()) { 1591 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1592 } else { 1593 // Prefer to promote to s32 before lowering if we don't have 16-bit 1594 // shifts. 
This avoid a lot of intermediate truncate and extend operations. 1595 SextInReg.lowerFor({{S32}, {S64}}); 1596 } 1597 1598 SextInReg 1599 .scalarize(0) 1600 .clampScalar(0, S32, S64) 1601 .lower(); 1602 1603 // TODO: Only Try to form v2s16 with legal packed instructions. 1604 getActionDefinitionsBuilder(G_FSHR) 1605 .legalFor({{S32, S32}}) 1606 .lowerFor({{V2S16, V2S16}}) 1607 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1608 .scalarize(0) 1609 .lower(); 1610 1611 if (ST.hasVOP3PInsts()) { 1612 getActionDefinitionsBuilder(G_FSHL) 1613 .lowerFor({{V2S16, V2S16}}) 1614 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1615 .scalarize(0) 1616 .lower(); 1617 } else { 1618 getActionDefinitionsBuilder(G_FSHL) 1619 .scalarize(0) 1620 .lower(); 1621 } 1622 1623 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1624 .legalFor({S64}); 1625 1626 getActionDefinitionsBuilder(G_FENCE) 1627 .alwaysLegal(); 1628 1629 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1630 .scalarize(0) 1631 .minScalar(0, S32) 1632 .lower(); 1633 1634 getActionDefinitionsBuilder({ 1635 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1636 G_FCOPYSIGN, 1637 1638 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1639 G_ATOMICRMW_NAND, 1640 G_ATOMICRMW_FSUB, 1641 G_READ_REGISTER, 1642 G_WRITE_REGISTER, 1643 1644 G_SADDO, G_SSUBO, 1645 1646 // TODO: Implement 1647 G_FMINIMUM, G_FMAXIMUM}).lower(); 1648 1649 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1650 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1651 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1652 .unsupported(); 1653 1654 computeTables(); 1655 verify(*ST.getInstrInfo()); 1656 } 1657 1658 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1659 MachineInstr &MI) const { 1660 MachineIRBuilder &B = Helper.MIRBuilder; 1661 MachineRegisterInfo &MRI = *B.getMRI(); 1662 1663 switch (MI.getOpcode()) { 1664 case TargetOpcode::G_ADDRSPACE_CAST: 1665 return legalizeAddrSpaceCast(MI, MRI, B); 1666 case TargetOpcode::G_FRINT: 1667 return legalizeFrint(MI, MRI, B); 1668 case TargetOpcode::G_FCEIL: 1669 return legalizeFceil(MI, MRI, B); 1670 case TargetOpcode::G_FREM: 1671 return legalizeFrem(MI, MRI, B); 1672 case TargetOpcode::G_INTRINSIC_TRUNC: 1673 return legalizeIntrinsicTrunc(MI, MRI, B); 1674 case TargetOpcode::G_SITOFP: 1675 return legalizeITOFP(MI, MRI, B, true); 1676 case TargetOpcode::G_UITOFP: 1677 return legalizeITOFP(MI, MRI, B, false); 1678 case TargetOpcode::G_FPTOSI: 1679 return legalizeFPTOI(MI, MRI, B, true); 1680 case TargetOpcode::G_FPTOUI: 1681 return legalizeFPTOI(MI, MRI, B, false); 1682 case TargetOpcode::G_FMINNUM: 1683 case TargetOpcode::G_FMAXNUM: 1684 case TargetOpcode::G_FMINNUM_IEEE: 1685 case TargetOpcode::G_FMAXNUM_IEEE: 1686 return legalizeMinNumMaxNum(Helper, MI); 1687 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1688 return legalizeExtractVectorElt(MI, MRI, B); 1689 case TargetOpcode::G_INSERT_VECTOR_ELT: 1690 return legalizeInsertVectorElt(MI, MRI, B); 1691 case TargetOpcode::G_SHUFFLE_VECTOR: 1692 return legalizeShuffleVector(MI, MRI, B); 1693 case TargetOpcode::G_FSIN: 1694 case TargetOpcode::G_FCOS: 1695 return legalizeSinCos(MI, MRI, B); 1696 case TargetOpcode::G_GLOBAL_VALUE: 1697 return legalizeGlobalValue(MI, MRI, B); 1698 case TargetOpcode::G_LOAD: 1699 return legalizeLoad(Helper, MI); 1700 case TargetOpcode::G_FMAD: 1701 return legalizeFMad(MI, MRI, B); 1702 case TargetOpcode::G_FDIV: 1703 return legalizeFDIV(MI, MRI, B); 1704 case TargetOpcode::G_UDIV: 1705 case TargetOpcode::G_UREM: 1706 return 
legalizeUDIV_UREM(MI, MRI, B); 1707 case TargetOpcode::G_SDIV: 1708 case TargetOpcode::G_SREM: 1709 return legalizeSDIV_SREM(MI, MRI, B); 1710 case TargetOpcode::G_ATOMIC_CMPXCHG: 1711 return legalizeAtomicCmpXChg(MI, MRI, B); 1712 case TargetOpcode::G_FLOG: 1713 return legalizeFlog(MI, B, numbers::ln2f); 1714 case TargetOpcode::G_FLOG10: 1715 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1716 case TargetOpcode::G_FEXP: 1717 return legalizeFExp(MI, B); 1718 case TargetOpcode::G_FPOW: 1719 return legalizeFPow(MI, B); 1720 case TargetOpcode::G_FFLOOR: 1721 return legalizeFFloor(MI, MRI, B); 1722 case TargetOpcode::G_BUILD_VECTOR: 1723 return legalizeBuildVector(MI, MRI, B); 1724 default: 1725 return false; 1726 } 1727 1728 llvm_unreachable("expected switch to return"); 1729 } 1730 1731 Register AMDGPULegalizerInfo::getSegmentAperture( 1732 unsigned AS, 1733 MachineRegisterInfo &MRI, 1734 MachineIRBuilder &B) const { 1735 MachineFunction &MF = B.getMF(); 1736 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1737 const LLT S32 = LLT::scalar(32); 1738 1739 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1740 1741 if (ST.hasApertureRegs()) { 1742 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1743 // getreg. 1744 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1745 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1746 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1747 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1748 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1749 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1750 unsigned Encoding = 1751 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1752 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1753 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1754 1755 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1756 1757 B.buildInstr(AMDGPU::S_GETREG_B32) 1758 .addDef(GetReg) 1759 .addImm(Encoding); 1760 MRI.setType(GetReg, S32); 1761 1762 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1763 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1764 } 1765 1766 Register QueuePtr = MRI.createGenericVirtualRegister( 1767 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1768 1769 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1770 return Register(); 1771 1772 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1773 // private_segment_aperture_base_hi. 1774 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1775 1776 // TODO: can we be smarter about machine pointer info? 
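  // Roughly speaking (illustrative sketch, not verbatim MIR), the code below
  // emits:
  //   %addr:_(p4) = G_PTR_ADD %queue_ptr, StructOffset
  //   %aperture:_(s32) = G_LOAD %addr :: (dereferenceable invariant load 4)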
1777 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1778 MachineMemOperand *MMO = MF.getMachineMemOperand( 1779 PtrInfo, 1780 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1781 MachineMemOperand::MOInvariant, 1782 4, commonAlignment(Align(64), StructOffset)); 1783 1784 Register LoadAddr; 1785 1786 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1787 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1788 } 1789 1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1791 MachineInstr &MI, MachineRegisterInfo &MRI, 1792 MachineIRBuilder &B) const { 1793 MachineFunction &MF = B.getMF(); 1794 1795 const LLT S32 = LLT::scalar(32); 1796 Register Dst = MI.getOperand(0).getReg(); 1797 Register Src = MI.getOperand(1).getReg(); 1798 1799 LLT DstTy = MRI.getType(Dst); 1800 LLT SrcTy = MRI.getType(Src); 1801 unsigned DestAS = DstTy.getAddressSpace(); 1802 unsigned SrcAS = SrcTy.getAddressSpace(); 1803 1804 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1805 // vector element. 1806 assert(!DstTy.isVector()); 1807 1808 const AMDGPUTargetMachine &TM 1809 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1810 1811 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1812 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1813 return true; 1814 } 1815 1816 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1817 // Truncate. 1818 B.buildExtract(Dst, Src, 0); 1819 MI.eraseFromParent(); 1820 return true; 1821 } 1822 1823 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1824 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1825 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1826 1827 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1828 // another. Merge operands are required to be the same type, but creating an 1829 // extra ptrtoint would be kind of pointless. 1830 auto HighAddr = B.buildConstant( 1831 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1832 B.buildMerge(Dst, {Src, HighAddr}); 1833 MI.eraseFromParent(); 1834 return true; 1835 } 1836 1837 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1838 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1839 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1840 unsigned NullVal = TM.getNullPointerValue(DestAS); 1841 1842 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1843 auto FlatNull = B.buildConstant(SrcTy, 0); 1844 1845 // Extract low 32-bits of the pointer. 1846 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1847 1848 auto CmpRes = 1849 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1850 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1851 1852 MI.eraseFromParent(); 1853 return true; 1854 } 1855 1856 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1857 return false; 1858 1859 if (!ST.hasFlatAddressSpace()) 1860 return false; 1861 1862 auto SegmentNull = 1863 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1864 auto FlatNull = 1865 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1866 1867 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1868 if (!ApertureReg.isValid()) 1869 return false; 1870 1871 auto CmpRes = 1872 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1873 1874 // Coerce the type of the low half of the result so we can use merge_values. 
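  // The segment-to-flat expansion being built here is roughly (illustrative
  // only):
  //   %lo:_(s32) = G_PTRTOINT %src
  //   %flat:_(p0) = G_MERGE_VALUES %lo, %aperture
  //   %dst = G_SELECT (%src != segment_null), %flat, flat_null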
1875 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1876 1877 // TODO: Should we allow mismatched types but matching sizes in merges to 1878 // avoid the ptrtoint? 1879 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1880 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1881 1882 MI.eraseFromParent(); 1883 return true; 1884 } 1885 1886 bool AMDGPULegalizerInfo::legalizeFrint( 1887 MachineInstr &MI, MachineRegisterInfo &MRI, 1888 MachineIRBuilder &B) const { 1889 Register Src = MI.getOperand(1).getReg(); 1890 LLT Ty = MRI.getType(Src); 1891 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1892 1893 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1894 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1895 1896 auto C1 = B.buildFConstant(Ty, C1Val); 1897 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1898 1899 // TODO: Should this propagate fast-math-flags? 1900 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1901 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1902 1903 auto C2 = B.buildFConstant(Ty, C2Val); 1904 auto Fabs = B.buildFAbs(Ty, Src); 1905 1906 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1907 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1908 MI.eraseFromParent(); 1909 return true; 1910 } 1911 1912 bool AMDGPULegalizerInfo::legalizeFceil( 1913 MachineInstr &MI, MachineRegisterInfo &MRI, 1914 MachineIRBuilder &B) const { 1915 1916 const LLT S1 = LLT::scalar(1); 1917 const LLT S64 = LLT::scalar(64); 1918 1919 Register Src = MI.getOperand(1).getReg(); 1920 assert(MRI.getType(Src) == S64); 1921 1922 // result = trunc(src) 1923 // if (src > 0.0 && src != result) 1924 // result += 1.0 1925 1926 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1927 1928 const auto Zero = B.buildFConstant(S64, 0.0); 1929 const auto One = B.buildFConstant(S64, 1.0); 1930 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1931 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1932 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1933 auto Add = B.buildSelect(S64, And, One, Zero); 1934 1935 // TODO: Should this propagate fast-math-flags? 
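  // For example: ceil(1.5) -> trunc is 1.0, the select yields 1.0, and the
  // add below gives 2.0; ceil(-1.5) -> trunc is -1.0, the select yields 0.0,
  // and the result stays -1.0.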
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2034 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2035 B.buildUITOFP(S64, Unmerge.getReg(1)); 2036 2037 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2038 2039 auto ThirtyTwo = B.buildConstant(S32, 32); 2040 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2041 .addUse(CvtHi.getReg(0)) 2042 .addUse(ThirtyTwo.getReg(0)); 2043 2044 // TODO: Should this propagate fast-math-flags? 2045 B.buildFAdd(Dst, LdExp, CvtLo); 2046 MI.eraseFromParent(); 2047 return true; 2048 } 2049 2050 // TODO: Copied from DAG implementation. Verify logic and document how this 2051 // actually works. 2052 bool AMDGPULegalizerInfo::legalizeFPTOI( 2053 MachineInstr &MI, MachineRegisterInfo &MRI, 2054 MachineIRBuilder &B, bool Signed) const { 2055 2056 Register Dst = MI.getOperand(0).getReg(); 2057 Register Src = MI.getOperand(1).getReg(); 2058 2059 const LLT S64 = LLT::scalar(64); 2060 const LLT S32 = LLT::scalar(32); 2061 2062 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2063 2064 unsigned Flags = MI.getFlags(); 2065 2066 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2067 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2068 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2069 2070 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2071 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2072 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2073 2074 auto Hi = Signed ? 2075 B.buildFPTOSI(S32, FloorMul) : 2076 B.buildFPTOUI(S32, FloorMul); 2077 auto Lo = B.buildFPTOUI(S32, Fma); 2078 2079 B.buildMerge(Dst, { Lo, Hi }); 2080 MI.eraseFromParent(); 2081 2082 return true; 2083 } 2084 2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2086 MachineInstr &MI) const { 2087 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2088 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2089 2090 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2091 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2092 2093 // With ieee_mode disabled, the instructions have the correct behavior 2094 // already for G_FMINNUM/G_FMAXNUM 2095 if (!MFI->getMode().IEEE) 2096 return !IsIEEEOp; 2097 2098 if (IsIEEEOp) 2099 return true; 2100 2101 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2105 MachineInstr &MI, MachineRegisterInfo &MRI, 2106 MachineIRBuilder &B) const { 2107 // TODO: Should move some of this into LegalizerHelper. 2108 2109 // TODO: Promote dynamic indexing of s16 to s32 2110 2111 // FIXME: Artifact combiner probably should have replaced the truncated 2112 // constant before this, so we shouldn't need 2113 // getConstantVRegValWithLookThrough. 2114 Optional<ValueAndVReg> MaybeIdxVal = 2115 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2116 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2117 return true; 2118 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2119 2120 Register Dst = MI.getOperand(0).getReg(); 2121 Register Vec = MI.getOperand(1).getReg(); 2122 2123 LLT VecTy = MRI.getType(Vec); 2124 LLT EltTy = VecTy.getElementType(); 2125 assert(EltTy == MRI.getType(Dst)); 2126 2127 if (IdxVal < VecTy.getNumElements()) 2128 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2129 else 2130 B.buildUndef(Dst); 2131 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2137 MachineInstr &MI, MachineRegisterInfo &MRI, 2138 MachineIRBuilder &B) const { 2139 // TODO: Should move some of this into LegalizerHelper. 2140 2141 // TODO: Promote dynamic indexing of s16 to s32 2142 2143 // FIXME: Artifact combiner probably should have replaced the truncated 2144 // constant before this, so we shouldn't need 2145 // getConstantVRegValWithLookThrough. 2146 Optional<ValueAndVReg> MaybeIdxVal = 2147 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2148 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2149 return true; 2150 2151 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2152 Register Dst = MI.getOperand(0).getReg(); 2153 Register Vec = MI.getOperand(1).getReg(); 2154 Register Ins = MI.getOperand(2).getReg(); 2155 2156 LLT VecTy = MRI.getType(Vec); 2157 LLT EltTy = VecTy.getElementType(); 2158 assert(EltTy == MRI.getType(Ins)); 2159 2160 if (IdxVal < VecTy.getNumElements()) 2161 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2162 else 2163 B.buildUndef(Dst); 2164 2165 MI.eraseFromParent(); 2166 return true; 2167 } 2168 2169 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2170 MachineInstr &MI, MachineRegisterInfo &MRI, 2171 MachineIRBuilder &B) const { 2172 const LLT V2S16 = LLT::vector(2, 16); 2173 2174 Register Dst = MI.getOperand(0).getReg(); 2175 Register Src0 = MI.getOperand(1).getReg(); 2176 LLT DstTy = MRI.getType(Dst); 2177 LLT SrcTy = MRI.getType(Src0); 2178 2179 if (SrcTy == V2S16 && DstTy == V2S16 && 2180 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2181 return true; 2182 2183 MachineIRBuilder HelperBuilder(MI); 2184 GISelObserverWrapper DummyObserver; 2185 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2186 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2187 } 2188 2189 bool AMDGPULegalizerInfo::legalizeSinCos( 2190 MachineInstr &MI, MachineRegisterInfo &MRI, 2191 MachineIRBuilder &B) const { 2192 2193 Register DstReg = MI.getOperand(0).getReg(); 2194 Register SrcReg = MI.getOperand(1).getReg(); 2195 LLT Ty = MRI.getType(DstReg); 2196 unsigned Flags = MI.getFlags(); 2197 2198 Register TrigVal; 2199 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2200 if (ST.hasTrigReducedRange()) { 2201 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2202 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2203 .addUse(MulVal.getReg(0)) 2204 .setMIFlags(Flags).getReg(0); 2205 } else 2206 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2207 2208 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2209 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2210 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2211 .addUse(TrigVal) 2212 .setMIFlags(Flags); 2213 MI.eraseFromParent(); 2214 return true; 2215 } 2216 2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2218 MachineIRBuilder &B, 2219 const GlobalValue *GV, 2220 int64_t Offset, 2221 unsigned GAFlags) const { 2222 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2223 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2224 // to the following code sequence: 2225 // 2226 // For constant address space: 2227 // s_getpc_b64 s[0:1] 2228 // s_add_u32 s0, s0, $symbol 2229 // s_addc_u32 s1, s1, 0 2230 // 2231 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2232 // a fixup or relocation is emitted to replace $symbol with a literal 2233 // constant, which is a pc-relative offset from the encoding of the $symbol 2234 // operand to the global variable. 2235 // 2236 // For global address space: 2237 // s_getpc_b64 s[0:1] 2238 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2239 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2240 // 2241 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2242 // fixups or relocations are emitted to replace $symbol@*@lo and 2243 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2244 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2245 // operand to the global variable. 2246 // 2247 // What we want here is an offset from the value returned by s_getpc 2248 // (which is the address of the s_add_u32 instruction) to the global 2249 // variable, but since the encoding of $symbol starts 4 bytes after the start 2250 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2251 // small. This requires us to add 4 to the global variable offset in order to 2252 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2253 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2254 // instruction. 2255 2256 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2257 2258 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2259 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2260 2261 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2262 .addDef(PCReg); 2263 2264 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2265 if (GAFlags == SIInstrInfo::MO_NONE) 2266 MIB.addImm(0); 2267 else 2268 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2269 2270 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2271 2272 if (PtrTy.getSizeInBits() == 32) 2273 B.buildExtract(DstReg, PCReg, 0); 2274 return true; 2275 } 2276 2277 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2278 MachineInstr &MI, MachineRegisterInfo &MRI, 2279 MachineIRBuilder &B) const { 2280 Register DstReg = MI.getOperand(0).getReg(); 2281 LLT Ty = MRI.getType(DstReg); 2282 unsigned AS = Ty.getAddressSpace(); 2283 2284 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2285 MachineFunction &MF = B.getMF(); 2286 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2287 2288 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2289 if (!MFI->isModuleEntryFunction()) { 2290 const Function &Fn = MF.getFunction(); 2291 DiagnosticInfoUnsupported BadLDSDecl( 2292 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2293 DS_Warning); 2294 Fn.getContext().diagnose(BadLDSDecl); 2295 2296 // We currently don't have a way to correctly allocate LDS objects that 2297 // aren't directly associated with a kernel. We do force inlining of 2298 // functions that use local objects. However, if these dead functions are 2299 // not eliminated, we don't want a compile time error. Just emit a warning 2300 // and a trap, since there should be no callable path here. 2301 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2302 B.buildUndef(DstReg); 2303 MI.eraseFromParent(); 2304 return true; 2305 } 2306 2307 // TODO: We could emit code to handle the initialization somewhere. 2308 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2309 const SITargetLowering *TLI = ST.getTargetLowering(); 2310 if (!TLI->shouldUseLDSConstAddress(GV)) { 2311 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2312 return true; // Leave in place; 2313 } 2314 2315 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2316 Type *Ty = GV->getValueType(); 2317 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2318 // zero-sized type in other languages to declare the dynamic shared 2319 // memory which size is not known at the compile time. They will be 2320 // allocated by the runtime and placed directly after the static 2321 // allocated ones. They all share the same offset. 2322 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2323 // Adjust alignment for that dynamic shared memory array. 
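        // The dynamic array then starts right after all statically allocated
        // LDS: its address is simply the total static LDS size reported by
        // llvm.amdgcn.groupstaticsize (built below).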
2324 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2325 LLT S32 = LLT::scalar(32); 2326 auto Sz = 2327 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2328 B.buildIntToPtr(DstReg, Sz); 2329 MI.eraseFromParent(); 2330 return true; 2331 } 2332 } 2333 2334 B.buildConstant( 2335 DstReg, 2336 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2337 MI.eraseFromParent(); 2338 return true; 2339 } 2340 2341 const Function &Fn = MF.getFunction(); 2342 DiagnosticInfoUnsupported BadInit( 2343 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2344 Fn.getContext().diagnose(BadInit); 2345 return true; 2346 } 2347 2348 const SITargetLowering *TLI = ST.getTargetLowering(); 2349 2350 if (TLI->shouldEmitFixup(GV)) { 2351 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 if (TLI->shouldEmitPCReloc(GV)) { 2357 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2358 MI.eraseFromParent(); 2359 return true; 2360 } 2361 2362 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2363 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2364 2365 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2366 MachinePointerInfo::getGOT(MF), 2367 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2368 MachineMemOperand::MOInvariant, 2369 8 /*Size*/, Align(8)); 2370 2371 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2372 2373 if (Ty.getSizeInBits() == 32) { 2374 // Truncate if this is a 32-bit constant adrdess. 2375 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2376 B.buildExtract(DstReg, Load, 0); 2377 } else 2378 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2379 2380 MI.eraseFromParent(); 2381 return true; 2382 } 2383 2384 static LLT widenToNextPowerOf2(LLT Ty) { 2385 if (Ty.isVector()) 2386 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2387 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2388 } 2389 2390 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2391 MachineInstr &MI) const { 2392 MachineIRBuilder &B = Helper.MIRBuilder; 2393 MachineRegisterInfo &MRI = *B.getMRI(); 2394 GISelChangeObserver &Observer = Helper.Observer; 2395 2396 Register PtrReg = MI.getOperand(1).getReg(); 2397 LLT PtrTy = MRI.getType(PtrReg); 2398 unsigned AddrSpace = PtrTy.getAddressSpace(); 2399 2400 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2401 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2402 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2403 Observer.changingInstr(MI); 2404 MI.getOperand(1).setReg(Cast.getReg(0)); 2405 Observer.changedInstr(MI); 2406 return true; 2407 } 2408 2409 Register ValReg = MI.getOperand(0).getReg(); 2410 LLT ValTy = MRI.getType(ValReg); 2411 2412 MachineMemOperand *MMO = *MI.memoperands_begin(); 2413 const unsigned ValSize = ValTy.getSizeInBits(); 2414 const unsigned MemSize = 8 * MMO->getSize(); 2415 const Align MemAlign = MMO->getAlign(); 2416 const unsigned AlignInBits = 8 * MemAlign.value(); 2417 2418 // Widen non-power-of-2 loads to the alignment if needed 2419 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2420 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2421 2422 // This was already the correct extending load result type, so just adjust 2423 // the memory type. 
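    // For instance, an s32 result fed by a (load 3, align 4) only needs its
    // MMO widened to a 4-byte load; the instruction itself stays as-is.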
2424 if (WideMemSize == ValSize) { 2425 MachineFunction &MF = B.getMF(); 2426 2427 MachineMemOperand *WideMMO = 2428 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2429 Observer.changingInstr(MI); 2430 MI.setMemRefs(MF, {WideMMO}); 2431 Observer.changedInstr(MI); 2432 return true; 2433 } 2434 2435 // Don't bother handling edge case that should probably never be produced. 2436 if (ValSize > WideMemSize) 2437 return false; 2438 2439 LLT WideTy = widenToNextPowerOf2(ValTy); 2440 2441 Register WideLoad; 2442 if (!WideTy.isVector()) { 2443 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2444 B.buildTrunc(ValReg, WideLoad).getReg(0); 2445 } else { 2446 // Extract the subvector. 2447 2448 if (isRegisterType(ValTy)) { 2449 // If this a case where G_EXTRACT is legal, use it. 2450 // (e.g. <3 x s32> -> <4 x s32>) 2451 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2452 B.buildExtract(ValReg, WideLoad, 0); 2453 } else { 2454 // For cases where the widened type isn't a nice register value, unmerge 2455 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2456 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2457 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2458 B.setInsertPt(B.getMBB(), MI.getIterator()); 2459 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2460 } 2461 } 2462 2463 MI.eraseFromParent(); 2464 return true; 2465 } 2466 2467 return false; 2468 } 2469 2470 bool AMDGPULegalizerInfo::legalizeFMad( 2471 MachineInstr &MI, MachineRegisterInfo &MRI, 2472 MachineIRBuilder &B) const { 2473 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2474 assert(Ty.isScalar()); 2475 2476 MachineFunction &MF = B.getMF(); 2477 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2478 2479 // TODO: Always legal with future ftz flag. 2480 // FIXME: Do we need just output? 
2481 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2482 return true; 2483 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2484 return true; 2485 2486 MachineIRBuilder HelperBuilder(MI); 2487 GISelObserverWrapper DummyObserver; 2488 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2489 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2490 } 2491 2492 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2493 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2494 Register DstReg = MI.getOperand(0).getReg(); 2495 Register PtrReg = MI.getOperand(1).getReg(); 2496 Register CmpVal = MI.getOperand(2).getReg(); 2497 Register NewVal = MI.getOperand(3).getReg(); 2498 2499 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2500 "this should not have been custom lowered"); 2501 2502 LLT ValTy = MRI.getType(CmpVal); 2503 LLT VecTy = LLT::vector(2, ValTy); 2504 2505 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2506 2507 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2508 .addDef(DstReg) 2509 .addUse(PtrReg) 2510 .addUse(PackedVal) 2511 .setMemRefs(MI.memoperands()); 2512 2513 MI.eraseFromParent(); 2514 return true; 2515 } 2516 2517 bool AMDGPULegalizerInfo::legalizeFlog( 2518 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2519 Register Dst = MI.getOperand(0).getReg(); 2520 Register Src = MI.getOperand(1).getReg(); 2521 LLT Ty = B.getMRI()->getType(Dst); 2522 unsigned Flags = MI.getFlags(); 2523 2524 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2525 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2526 2527 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2528 MI.eraseFromParent(); 2529 return true; 2530 } 2531 2532 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2533 MachineIRBuilder &B) const { 2534 Register Dst = MI.getOperand(0).getReg(); 2535 Register Src = MI.getOperand(1).getReg(); 2536 unsigned Flags = MI.getFlags(); 2537 LLT Ty = B.getMRI()->getType(Dst); 2538 2539 auto K = B.buildFConstant(Ty, numbers::log2e); 2540 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2541 B.buildFExp2(Dst, Mul, Flags); 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2547 MachineIRBuilder &B) const { 2548 Register Dst = MI.getOperand(0).getReg(); 2549 Register Src0 = MI.getOperand(1).getReg(); 2550 Register Src1 = MI.getOperand(2).getReg(); 2551 unsigned Flags = MI.getFlags(); 2552 LLT Ty = B.getMRI()->getType(Dst); 2553 const LLT S16 = LLT::scalar(16); 2554 const LLT S32 = LLT::scalar(32); 2555 2556 if (Ty == S32) { 2557 auto Log = B.buildFLog2(S32, Src0, Flags); 2558 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2559 .addUse(Log.getReg(0)) 2560 .addUse(Src1) 2561 .setMIFlags(Flags); 2562 B.buildFExp2(Dst, Mul, Flags); 2563 } else if (Ty == S16) { 2564 // There's no f16 fmul_legacy, so we need to convert for it. 
2565 auto Log = B.buildFLog2(S16, Src0, Flags); 2566 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2567 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2568 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2569 .addUse(Ext0.getReg(0)) 2570 .addUse(Ext1.getReg(0)) 2571 .setMIFlags(Flags); 2572 2573 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2574 } else 2575 return false; 2576 2577 MI.eraseFromParent(); 2578 return true; 2579 } 2580 2581 // Find a source register, ignoring any possible source modifiers. 2582 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2583 Register ModSrc = OrigSrc; 2584 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2585 ModSrc = SrcFNeg->getOperand(1).getReg(); 2586 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2587 ModSrc = SrcFAbs->getOperand(1).getReg(); 2588 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2589 ModSrc = SrcFAbs->getOperand(1).getReg(); 2590 return ModSrc; 2591 } 2592 2593 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2594 MachineRegisterInfo &MRI, 2595 MachineIRBuilder &B) const { 2596 2597 const LLT S1 = LLT::scalar(1); 2598 const LLT S64 = LLT::scalar(64); 2599 Register Dst = MI.getOperand(0).getReg(); 2600 Register OrigSrc = MI.getOperand(1).getReg(); 2601 unsigned Flags = MI.getFlags(); 2602 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2603 "this should not have been custom lowered"); 2604 2605 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2606 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2607 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2608 // V_FRACT bug is: 2609 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2610 // 2611 // Convert floor(x) to (x - fract(x)) 2612 2613 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2614 .addUse(OrigSrc) 2615 .setMIFlags(Flags); 2616 2617 // Give source modifier matching some assistance before obscuring a foldable 2618 // pattern. 2619 2620 // TODO: We can avoid the neg on the fract? The input sign to fract 2621 // shouldn't matter? 2622 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2623 2624 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2625 2626 Register Min = MRI.createGenericVirtualRegister(S64); 2627 2628 // We don't need to concern ourselves with the snan handling difference, so 2629 // use the one which will directly select. 2630 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2631 if (MFI->getMode().IEEE) 2632 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2633 else 2634 B.buildFMinNum(Min, Fract, Const, Flags); 2635 2636 Register CorrectedFract = Min; 2637 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2638 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2639 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2640 } 2641 2642 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2643 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2644 2645 MI.eraseFromParent(); 2646 return true; 2647 } 2648 2649 // Turn an illegal packed v2s16 build vector into bit operations. 2650 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2651 bool AMDGPULegalizerInfo::legalizeBuildVector( 2652 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2653 Register Dst = MI.getOperand(0).getReg(); 2654 const LLT S32 = LLT::scalar(32); 2655 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2656 2657 Register Src0 = MI.getOperand(1).getReg(); 2658 Register Src1 = MI.getOperand(2).getReg(); 2659 assert(MRI.getType(Src0) == LLT::scalar(16)); 2660 2661 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2662 B.buildBitcast(Dst, Merge); 2663 2664 MI.eraseFromParent(); 2665 return true; 2666 } 2667 2668 // Check that this is a G_XOR x, -1 2669 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2670 if (MI.getOpcode() != TargetOpcode::G_XOR) 2671 return false; 2672 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2673 return ConstVal && *ConstVal == -1; 2674 } 2675 2676 // Return the use branch instruction, otherwise null if the usage is invalid. 2677 static MachineInstr * 2678 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2679 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2680 Register CondDef = MI.getOperand(0).getReg(); 2681 if (!MRI.hasOneNonDBGUse(CondDef)) 2682 return nullptr; 2683 2684 MachineBasicBlock *Parent = MI.getParent(); 2685 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2686 2687 if (isNot(MRI, *UseMI)) { 2688 Register NegatedCond = UseMI->getOperand(0).getReg(); 2689 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2690 return nullptr; 2691 2692 // We're deleting the def of this value, so we need to remove it. 2693 UseMI->eraseFromParent(); 2694 2695 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2696 Negated = true; 2697 } 2698 2699 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2700 return nullptr; 2701 2702 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2703 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2704 if (Next == Parent->end()) { 2705 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2706 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2707 return nullptr; 2708 UncondBrTarget = &*NextMBB; 2709 } else { 2710 if (Next->getOpcode() != AMDGPU::G_BR) 2711 return nullptr; 2712 Br = &*Next; 2713 UncondBrTarget = Br->getOperand(0).getMBB(); 2714 } 2715 2716 return UseMI; 2717 } 2718 2719 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2720 const ArgDescriptor *Arg, 2721 const TargetRegisterClass *ArgRC, 2722 LLT ArgTy) const { 2723 MCRegister SrcReg = Arg->getRegister(); 2724 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2725 assert(DstReg.isVirtual() && "Virtual register expected"); 2726 2727 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2728 ArgTy); 2729 if (Arg->isMasked()) { 2730 // TODO: Should we try to emit this once in the entry block? 
2731 const LLT S32 = LLT::scalar(32); 2732 const unsigned Mask = Arg->getMask(); 2733 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2734 2735 Register AndMaskSrc = LiveIn; 2736 2737 if (Shift != 0) { 2738 auto ShiftAmt = B.buildConstant(S32, Shift); 2739 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2740 } 2741 2742 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2743 } else { 2744 B.buildCopy(DstReg, LiveIn); 2745 } 2746 2747 return true; 2748 } 2749 2750 bool AMDGPULegalizerInfo::loadInputValue( 2751 Register DstReg, MachineIRBuilder &B, 2752 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2753 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2754 const ArgDescriptor *Arg; 2755 const TargetRegisterClass *ArgRC; 2756 LLT ArgTy; 2757 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2758 2759 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2760 return false; // TODO: Handle these 2761 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2762 } 2763 2764 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2765 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2766 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2767 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2768 return false; 2769 2770 MI.eraseFromParent(); 2771 return true; 2772 } 2773 2774 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2775 MachineRegisterInfo &MRI, 2776 MachineIRBuilder &B) const { 2777 Register Dst = MI.getOperand(0).getReg(); 2778 LLT DstTy = MRI.getType(Dst); 2779 LLT S16 = LLT::scalar(16); 2780 LLT S32 = LLT::scalar(32); 2781 LLT S64 = LLT::scalar(64); 2782 2783 if (DstTy == S16) 2784 return legalizeFDIV16(MI, MRI, B); 2785 if (DstTy == S32) 2786 return legalizeFDIV32(MI, MRI, B); 2787 if (DstTy == S64) 2788 return legalizeFDIV64(MI, MRI, B); 2789 2790 return false; 2791 } 2792 2793 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2794 Register DstReg, 2795 Register X, 2796 Register Y, 2797 bool IsDiv) const { 2798 const LLT S1 = LLT::scalar(1); 2799 const LLT S32 = LLT::scalar(32); 2800 2801 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2802 // algorithm used here. 2803 2804 // Initial estimate of inv(y). 2805 auto FloatY = B.buildUITOFP(S32, Y); 2806 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2807 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2808 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2809 auto Z = B.buildFPTOUI(S32, ScaledY); 2810 2811 // One round of UNR. 2812 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2813 auto NegYZ = B.buildMul(S32, NegY, Z); 2814 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2815 2816 // Quotient/remainder estimate. 2817 auto Q = B.buildUMulH(S32, X, Z); 2818 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2819 2820 // First quotient/remainder refinement. 2821 auto One = B.buildConstant(S32, 1); 2822 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2823 if (IsDiv) 2824 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2825 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2826 2827 // Second quotient/remainder refinement. 
2828 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2829 if (IsDiv) 2830 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2831 else 2832 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2833 } 2834 2835 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2836 // 2837 // Return lo, hi of result 2838 // 2839 // %cvt.lo = G_UITOFP Val.lo 2840 // %cvt.hi = G_UITOFP Val.hi 2841 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2842 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2843 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2844 // %mul2 = G_FMUL %mul1, 2**(-32) 2845 // %trunc = G_INTRINSIC_TRUNC %mul2 2846 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2847 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2848 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2849 Register Val) { 2850 const LLT S32 = LLT::scalar(32); 2851 auto Unmerge = B.buildUnmerge(S32, Val); 2852 2853 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2854 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2855 2856 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2857 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2858 2859 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2860 auto Mul1 = 2861 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2862 2863 // 2**(-32) 2864 auto Mul2 = 2865 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2866 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2867 2868 // -(2**32) 2869 auto Mad2 = B.buildFMAD(S32, Trunc, 2870 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2871 2872 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2873 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2874 2875 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2876 } 2877 2878 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2879 Register DstReg, 2880 Register Numer, 2881 Register Denom, 2882 bool IsDiv) const { 2883 const LLT S32 = LLT::scalar(32); 2884 const LLT S64 = LLT::scalar(64); 2885 const LLT S1 = LLT::scalar(1); 2886 Register RcpLo, RcpHi; 2887 2888 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2889 2890 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2891 2892 auto Zero64 = B.buildConstant(S64, 0); 2893 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2894 2895 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2896 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2897 2898 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2899 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2900 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2901 2902 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2903 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2904 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2905 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2906 2907 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2908 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2909 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2910 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2911 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2912 2913 auto Zero32 = B.buildConstant(S32, 0); 2914 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2915 auto Add2_HiC = 2916 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2917 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2918 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2919 2920 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2921 Register NumerLo = UnmergeNumer.getReg(0); 2922 Register NumerHi = 
UnmergeNumer.getReg(1); 2923 2924 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2925 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2926 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2927 Register Mul3_Lo = UnmergeMul3.getReg(0); 2928 Register Mul3_Hi = UnmergeMul3.getReg(1); 2929 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2930 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2931 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2932 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2933 2934 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2935 Register DenomLo = UnmergeDenom.getReg(0); 2936 Register DenomHi = UnmergeDenom.getReg(1); 2937 2938 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2939 auto C1 = B.buildSExt(S32, CmpHi); 2940 2941 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2942 auto C2 = B.buildSExt(S32, CmpLo); 2943 2944 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2945 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2946 2947 // TODO: Here and below portions of the code can be enclosed into if/endif. 2948 // Currently control flow is unconditional and we have 4 selects after 2949 // potential endif to substitute PHIs. 2950 2951 // if C3 != 0 ... 2952 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2953 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2954 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2955 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2956 2957 auto One64 = B.buildConstant(S64, 1); 2958 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2959 2960 auto C4 = 2961 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2962 auto C5 = 2963 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2964 auto C6 = B.buildSelect( 2965 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2966 2967 // if (C6 != 0) 2968 auto Add4 = B.buildAdd(S64, Add3, One64); 2969 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2970 2971 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2972 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2973 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2974 2975 // endif C6 2976 // endif C3 2977 2978 if (IsDiv) { 2979 auto Sel1 = B.buildSelect( 2980 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2981 B.buildSelect(DstReg, 2982 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2983 } else { 2984 auto Sel2 = B.buildSelect( 2985 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2986 B.buildSelect(DstReg, 2987 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2988 } 2989 } 2990 2991 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2992 MachineRegisterInfo &MRI, 2993 MachineIRBuilder &B) const { 2994 const LLT S64 = LLT::scalar(64); 2995 const LLT S32 = LLT::scalar(32); 2996 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2997 Register DstReg = MI.getOperand(0).getReg(); 2998 Register Num = MI.getOperand(1).getReg(); 2999 Register Den = MI.getOperand(2).getReg(); 3000 LLT Ty = MRI.getType(DstReg); 3001 3002 if (Ty == S32) 3003 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3004 else if (Ty == S64) 3005 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3006 else 3007 return false; 3008 3009 MI.eraseFromParent(); 3010 return true; 3011 3012 } 3013 3014 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 
3015 MachineRegisterInfo &MRI, 3016 MachineIRBuilder &B) const { 3017 const LLT S64 = LLT::scalar(64); 3018 const LLT S32 = LLT::scalar(32); 3019 3020 Register DstReg = MI.getOperand(0).getReg(); 3021 const LLT Ty = MRI.getType(DstReg); 3022 if (Ty != S32 && Ty != S64) 3023 return false; 3024 3025 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3026 3027 Register LHS = MI.getOperand(1).getReg(); 3028 Register RHS = MI.getOperand(2).getReg(); 3029 3030 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3031 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3032 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3033 3034 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3035 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3036 3037 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3038 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3039 3040 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3041 if (Ty == S32) 3042 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3043 else 3044 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3045 3046 Register Sign; 3047 if (IsDiv) 3048 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3049 else 3050 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3051 3052 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3053 B.buildSub(DstReg, UDivRem, Sign); 3054 3055 MI.eraseFromParent(); 3056 return true; 3057 } 3058 3059 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3060 MachineRegisterInfo &MRI, 3061 MachineIRBuilder &B) const { 3062 Register Res = MI.getOperand(0).getReg(); 3063 Register LHS = MI.getOperand(1).getReg(); 3064 Register RHS = MI.getOperand(2).getReg(); 3065 uint16_t Flags = MI.getFlags(); 3066 LLT ResTy = MRI.getType(Res); 3067 3068 const MachineFunction &MF = B.getMF(); 3069 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3070 MI.getFlag(MachineInstr::FmAfn); 3071 3072 if (!AllowInaccurateRcp) 3073 return false; 3074 3075 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3076 // 1 / x -> RCP(x) 3077 if (CLHS->isExactlyValue(1.0)) { 3078 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3079 .addUse(RHS) 3080 .setMIFlags(Flags); 3081 3082 MI.eraseFromParent(); 3083 return true; 3084 } 3085 3086 // -1 / x -> RCP( FNEG(x) ) 3087 if (CLHS->isExactlyValue(-1.0)) { 3088 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3089 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3090 .addUse(FNeg.getReg(0)) 3091 .setMIFlags(Flags); 3092 3093 MI.eraseFromParent(); 3094 return true; 3095 } 3096 } 3097 3098 // x / y -> x * (1.0 / y) 3099 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3100 .addUse(RHS) 3101 .setMIFlags(Flags); 3102 B.buildFMul(Res, LHS, RCP, Flags); 3103 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3109 MachineRegisterInfo &MRI, 3110 MachineIRBuilder &B) const { 3111 Register Res = MI.getOperand(0).getReg(); 3112 Register X = MI.getOperand(1).getReg(); 3113 Register Y = MI.getOperand(2).getReg(); 3114 uint16_t Flags = MI.getFlags(); 3115 LLT ResTy = MRI.getType(Res); 3116 3117 const MachineFunction &MF = B.getMF(); 3118 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3119 MI.getFlag(MachineInstr::FmAfn); 3120 3121 if (!AllowInaccurateRcp) 3122 return false; 3123 3124 auto NegY = B.buildFNeg(ResTy, Y); 3125 auto One = B.buildFConstant(ResTy, 1.0); 3126 3127 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3128 .addUse(Y) 3129 
.setMIFlags(Flags); 3130 3131 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3132 R = B.buildFMA(ResTy, Tmp0, R, R); 3133 3134 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3135 R = B.buildFMA(ResTy, Tmp1, R, R); 3136 3137 auto Ret = B.buildFMul(ResTy, X, R); 3138 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3139 3140 B.buildFMA(Res, Tmp2, R, Ret); 3141 MI.eraseFromParent(); 3142 return true; 3143 } 3144 3145 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3146 MachineRegisterInfo &MRI, 3147 MachineIRBuilder &B) const { 3148 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3149 return true; 3150 3151 Register Res = MI.getOperand(0).getReg(); 3152 Register LHS = MI.getOperand(1).getReg(); 3153 Register RHS = MI.getOperand(2).getReg(); 3154 3155 uint16_t Flags = MI.getFlags(); 3156 3157 LLT S16 = LLT::scalar(16); 3158 LLT S32 = LLT::scalar(32); 3159 3160 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3161 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3162 3163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3164 .addUse(RHSExt.getReg(0)) 3165 .setMIFlags(Flags); 3166 3167 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3168 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3169 3170 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3171 .addUse(RDst.getReg(0)) 3172 .addUse(RHS) 3173 .addUse(LHS) 3174 .setMIFlags(Flags); 3175 3176 MI.eraseFromParent(); 3177 return true; 3178 } 3179 3180 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3181 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3182 static void toggleSPDenormMode(bool Enable, 3183 MachineIRBuilder &B, 3184 const GCNSubtarget &ST, 3185 AMDGPU::SIModeRegisterDefaults Mode) { 3186 // Set SP denorm mode to this value. 3187 unsigned SPDenormMode = 3188 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3189 3190 if (ST.hasDenormModeInst()) { 3191 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3192 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3193 3194 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3195 B.buildInstr(AMDGPU::S_DENORM_MODE) 3196 .addImm(NewDenormModeValue); 3197 3198 } else { 3199 // Select FP32 bit field in mode register. 
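    // hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 covers the FP32 denorm
    // controls (MODE[5:4]) and leaves the FP64/FP16 denorm field untouched.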
3200 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3201 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3202 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3203 3204 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3205 .addImm(SPDenormMode) 3206 .addImm(SPDenormModeBitField); 3207 } 3208 } 3209 3210 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3211 MachineRegisterInfo &MRI, 3212 MachineIRBuilder &B) const { 3213 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3214 return true; 3215 3216 Register Res = MI.getOperand(0).getReg(); 3217 Register LHS = MI.getOperand(1).getReg(); 3218 Register RHS = MI.getOperand(2).getReg(); 3219 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3220 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3221 3222 uint16_t Flags = MI.getFlags(); 3223 3224 LLT S32 = LLT::scalar(32); 3225 LLT S1 = LLT::scalar(1); 3226 3227 auto One = B.buildFConstant(S32, 1.0f); 3228 3229 auto DenominatorScaled = 3230 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3231 .addUse(LHS) 3232 .addUse(RHS) 3233 .addImm(0) 3234 .setMIFlags(Flags); 3235 auto NumeratorScaled = 3236 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3237 .addUse(LHS) 3238 .addUse(RHS) 3239 .addImm(1) 3240 .setMIFlags(Flags); 3241 3242 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3243 .addUse(DenominatorScaled.getReg(0)) 3244 .setMIFlags(Flags); 3245 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3246 3247 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3248 // aren't modeled as reading it. 3249 if (!Mode.allFP32Denormals()) 3250 toggleSPDenormMode(true, B, ST, Mode); 3251 3252 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3253 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3254 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3255 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3256 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3257 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3258 3259 if (!Mode.allFP32Denormals()) 3260 toggleSPDenormMode(false, B, ST, Mode); 3261 3262 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3263 .addUse(Fma4.getReg(0)) 3264 .addUse(Fma1.getReg(0)) 3265 .addUse(Fma3.getReg(0)) 3266 .addUse(NumeratorScaled.getReg(1)) 3267 .setMIFlags(Flags); 3268 3269 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3270 .addUse(Fmas.getReg(0)) 3271 .addUse(RHS) 3272 .addUse(LHS) 3273 .setMIFlags(Flags); 3274 3275 MI.eraseFromParent(); 3276 return true; 3277 } 3278 3279 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3280 MachineRegisterInfo &MRI, 3281 MachineIRBuilder &B) const { 3282 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3283 return true; 3284 3285 Register Res = MI.getOperand(0).getReg(); 3286 Register LHS = MI.getOperand(1).getReg(); 3287 Register RHS = MI.getOperand(2).getReg(); 3288 3289 uint16_t Flags = MI.getFlags(); 3290 3291 LLT S64 = LLT::scalar(64); 3292 LLT S1 = LLT::scalar(1); 3293 3294 auto One = B.buildFConstant(S64, 1.0); 3295 3296 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3297 .addUse(LHS) 3298 .addUse(RHS) 3299 .addImm(0) 3300 .setMIFlags(Flags); 3301 3302 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3303 3304 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3305 .addUse(DivScale0.getReg(0)) 3306 .setMIFlags(Flags); 3307 3308 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3309 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3310 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3311 3312 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3313 .addUse(LHS) 3314 .addUse(RHS) 3315 .addImm(1) 3316 .setMIFlags(Flags); 3317 3318 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3319 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3320 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3321 3322 Register Scale; 3323 if (!ST.hasUsableDivScaleConditionOutput()) { 3324 // Workaround a hardware bug on SI where the condition output from div_scale 3325 // is not usable. 3326 3327 LLT S32 = LLT::scalar(32); 3328 3329 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3330 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3331 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3332 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3333 3334 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3335 Scale1Unmerge.getReg(1)); 3336 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3337 Scale0Unmerge.getReg(1)); 3338 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3339 } else { 3340 Scale = DivScale1.getReg(1); 3341 } 3342 3343 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3344 .addUse(Fma4.getReg(0)) 3345 .addUse(Fma3.getReg(0)) 3346 .addUse(Mul.getReg(0)) 3347 .addUse(Scale) 3348 .setMIFlags(Flags); 3349 3350 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3351 .addUse(Fmas.getReg(0)) 3352 .addUse(RHS) 3353 .addUse(LHS) 3354 .setMIFlags(Flags); 3355 3356 MI.eraseFromParent(); 3357 return true; 3358 } 3359 3360 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3361 MachineRegisterInfo &MRI, 3362 MachineIRBuilder &B) const { 3363 Register Res = MI.getOperand(0).getReg(); 3364 Register LHS = MI.getOperand(2).getReg(); 3365 Register RHS = MI.getOperand(3).getReg(); 3366 uint16_t Flags = MI.getFlags(); 3367 3368 LLT S32 = LLT::scalar(32); 3369 LLT S1 = LLT::scalar(1); 3370 3371 auto Abs = B.buildFAbs(S32, RHS, Flags); 3372 const APFloat C0Val(1.0f); 3373 3374 auto C0 = B.buildConstant(S32, 0x6f800000); 3375 auto C1 = B.buildConstant(S32, 0x2f800000); 3376 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3377 3378 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3379 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3380 3381 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3382 3383 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3384 .addUse(Mul0.getReg(0)) 3385 .setMIFlags(Flags); 3386 3387 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3388 3389 B.buildFMul(Res, Sel, Mul1, Flags); 3390 3391 MI.eraseFromParent(); 3392 return true; 3393 } 3394 3395 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3396 // FIXME: Why do we handle this one but not other removed instructions? 3397 // 3398 // Reciprocal square root. The clamp prevents infinite results, clamping 3399 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3400 // +-max_float. 
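// Note (summary of the expansion below): on generations before
// VOLCANIC_ISLANDS the rsq_clamp instruction is still available, so the
// early-out leaves the intrinsic as-is. Otherwise plain amdgcn.rsq is emitted
// and the result is clamped against +/-max_float with fminnum/fmaxnum, using
// the IEEE variants when the function's FP mode is IEEE so the clamp selects
// directly.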
3401 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3402 MachineRegisterInfo &MRI, 3403 MachineIRBuilder &B) const { 3404 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3405 return true; 3406 3407 Register Dst = MI.getOperand(0).getReg(); 3408 Register Src = MI.getOperand(2).getReg(); 3409 auto Flags = MI.getFlags(); 3410 3411 LLT Ty = MRI.getType(Dst); 3412 3413 const fltSemantics *FltSemantics; 3414 if (Ty == LLT::scalar(32)) 3415 FltSemantics = &APFloat::IEEEsingle(); 3416 else if (Ty == LLT::scalar(64)) 3417 FltSemantics = &APFloat::IEEEdouble(); 3418 else 3419 return false; 3420 3421 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3422 .addUse(Src) 3423 .setMIFlags(Flags); 3424 3425 // We don't need to concern ourselves with the snan handling difference, since 3426 // the rsq quieted (or not) so use the one which will directly select. 3427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3428 const bool UseIEEE = MFI->getMode().IEEE; 3429 3430 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3431 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3432 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3433 3434 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3435 3436 if (UseIEEE) 3437 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3438 else 3439 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3440 MI.eraseFromParent(); 3441 return true; 3442 } 3443 3444 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3445 switch (IID) { 3446 case Intrinsic::amdgcn_ds_fadd: 3447 return AMDGPU::G_ATOMICRMW_FADD; 3448 case Intrinsic::amdgcn_ds_fmin: 3449 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3450 case Intrinsic::amdgcn_ds_fmax: 3451 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3452 default: 3453 llvm_unreachable("not a DS FP intrinsic"); 3454 } 3455 } 3456 3457 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3458 MachineInstr &MI, 3459 Intrinsic::ID IID) const { 3460 GISelChangeObserver &Observer = Helper.Observer; 3461 Observer.changingInstr(MI); 3462 3463 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3464 3465 // The remaining operands were used to set fields in the MemOperand on 3466 // construction. 3467 for (int I = 6; I > 3; --I) 3468 MI.RemoveOperand(I); 3469 3470 MI.RemoveOperand(1); // Remove the intrinsic ID. 
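// At this point only the result def, the pointer and the data value remain,
// which is the operand layout the generic atomicrmw-style opcodes expect.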
3471 Observer.changedInstr(MI); 3472 return true; 3473 } 3474 3475 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3476 MachineRegisterInfo &MRI, 3477 MachineIRBuilder &B) const { 3478 uint64_t Offset = 3479 ST.getTargetLowering()->getImplicitParameterOffset( 3480 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3481 LLT DstTy = MRI.getType(DstReg); 3482 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3483 3484 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3485 if (!loadInputValue(KernargPtrReg, B, 3486 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3487 return false; 3488 3489 // FIXME: This should be nuw 3490 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3491 return true; 3492 } 3493 3494 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3495 MachineRegisterInfo &MRI, 3496 MachineIRBuilder &B) const { 3497 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3498 if (!MFI->isEntryFunction()) { 3499 return legalizePreloadedArgIntrin(MI, MRI, B, 3500 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3501 } 3502 3503 Register DstReg = MI.getOperand(0).getReg(); 3504 if (!getImplicitArgPtr(DstReg, MRI, B)) 3505 return false; 3506 3507 MI.eraseFromParent(); 3508 return true; 3509 } 3510 3511 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3512 MachineRegisterInfo &MRI, 3513 MachineIRBuilder &B, 3514 unsigned AddrSpace) const { 3515 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3516 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3517 Register Hi32 = Unmerge.getReg(1); 3518 3519 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3520 MI.eraseFromParent(); 3521 return true; 3522 } 3523 3524 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3525 // offset (the offset that is included in bounds checking and swizzling, to be 3526 // split between the instruction's voffset and immoffset fields) and soffset 3527 // (the offset that is excluded from bounds checking and swizzling, to go in 3528 // the instruction's soffset field). This function takes the first kind of 3529 // offset and figures out how to split it between voffset and immoffset. 3530 std::tuple<Register, unsigned, unsigned> 3531 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3532 Register OrigOffset) const { 3533 const unsigned MaxImm = 4095; 3534 Register BaseReg; 3535 unsigned TotalConstOffset; 3536 const LLT S32 = LLT::scalar(32); 3537 MachineRegisterInfo &MRI = *B.getMRI(); 3538 3539 std::tie(BaseReg, TotalConstOffset) = 3540 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3541 3542 unsigned ImmOffset = TotalConstOffset; 3543 3544 // If BaseReg is a pointer, convert it to int. 3545 if (MRI.getType(BaseReg).isPointer()) 3546 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3547 3548 // If the immediate value is too big for the immoffset field, put the value 3549 // and -4096 into the immoffset field so that the value that is copied/added 3550 // for the voffset field is a multiple of 4096, and it stands more chance 3551 // of being CSEd with the copy/add for another similar load/store. 3552 // However, do not do that rounding down to a multiple of 4096 if that is a 3553 // negative number, as it appears to be illegal to have a negative offset 3554 // in the vgpr, even if adding the immediate offset makes it positive. 
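// For example, with MaxImm = 4095:
//   TotalConstOffset = 4100 -> Overflow = 4096, ImmOffset = 4
//                              (the 4096 is added to the voffset register)
//   TotalConstOffset = -16  -> the masked-off part is negative, so the whole
//                              offset goes back into the voffset register and
//                              ImmOffset ends up as 0.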
3555 unsigned Overflow = ImmOffset & ~MaxImm; 3556 ImmOffset -= Overflow; 3557 if ((int32_t)Overflow < 0) { 3558 Overflow += ImmOffset; 3559 ImmOffset = 0; 3560 } 3561 3562 if (Overflow != 0) { 3563 if (!BaseReg) { 3564 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3565 } else { 3566 auto OverflowVal = B.buildConstant(S32, Overflow); 3567 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3568 } 3569 } 3570 3571 if (!BaseReg) 3572 BaseReg = B.buildConstant(S32, 0).getReg(0); 3573 3574 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3575 } 3576 3577 /// Handle register layout difference for f16 images for some subtargets. 3578 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3579 MachineRegisterInfo &MRI, 3580 Register Reg, 3581 bool ImageStore) const { 3582 const LLT S16 = LLT::scalar(16); 3583 const LLT S32 = LLT::scalar(32); 3584 LLT StoreVT = MRI.getType(Reg); 3585 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3586 3587 if (ST.hasUnpackedD16VMem()) { 3588 auto Unmerge = B.buildUnmerge(S16, Reg); 3589 3590 SmallVector<Register, 4> WideRegs; 3591 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3592 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3593 3594 int NumElts = StoreVT.getNumElements(); 3595 3596 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3597 } 3598 3599 if (ImageStore && ST.hasImageStoreD16Bug()) { 3600 if (StoreVT.getNumElements() == 2) { 3601 SmallVector<Register, 4> PackedRegs; 3602 Reg = B.buildBitcast(S32, Reg).getReg(0); 3603 PackedRegs.push_back(Reg); 3604 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3605 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3606 } 3607 3608 if (StoreVT.getNumElements() == 3) { 3609 SmallVector<Register, 4> PackedRegs; 3610 auto Unmerge = B.buildUnmerge(S16, Reg); 3611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3612 PackedRegs.push_back(Unmerge.getReg(I)); 3613 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3614 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3615 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3616 } 3617 3618 if (StoreVT.getNumElements() == 4) { 3619 SmallVector<Register, 4> PackedRegs; 3620 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3621 auto Unmerge = B.buildUnmerge(S32, Reg); 3622 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3623 PackedRegs.push_back(Unmerge.getReg(I)); 3624 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3625 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3626 } 3627 3628 llvm_unreachable("invalid data type"); 3629 } 3630 3631 return Reg; 3632 } 3633 3634 Register AMDGPULegalizerInfo::fixStoreSourceType( 3635 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3636 MachineRegisterInfo *MRI = B.getMRI(); 3637 LLT Ty = MRI->getType(VData); 3638 3639 const LLT S16 = LLT::scalar(16); 3640 3641 // Fixup illegal register types for i8 stores. 
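// Both s8 and s16 scalars are any-extended to s32 below. Small vectors of s16
// used by format stores may instead need repacking through handleD16VData on
// subtargets with unpacked D16 memory instructions.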
3642 if (Ty == LLT::scalar(8) || Ty == S16) { 3643 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3644 return AnyExt; 3645 } 3646 3647 if (Ty.isVector()) { 3648 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3649 if (IsFormat) 3650 return handleD16VData(B, *MRI, VData); 3651 } 3652 } 3653 3654 return VData; 3655 } 3656 3657 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3658 MachineRegisterInfo &MRI, 3659 MachineIRBuilder &B, 3660 bool IsTyped, 3661 bool IsFormat) const { 3662 Register VData = MI.getOperand(1).getReg(); 3663 LLT Ty = MRI.getType(VData); 3664 LLT EltTy = Ty.getScalarType(); 3665 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3666 const LLT S32 = LLT::scalar(32); 3667 3668 VData = fixStoreSourceType(B, VData, IsFormat); 3669 Register RSrc = MI.getOperand(2).getReg(); 3670 3671 MachineMemOperand *MMO = *MI.memoperands_begin(); 3672 const int MemSize = MMO->getSize(); 3673 3674 unsigned ImmOffset; 3675 unsigned TotalOffset; 3676 3677 // The typed intrinsics add an immediate after the registers. 3678 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3679 3680 // The struct intrinsic variants add one additional operand over raw. 3681 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3682 Register VIndex; 3683 int OpOffset = 0; 3684 if (HasVIndex) { 3685 VIndex = MI.getOperand(3).getReg(); 3686 OpOffset = 1; 3687 } 3688 3689 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3690 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3691 3692 unsigned Format = 0; 3693 if (IsTyped) { 3694 Format = MI.getOperand(5 + OpOffset).getImm(); 3695 ++OpOffset; 3696 } 3697 3698 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3699 3700 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3701 if (TotalOffset != 0) 3702 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3703 3704 unsigned Opc; 3705 if (IsTyped) { 3706 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3707 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3708 } else if (IsFormat) { 3709 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3710 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3711 } else { 3712 switch (MemSize) { 3713 case 1: 3714 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3715 break; 3716 case 2: 3717 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3718 break; 3719 default: 3720 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3721 break; 3722 } 3723 } 3724 3725 if (!VIndex) 3726 VIndex = B.buildConstant(S32, 0).getReg(0); 3727 3728 auto MIB = B.buildInstr(Opc) 3729 .addUse(VData) // vdata 3730 .addUse(RSrc) // rsrc 3731 .addUse(VIndex) // vindex 3732 .addUse(VOffset) // voffset 3733 .addUse(SOffset) // soffset 3734 .addImm(ImmOffset); // offset(imm) 3735 3736 if (IsTyped) 3737 MIB.addImm(Format); 3738 3739 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3740 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3741 .addMemOperand(MMO); 3742 3743 MI.eraseFromParent(); 3744 return true; 3745 } 3746 3747 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3748 MachineRegisterInfo &MRI, 3749 MachineIRBuilder &B, 3750 bool IsFormat, 3751 bool IsTyped) const { 3752 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
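// The operand layout of the raw variants is:
//   dst, intrinsic-id, rsrc, voffset, soffset, [format,] aux
// The struct variants insert a vindex operand after rsrc, so the total operand
// count matching 7 (or 8 for the typed forms) identifies them below.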
3753 MachineMemOperand *MMO = *MI.memoperands_begin(); 3754 const int MemSize = MMO->getSize(); 3755 const LLT S32 = LLT::scalar(32); 3756 3757 Register Dst = MI.getOperand(0).getReg(); 3758 Register RSrc = MI.getOperand(2).getReg(); 3759 3760 // The typed intrinsics add an immediate after the registers. 3761 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3762 3763 // The struct intrinsic variants add one additional operand over raw. 3764 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3765 Register VIndex; 3766 int OpOffset = 0; 3767 if (HasVIndex) { 3768 VIndex = MI.getOperand(3).getReg(); 3769 OpOffset = 1; 3770 } 3771 3772 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3773 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3774 3775 unsigned Format = 0; 3776 if (IsTyped) { 3777 Format = MI.getOperand(5 + OpOffset).getImm(); 3778 ++OpOffset; 3779 } 3780 3781 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3782 unsigned ImmOffset; 3783 unsigned TotalOffset; 3784 3785 LLT Ty = MRI.getType(Dst); 3786 LLT EltTy = Ty.getScalarType(); 3787 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3788 const bool Unpacked = ST.hasUnpackedD16VMem(); 3789 3790 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3791 if (TotalOffset != 0) 3792 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3793 3794 unsigned Opc; 3795 3796 if (IsTyped) { 3797 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3798 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3799 } else if (IsFormat) { 3800 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3801 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3802 } else { 3803 switch (MemSize) { 3804 case 1: 3805 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3806 break; 3807 case 2: 3808 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3809 break; 3810 default: 3811 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3812 break; 3813 } 3814 } 3815 3816 Register LoadDstReg; 3817 3818 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3819 LLT UnpackedTy = Ty.changeElementSize(32); 3820 3821 if (IsExtLoad) 3822 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3823 else if (Unpacked && IsD16 && Ty.isVector()) 3824 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3825 else 3826 LoadDstReg = Dst; 3827 3828 if (!VIndex) 3829 VIndex = B.buildConstant(S32, 0).getReg(0); 3830 3831 auto MIB = B.buildInstr(Opc) 3832 .addDef(LoadDstReg) // vdata 3833 .addUse(RSrc) // rsrc 3834 .addUse(VIndex) // vindex 3835 .addUse(VOffset) // voffset 3836 .addUse(SOffset) // soffset 3837 .addImm(ImmOffset); // offset(imm) 3838 3839 if (IsTyped) 3840 MIB.addImm(Format); 3841 3842 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3843 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3844 .addMemOperand(MMO); 3845 3846 if (LoadDstReg != Dst) { 3847 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3848 3849 // Widen result for extending loads was widened. 
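// (That is, the buffer load above was emitted into a wider register; the code
// below truncates scalar extending loads back to the original type and repacks
// unpacked D16 vector results.)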
3850 if (IsExtLoad) 3851 B.buildTrunc(Dst, LoadDstReg); 3852 else { 3853 // Repack to original 16-bit vector result 3854 // FIXME: G_TRUNC should work, but legalization currently fails 3855 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3856 SmallVector<Register, 4> Repack; 3857 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3858 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3859 B.buildMerge(Dst, Repack); 3860 } 3861 } 3862 3863 MI.eraseFromParent(); 3864 return true; 3865 } 3866 3867 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3868 MachineIRBuilder &B, 3869 bool IsInc) const { 3870 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3871 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3872 B.buildInstr(Opc) 3873 .addDef(MI.getOperand(0).getReg()) 3874 .addUse(MI.getOperand(2).getReg()) 3875 .addUse(MI.getOperand(3).getReg()) 3876 .cloneMemRefs(MI); 3877 MI.eraseFromParent(); 3878 return true; 3879 } 3880 3881 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3882 switch (IntrID) { 3883 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3884 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3886 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3887 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3888 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3889 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3890 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3891 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3892 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3893 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3894 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3895 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3896 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3898 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3899 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3901 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3902 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3903 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3904 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3905 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3906 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3907 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3908 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3909 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3910 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3911 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3913 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3914 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3916 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3917 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3918 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3919 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3920 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3921 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3922 case Intrinsic::amdgcn_buffer_atomic_fadd: 3923 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3924 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3926 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3927 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3929 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3930 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3931 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3932 default: 3933 llvm_unreachable("unhandled atomic opcode"); 3934 } 3935 } 3936 3937 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3938 MachineIRBuilder &B, 3939 Intrinsic::ID IID) const { 3940 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3941 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3942 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3943 3944 Register Dst; 3945 3946 int OpOffset = 0; 3947 if (HasReturn) { 3948 // A few FP atomics do not support return values. 3949 Dst = MI.getOperand(0).getReg(); 3950 } else { 3951 OpOffset = -1; 3952 } 3953 3954 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3955 Register CmpVal; 3956 3957 if (IsCmpSwap) { 3958 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3959 ++OpOffset; 3960 } 3961 3962 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3963 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3964 3965 // The struct intrinsic variants add one additional operand over raw. 3966 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3967 Register VIndex; 3968 if (HasVIndex) { 3969 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3970 ++OpOffset; 3971 } 3972 3973 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3974 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3975 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3976 3977 MachineMemOperand *MMO = *MI.memoperands_begin(); 3978 3979 unsigned ImmOffset; 3980 unsigned TotalOffset; 3981 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3982 if (TotalOffset != 0) 3983 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3984 3985 if (!VIndex) 3986 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3987 3988 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3989 3990 if (HasReturn) 3991 MIB.addDef(Dst); 3992 3993 MIB.addUse(VData); // vdata 3994 3995 if (IsCmpSwap) 3996 MIB.addReg(CmpVal); 3997 3998 MIB.addUse(RSrc) // rsrc 3999 .addUse(VIndex) // vindex 4000 .addUse(VOffset) // voffset 4001 .addUse(SOffset) // soffset 4002 .addImm(ImmOffset) // offset(imm) 4003 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4004 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4005 .addMemOperand(MMO); 4006 4007 MI.eraseFromParent(); 4008 return true; 4009 } 4010 4011 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4012 /// vector with s16 typed elements. 4013 static void packImageA16AddressToDwords( 4014 MachineIRBuilder &B, MachineInstr &MI, 4015 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4016 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4017 const LLT S16 = LLT::scalar(16); 4018 const LLT V2S16 = LLT::vector(2, 16); 4019 4020 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4021 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4022 if (!SrcOp.isReg()) 4023 continue; // _L to _LZ may have eliminated this. 4024 4025 Register AddrReg = SrcOp.getReg(); 4026 4027 if (I < Intr->GradientStart) { 4028 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4029 PackedAddrs.push_back(AddrReg); 4030 } else { 4031 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4032 // derivatives dx/dh and dx/dv are packed with undef. 
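// For example, a 1D sample with derivatives has NumGradients == 2 (odd when
// halved), so dx/dh and dx/dv each occupy their own V2S16 with an undef high
// half, while in 2D the four gradient values pack pairwise as normal.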
4033 if (((I + 1) >= EndIdx) || 4034 ((Intr->NumGradients / 2) % 2 == 1 && 4035 (I == static_cast<unsigned>(Intr->GradientStart + 4036 (Intr->NumGradients / 2) - 1) || 4037 I == static_cast<unsigned>(Intr->GradientStart + 4038 Intr->NumGradients - 1))) || 4039 // Check for _L to _LZ optimization 4040 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4041 PackedAddrs.push_back( 4042 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4043 .getReg(0)); 4044 } else { 4045 PackedAddrs.push_back( 4046 B.buildBuildVector( 4047 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4048 .getReg(0)); 4049 ++I; 4050 } 4051 } 4052 } 4053 } 4054 4055 /// Convert from separate vaddr components to a single vector address register, 4056 /// and replace the remaining operands with $noreg. 4057 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4058 int DimIdx, int NumVAddrs) { 4059 const LLT S32 = LLT::scalar(32); 4060 4061 SmallVector<Register, 8> AddrRegs; 4062 for (int I = 0; I != NumVAddrs; ++I) { 4063 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4064 if (SrcOp.isReg()) { 4065 AddrRegs.push_back(SrcOp.getReg()); 4066 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4067 } 4068 } 4069 4070 int NumAddrRegs = AddrRegs.size(); 4071 if (NumAddrRegs != 1) { 4072 // Round up to 8 elements for v5-v7 4073 // FIXME: Missing intermediate sized register classes and instructions. 4074 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4075 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4076 auto Undef = B.buildUndef(S32); 4077 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4078 NumAddrRegs = RoundedNumRegs; 4079 } 4080 4081 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4082 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4083 } 4084 4085 for (int I = 1; I != NumVAddrs; ++I) { 4086 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4087 if (SrcOp.isReg()) 4088 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4089 } 4090 } 4091 4092 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4093 /// 4094 /// Depending on the subtarget, load/store with 16-bit element data need to be 4095 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4096 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4097 /// registers. 4098 /// 4099 /// We don't want to directly select image instructions just yet, but also want 4100 /// to exposes all register repacking to the legalizer/combiners. We also don't 4101 /// want a selected instrution entering RegBankSelect. In order to avoid 4102 /// defining a multitude of intermediate image instructions, directly hack on 4103 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4104 /// now unnecessary arguments with $noreg. 4105 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4106 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4107 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4108 4109 const unsigned NumDefs = MI.getNumExplicitDefs(); 4110 const unsigned ArgOffset = NumDefs + 1; 4111 bool IsTFE = NumDefs == 2; 4112 // We are only processing the operands of d16 image operations on subtargets 4113 // that use the unpacked register layout, or need to repack the TFE result. 4114 4115 // TODO: Do we need to guard against already legalized intrinsics? 
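// The general flow below: pack 16-bit addresses and gradients into dwords
// where required, fix up the d16 data register layout for the subtarget, and
// widen the result to also cover the TFE error dword when it is requested.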
4116 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4117 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4118 4119 MachineRegisterInfo *MRI = B.getMRI(); 4120 const LLT S32 = LLT::scalar(32); 4121 const LLT S16 = LLT::scalar(16); 4122 const LLT V2S16 = LLT::vector(2, 16); 4123 4124 unsigned DMask = 0; 4125 4126 // Check for 16 bit addresses and pack if true. 4127 LLT GradTy = 4128 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4129 LLT AddrTy = 4130 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4131 const bool IsG16 = GradTy == S16; 4132 const bool IsA16 = AddrTy == S16; 4133 4134 int DMaskLanes = 0; 4135 if (!BaseOpcode->Atomic) { 4136 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4137 if (BaseOpcode->Gather4) { 4138 DMaskLanes = 4; 4139 } else if (DMask != 0) { 4140 DMaskLanes = countPopulation(DMask); 4141 } else if (!IsTFE && !BaseOpcode->Store) { 4142 // If dmask is 0, this is a no-op load. This can be eliminated. 4143 B.buildUndef(MI.getOperand(0)); 4144 MI.eraseFromParent(); 4145 return true; 4146 } 4147 } 4148 4149 Observer.changingInstr(MI); 4150 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4151 4152 unsigned NewOpcode = NumDefs == 0 ? 4153 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4154 4155 // Track that we legalized this 4156 MI.setDesc(B.getTII().get(NewOpcode)); 4157 4158 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4159 // dmask to be at least 1 otherwise the instruction will fail 4160 if (IsTFE && DMask == 0) { 4161 DMask = 0x1; 4162 DMaskLanes = 1; 4163 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4164 } 4165 4166 if (BaseOpcode->Atomic) { 4167 Register VData0 = MI.getOperand(2).getReg(); 4168 LLT Ty = MRI->getType(VData0); 4169 4170 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4171 if (Ty.isVector()) 4172 return false; 4173 4174 if (BaseOpcode->AtomicX2) { 4175 Register VData1 = MI.getOperand(3).getReg(); 4176 // The two values are packed in one register. 4177 LLT PackedTy = LLT::vector(2, Ty); 4178 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4179 MI.getOperand(2).setReg(Concat.getReg(0)); 4180 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4181 } 4182 } 4183 4184 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4185 4186 // Optimize _L to _LZ when _L is zero 4187 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4188 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4189 const ConstantFP *ConstantLod; 4190 4191 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4192 m_GFCst(ConstantLod))) { 4193 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4194 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4195 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4196 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4197 Intr->Dim); 4198 4199 // The starting indexes should remain in the same place. 
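// Removing the constant-zero LOD argument leaves one fewer vaddr operand to
// pack, which is why CorrectedNumVAddrs is decremented; that count feeds the
// NSA decision further down.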
4200 --CorrectedNumVAddrs; 4201 4202 MI.getOperand(MI.getNumExplicitDefs()) 4203 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4204 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4205 Intr = NewImageDimIntr; 4206 } 4207 } 4208 } 4209 4210 // Optimize _mip away, when 'lod' is zero 4211 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4212 int64_t ConstantLod; 4213 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4214 m_ICst(ConstantLod))) { 4215 if (ConstantLod == 0) { 4216 // TODO: Change intrinsic opcode and remove operand instead or replacing 4217 // it with 0, as the _L to _LZ handling is done above. 4218 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4219 --CorrectedNumVAddrs; 4220 } 4221 } 4222 } 4223 4224 // Rewrite the addressing register layout before doing anything else. 4225 if (IsA16 || IsG16) { 4226 if (IsA16) { 4227 // Target must support the feature and gradients need to be 16 bit too 4228 if (!ST.hasA16() || !IsG16) 4229 return false; 4230 } else if (!ST.hasG16()) 4231 return false; 4232 4233 if (Intr->NumVAddrs > 1) { 4234 SmallVector<Register, 4> PackedRegs; 4235 // Don't compress addresses for G16 4236 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4237 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4238 PackEndIdx); 4239 4240 if (!IsA16) { 4241 // Add uncompressed address 4242 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4243 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4244 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4245 PackedRegs.push_back(AddrReg); 4246 } 4247 } 4248 4249 // See also below in the non-a16 branch 4250 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4251 4252 if (!UseNSA && PackedRegs.size() > 1) { 4253 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4254 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4255 PackedRegs[0] = Concat.getReg(0); 4256 PackedRegs.resize(1); 4257 } 4258 4259 const unsigned NumPacked = PackedRegs.size(); 4260 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4261 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4262 if (!SrcOp.isReg()) { 4263 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4264 continue; 4265 } 4266 4267 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4268 4269 if (I - Intr->VAddrStart < NumPacked) 4270 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4271 else 4272 SrcOp.setReg(AMDGPU::NoRegister); 4273 } 4274 } 4275 } else { 4276 // If the register allocator cannot place the address registers contiguously 4277 // without introducing moves, then using the non-sequential address encoding 4278 // is always preferable, since it saves VALU instructions and is usually a 4279 // wash in terms of code size or even better. 4280 // 4281 // However, we currently have no way of hinting to the register allocator 4282 // that MIMG addresses should be placed contiguously when it is possible to 4283 // do so, so force non-NSA for the common 2-address case as a heuristic. 4284 // 4285 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4286 // allocation when possible. 
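// Hence the >= 3 threshold below: with only two address registers the packed
// encoding is kept, and larger counts use NSA when the subtarget supports it.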
4287 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4288 4289 if (!UseNSA && Intr->NumVAddrs > 1) 4290 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4291 Intr->NumVAddrs); 4292 } 4293 4294 int Flags = 0; 4295 if (IsA16) 4296 Flags |= 1; 4297 if (IsG16) 4298 Flags |= 2; 4299 MI.addOperand(MachineOperand::CreateImm(Flags)); 4300 4301 if (BaseOpcode->Store) { // No TFE for stores? 4302 // TODO: Handle dmask trim 4303 Register VData = MI.getOperand(1).getReg(); 4304 LLT Ty = MRI->getType(VData); 4305 if (!Ty.isVector() || Ty.getElementType() != S16) 4306 return true; 4307 4308 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4309 if (RepackedReg != VData) { 4310 MI.getOperand(1).setReg(RepackedReg); 4311 } 4312 4313 return true; 4314 } 4315 4316 Register DstReg = MI.getOperand(0).getReg(); 4317 LLT Ty = MRI->getType(DstReg); 4318 const LLT EltTy = Ty.getScalarType(); 4319 const bool IsD16 = Ty.getScalarType() == S16; 4320 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4321 4322 // Confirm that the return type is large enough for the dmask specified 4323 if (NumElts < DMaskLanes) 4324 return false; 4325 4326 if (NumElts > 4 || DMaskLanes > 4) 4327 return false; 4328 4329 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4330 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4331 4332 // The raw dword aligned data component of the load. The only legal cases 4333 // where this matters should be when using the packed D16 format, for 4334 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4335 LLT RoundedTy; 4336 4337 // S32 vector to to cover all data, plus TFE result element. 4338 LLT TFETy; 4339 4340 // Register type to use for each loaded component. Will be S32 or V2S16. 4341 LLT RegTy; 4342 4343 if (IsD16 && ST.hasUnpackedD16VMem()) { 4344 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4345 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4346 RegTy = S32; 4347 } else { 4348 unsigned EltSize = EltTy.getSizeInBits(); 4349 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4350 unsigned RoundedSize = 32 * RoundedElts; 4351 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4352 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4353 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4354 } 4355 4356 // The return type does not need adjustment. 4357 // TODO: Should we change s16 case to s32 or <2 x s16>? 4358 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4359 return true; 4360 4361 Register Dst1Reg; 4362 4363 // Insert after the instruction. 4364 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4365 4366 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4367 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4368 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4369 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4370 4371 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4372 4373 MI.getOperand(0).setReg(NewResultReg); 4374 4375 // In the IR, TFE is supposed to be used with a 2 element struct return 4376 // type. The intruction really returns these two values in one contiguous 4377 // register, with one additional dword beyond the loaded data. Rewrite the 4378 // return type to use a single register result. 4379 4380 if (IsTFE) { 4381 Dst1Reg = MI.getOperand(1).getReg(); 4382 if (MRI->getType(Dst1Reg) != S32) 4383 return false; 4384 4385 // TODO: Make sure the TFE operand bit is set. 
4386 MI.RemoveOperand(1); 4387 4388 // Handle the easy case that requires no repack instructions. 4389 if (Ty == S32) { 4390 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4391 return true; 4392 } 4393 } 4394 4395 // Now figure out how to copy the new result register back into the old 4396 // result. 4397 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4398 4399 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4400 4401 if (ResultNumRegs == 1) { 4402 assert(!IsTFE); 4403 ResultRegs[0] = NewResultReg; 4404 } else { 4405 // We have to repack into a new vector of some kind. 4406 for (int I = 0; I != NumDataRegs; ++I) 4407 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4408 B.buildUnmerge(ResultRegs, NewResultReg); 4409 4410 // Drop the final TFE element to get the data part. The TFE result is 4411 // directly written to the right place already. 4412 if (IsTFE) 4413 ResultRegs.resize(NumDataRegs); 4414 } 4415 4416 // For an s16 scalar result, we form an s32 result with a truncate regardless 4417 // of packed vs. unpacked. 4418 if (IsD16 && !Ty.isVector()) { 4419 B.buildTrunc(DstReg, ResultRegs[0]); 4420 return true; 4421 } 4422 4423 // Avoid a build/concat_vector of 1 entry. 4424 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4425 B.buildBitcast(DstReg, ResultRegs[0]); 4426 return true; 4427 } 4428 4429 assert(Ty.isVector()); 4430 4431 if (IsD16) { 4432 // For packed D16 results with TFE enabled, all the data components are 4433 // S32. Cast back to the expected type. 4434 // 4435 // TODO: We don't really need to use load s32 elements. We would only need one 4436 // cast for the TFE result if a multiple of v2s16 was used. 4437 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4438 for (Register &Reg : ResultRegs) 4439 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4440 } else if (ST.hasUnpackedD16VMem()) { 4441 for (Register &Reg : ResultRegs) 4442 Reg = B.buildTrunc(S16, Reg).getReg(0); 4443 } 4444 } 4445 4446 auto padWithUndef = [&](LLT Ty, int NumElts) { 4447 if (NumElts == 0) 4448 return; 4449 Register Undef = B.buildUndef(Ty).getReg(0); 4450 for (int I = 0; I != NumElts; ++I) 4451 ResultRegs.push_back(Undef); 4452 }; 4453 4454 // Pad out any elements eliminated due to the dmask. 4455 LLT ResTy = MRI->getType(ResultRegs[0]); 4456 if (!ResTy.isVector()) { 4457 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4458 B.buildBuildVector(DstReg, ResultRegs); 4459 return true; 4460 } 4461 4462 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4463 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4464 4465 // Deal with the one annoying legal case. 
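// A <3 x s16> result is produced by padding the repacked registers out to
// cover <6 x s16>, concatenating, and unmerging into the real destination plus
// a dead <3 x s16> half.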
4466 const LLT V3S16 = LLT::vector(3, 16); 4467 if (Ty == V3S16) { 4468 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4469 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4470 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4471 return true; 4472 } 4473 4474 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4475 B.buildConcatVectors(DstReg, ResultRegs); 4476 return true; 4477 } 4478 4479 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4480 LegalizerHelper &Helper, MachineInstr &MI) const { 4481 MachineIRBuilder &B = Helper.MIRBuilder; 4482 GISelChangeObserver &Observer = Helper.Observer; 4483 4484 Register Dst = MI.getOperand(0).getReg(); 4485 LLT Ty = B.getMRI()->getType(Dst); 4486 unsigned Size = Ty.getSizeInBits(); 4487 MachineFunction &MF = B.getMF(); 4488 4489 Observer.changingInstr(MI); 4490 4491 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4492 Ty = getBitcastRegisterType(Ty); 4493 Helper.bitcastDst(MI, Ty, 0); 4494 Dst = MI.getOperand(0).getReg(); 4495 B.setInsertPt(B.getMBB(), MI); 4496 } 4497 4498 // FIXME: We don't really need this intermediate instruction. The intrinsic 4499 // should be fixed to have a memory operand. Since it's readnone, we're not 4500 // allowed to add one. 4501 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4502 MI.RemoveOperand(1); // Remove intrinsic ID 4503 4504 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4505 // TODO: Should this use datalayout alignment? 4506 const unsigned MemSize = (Size + 7) / 8; 4507 const Align MemAlign(4); 4508 MachineMemOperand *MMO = MF.getMachineMemOperand( 4509 MachinePointerInfo(), 4510 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4511 MachineMemOperand::MOInvariant, 4512 MemSize, MemAlign); 4513 MI.addMemOperand(MF, MMO); 4514 4515 // There are no 96-bit result scalar loads, but widening to 128-bit should 4516 // always be legal. We may need to restore this to a 96-bit result if it turns 4517 // out this needs to be converted to a vector load during RegBankSelect. 4518 if (!isPowerOf2_32(Size)) { 4519 if (Ty.isVector()) 4520 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4521 else 4522 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4523 } 4524 4525 Observer.changedInstr(MI); 4526 return true; 4527 } 4528 4529 // TODO: Move to selection 4530 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4531 MachineRegisterInfo &MRI, 4532 MachineIRBuilder &B) const { 4533 if (!ST.isTrapHandlerEnabled() || 4534 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4535 return legalizeTrapEndpgm(MI, MRI, B); 4536 4537 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4538 switch (*HsaAbiVer) { 4539 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4540 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4541 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4542 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4543 return ST.supportsGetDoorbellID() ? 
4544 legalizeTrapHsa(MI, MRI, B) : 4545 legalizeTrapHsaQueuePtr(MI, MRI, B); 4546 } 4547 } 4548 4549 llvm_unreachable("Unknown trap handler"); 4550 } 4551 4552 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4553 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4554 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4555 MI.eraseFromParent(); 4556 return true; 4557 } 4558 4559 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4560 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4561 // Pass queue pointer to trap handler as input, and insert trap instruction 4562 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4563 Register LiveIn = 4564 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4565 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4566 return false; 4567 4568 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4569 B.buildCopy(SGPR01, LiveIn); 4570 B.buildInstr(AMDGPU::S_TRAP) 4571 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4572 .addReg(SGPR01, RegState::Implicit); 4573 4574 MI.eraseFromParent(); 4575 return true; 4576 } 4577 4578 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4579 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4580 B.buildInstr(AMDGPU::S_TRAP) 4581 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4582 MI.eraseFromParent(); 4583 return true; 4584 } 4585 4586 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4587 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4588 // Is non-HSA path or trap-handler disabled? then, report a warning 4589 // accordingly 4590 if (!ST.isTrapHandlerEnabled() || 4591 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4592 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4593 "debugtrap handler not supported", 4594 MI.getDebugLoc(), DS_Warning); 4595 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4596 Ctx.diagnose(NoTrap); 4597 } else { 4598 // Insert debug-trap instruction 4599 B.buildInstr(AMDGPU::S_TRAP) 4600 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4601 } 4602 4603 MI.eraseFromParent(); 4604 return true; 4605 } 4606 4607 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4608 MachineIRBuilder &B) const { 4609 MachineRegisterInfo &MRI = *B.getMRI(); 4610 const LLT S16 = LLT::scalar(16); 4611 const LLT S32 = LLT::scalar(32); 4612 4613 Register DstReg = MI.getOperand(0).getReg(); 4614 Register NodePtr = MI.getOperand(2).getReg(); 4615 Register RayExtent = MI.getOperand(3).getReg(); 4616 Register RayOrigin = MI.getOperand(4).getReg(); 4617 Register RayDir = MI.getOperand(5).getReg(); 4618 Register RayInvDir = MI.getOperand(6).getReg(); 4619 Register TDescr = MI.getOperand(7).getReg(); 4620 4621 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4622 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4623 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4624 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4625 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4626 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4627 4628 SmallVector<Register, 12> Ops; 4629 if (Is64) { 4630 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4631 Ops.push_back(Unmerge.getReg(0)); 4632 Ops.push_back(Unmerge.getReg(1)); 4633 } else { 4634 Ops.push_back(NodePtr); 4635 } 4636 Ops.push_back(RayExtent); 4637 4638 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4639 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4640 Ops.push_back(Unmerge.getReg(0)); 4641 Ops.push_back(Unmerge.getReg(1)); 4642 Ops.push_back(Unmerge.getReg(2)); 4643 }; 4644 4645 packLanes(RayOrigin); 4646 if (IsA16) { 4647 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4648 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4649 Register R1 = MRI.createGenericVirtualRegister(S32); 4650 Register R2 = MRI.createGenericVirtualRegister(S32); 4651 Register R3 = MRI.createGenericVirtualRegister(S32); 4652 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4653 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4654 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4655 Ops.push_back(R1); 4656 Ops.push_back(R2); 4657 Ops.push_back(R3); 4658 } else { 4659 packLanes(RayDir); 4660 packLanes(RayInvDir); 4661 } 4662 4663 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4664 .addDef(DstReg) 4665 .addImm(Opcode); 4666 4667 for (Register R : Ops) { 4668 MIB.addUse(R); 4669 } 4670 4671 MIB.addUse(TDescr) 4672 .addImm(IsA16 ? 1 : 0) 4673 .cloneMemRefs(MI); 4674 4675 MI.eraseFromParent(); 4676 return true; 4677 } 4678 4679 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4680 MachineInstr &MI) const { 4681 MachineIRBuilder &B = Helper.MIRBuilder; 4682 MachineRegisterInfo &MRI = *B.getMRI(); 4683 4684 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4685 auto IntrID = MI.getIntrinsicID(); 4686 switch (IntrID) { 4687 case Intrinsic::amdgcn_if: 4688 case Intrinsic::amdgcn_else: { 4689 MachineInstr *Br = nullptr; 4690 MachineBasicBlock *UncondBrTarget = nullptr; 4691 bool Negated = false; 4692 if (MachineInstr *BrCond = 4693 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4694 const SIRegisterInfo *TRI 4695 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4696 4697 Register Def = MI.getOperand(1).getReg(); 4698 Register Use = MI.getOperand(3).getReg(); 4699 4700 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4701 4702 if (Negated) 4703 std::swap(CondBrTarget, UncondBrTarget); 4704 4705 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4706 if (IntrID == Intrinsic::amdgcn_if) { 4707 B.buildInstr(AMDGPU::SI_IF) 4708 .addDef(Def) 4709 .addUse(Use) 4710 .addMBB(UncondBrTarget); 4711 } else { 4712 B.buildInstr(AMDGPU::SI_ELSE) 4713 .addDef(Def) 4714 .addUse(Use) 4715 .addMBB(UncondBrTarget); 4716 } 4717 4718 if (Br) { 4719 Br->getOperand(0).setMBB(CondBrTarget); 4720 } else { 4721 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4722 // since we're swapping branch targets it needs to be reinserted. 
4723 // FIXME: IRTranslator should probably not do this 4724 B.buildBr(*CondBrTarget); 4725 } 4726 4727 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4728 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4729 MI.eraseFromParent(); 4730 BrCond->eraseFromParent(); 4731 return true; 4732 } 4733 4734 return false; 4735 } 4736 case Intrinsic::amdgcn_loop: { 4737 MachineInstr *Br = nullptr; 4738 MachineBasicBlock *UncondBrTarget = nullptr; 4739 bool Negated = false; 4740 if (MachineInstr *BrCond = 4741 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4742 const SIRegisterInfo *TRI 4743 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4744 4745 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4746 Register Reg = MI.getOperand(2).getReg(); 4747 4748 if (Negated) 4749 std::swap(CondBrTarget, UncondBrTarget); 4750 4751 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4752 B.buildInstr(AMDGPU::SI_LOOP) 4753 .addUse(Reg) 4754 .addMBB(UncondBrTarget); 4755 4756 if (Br) 4757 Br->getOperand(0).setMBB(CondBrTarget); 4758 else 4759 B.buildBr(*CondBrTarget); 4760 4761 MI.eraseFromParent(); 4762 BrCond->eraseFromParent(); 4763 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4764 return true; 4765 } 4766 4767 return false; 4768 } 4769 case Intrinsic::amdgcn_kernarg_segment_ptr: 4770 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4771 // This only makes sense to call in a kernel, so just lower to null. 4772 B.buildConstant(MI.getOperand(0).getReg(), 0); 4773 MI.eraseFromParent(); 4774 return true; 4775 } 4776 4777 return legalizePreloadedArgIntrin( 4778 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4779 case Intrinsic::amdgcn_implicitarg_ptr: 4780 return legalizeImplicitArgPtr(MI, MRI, B); 4781 case Intrinsic::amdgcn_workitem_id_x: 4782 return legalizePreloadedArgIntrin(MI, MRI, B, 4783 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4784 case Intrinsic::amdgcn_workitem_id_y: 4785 return legalizePreloadedArgIntrin(MI, MRI, B, 4786 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4787 case Intrinsic::amdgcn_workitem_id_z: 4788 return legalizePreloadedArgIntrin(MI, MRI, B, 4789 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4790 case Intrinsic::amdgcn_workgroup_id_x: 4791 return legalizePreloadedArgIntrin(MI, MRI, B, 4792 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4793 case Intrinsic::amdgcn_workgroup_id_y: 4794 return legalizePreloadedArgIntrin(MI, MRI, B, 4795 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4796 case Intrinsic::amdgcn_workgroup_id_z: 4797 return legalizePreloadedArgIntrin(MI, MRI, B, 4798 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4799 case Intrinsic::amdgcn_dispatch_ptr: 4800 return legalizePreloadedArgIntrin(MI, MRI, B, 4801 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4802 case Intrinsic::amdgcn_queue_ptr: 4803 return legalizePreloadedArgIntrin(MI, MRI, B, 4804 AMDGPUFunctionArgInfo::QUEUE_PTR); 4805 case Intrinsic::amdgcn_implicit_buffer_ptr: 4806 return legalizePreloadedArgIntrin( 4807 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4808 case Intrinsic::amdgcn_dispatch_id: 4809 return legalizePreloadedArgIntrin(MI, MRI, B, 4810 AMDGPUFunctionArgInfo::DISPATCH_ID); 4811 case Intrinsic::amdgcn_fdiv_fast: 4812 return legalizeFDIVFastIntrin(MI, MRI, B); 4813 case Intrinsic::amdgcn_is_shared: 4814 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4815 case Intrinsic::amdgcn_is_private: 4816 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4817 case Intrinsic::amdgcn_wavefrontsize: { 4818 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4819 MI.eraseFromParent(); 4820 return true; 4821 } 4822 case Intrinsic::amdgcn_s_buffer_load: 4823 return legalizeSBufferLoad(Helper, MI); 4824 case Intrinsic::amdgcn_raw_buffer_store: 4825 case Intrinsic::amdgcn_struct_buffer_store: 4826 return legalizeBufferStore(MI, MRI, B, false, false); 4827 case Intrinsic::amdgcn_raw_buffer_store_format: 4828 case Intrinsic::amdgcn_struct_buffer_store_format: 4829 return legalizeBufferStore(MI, MRI, B, false, true); 4830 case Intrinsic::amdgcn_raw_tbuffer_store: 4831 case Intrinsic::amdgcn_struct_tbuffer_store: 4832 return legalizeBufferStore(MI, MRI, B, true, true); 4833 case Intrinsic::amdgcn_raw_buffer_load: 4834 case Intrinsic::amdgcn_struct_buffer_load: 4835 return legalizeBufferLoad(MI, MRI, B, false, false); 4836 case Intrinsic::amdgcn_raw_buffer_load_format: 4837 case Intrinsic::amdgcn_struct_buffer_load_format: 4838 return legalizeBufferLoad(MI, MRI, B, true, false); 4839 case Intrinsic::amdgcn_raw_tbuffer_load: 4840 case Intrinsic::amdgcn_struct_tbuffer_load: 4841 return legalizeBufferLoad(MI, MRI, B, true, true); 4842 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4843 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4844 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4845 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4846 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4847 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4848 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4849 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4850 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4851 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4852 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4853 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4854 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4855 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4856 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4857 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4858 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4859 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4860 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4861 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4862 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4863 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4864 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4865 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4866 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4867 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4868 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4869 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4870 case Intrinsic::amdgcn_buffer_atomic_fadd: 4871 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4872 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4873 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4874 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4875 return legalizeBufferAtomic(MI, B, IntrID); 4876 case Intrinsic::amdgcn_atomic_inc: 4877 return legalizeAtomicIncDec(MI, B, true); 4878 case Intrinsic::amdgcn_atomic_dec: 4879 return legalizeAtomicIncDec(MI, B, false); 4880 case Intrinsic::trap: 4881 return legalizeTrapIntrinsic(MI, MRI, B); 4882 case Intrinsic::debugtrap: 4883 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4884 case Intrinsic::amdgcn_rsq_clamp: 4885 return legalizeRsqClampIntrinsic(MI, MRI, B); 4886 case Intrinsic::amdgcn_ds_fadd: 4887 case Intrinsic::amdgcn_ds_fmin: 4888 case Intrinsic::amdgcn_ds_fmax: 4889 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4890 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4891 return legalizeBVHIntrinsic(MI, B); 4892 default: { 4893 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4894 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4895 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4896 return true; 4897 } 4898 } 4899 4900 return true; 4901 } 4902