//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
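
// Worked examples for moreEltsToNext32Bit (illustrative only, not from the
// original source): a <3 x s8> query (24 bits) becomes <4 x s8> (32 bits), and
// a <5 x s16> query (80 bits) becomes <6 x s16> (96 bits). Elements are added
// until the total size reaches the next multiple of 32 bits.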

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
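
// Illustrative summary of the limits above (a sketch, not from the original
// source): with flat scratch disabled a private access is limited to 32 bits;
// an LDS access to 64 bits unless 128-bit DS operations are enabled; and a
// constant/global access to 512 bits for loads but 128 bits for stores.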

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .lowerIfMemSizeNotPow2()
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
      .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

       // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1694 case TargetOpcode::G_BUILD_VECTOR: 1695 return legalizeBuildVector(MI, MRI, B); 1696 default: 1697 return false; 1698 } 1699 1700 llvm_unreachable("expected switch to return"); 1701 } 1702 1703 Register AMDGPULegalizerInfo::getSegmentAperture( 1704 unsigned AS, 1705 MachineRegisterInfo &MRI, 1706 MachineIRBuilder &B) const { 1707 MachineFunction &MF = B.getMF(); 1708 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1709 const LLT S32 = LLT::scalar(32); 1710 1711 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1712 1713 if (ST.hasApertureRegs()) { 1714 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1715 // getreg. 1716 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1717 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1718 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1719 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1721 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1722 unsigned Encoding = 1723 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1724 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1725 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1726 1727 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1728 1729 B.buildInstr(AMDGPU::S_GETREG_B32) 1730 .addDef(GetReg) 1731 .addImm(Encoding); 1732 MRI.setType(GetReg, S32); 1733 1734 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1735 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1736 } 1737 1738 Register QueuePtr = MRI.createGenericVirtualRegister( 1739 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1740 1741 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1742 return Register(); 1743 1744 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1745 // private_segment_aperture_base_hi. 1746 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1747 1748 // TODO: can we be smarter about machine pointer info? 1749 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1750 MachineMemOperand *MMO = MF.getMachineMemOperand( 1751 PtrInfo, 1752 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1753 MachineMemOperand::MOInvariant, 1754 4, commonAlignment(Align(64), StructOffset)); 1755 1756 Register LoadAddr; 1757 1758 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1759 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1760 } 1761 1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1763 MachineInstr &MI, MachineRegisterInfo &MRI, 1764 MachineIRBuilder &B) const { 1765 MachineFunction &MF = B.getMF(); 1766 1767 const LLT S32 = LLT::scalar(32); 1768 Register Dst = MI.getOperand(0).getReg(); 1769 Register Src = MI.getOperand(1).getReg(); 1770 1771 LLT DstTy = MRI.getType(Dst); 1772 LLT SrcTy = MRI.getType(Src); 1773 unsigned DestAS = DstTy.getAddressSpace(); 1774 unsigned SrcAS = SrcTy.getAddressSpace(); 1775 1776 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1777 // vector element. 1778 assert(!DstTy.isVector()); 1779 1780 const AMDGPUTargetMachine &TM 1781 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1782 1783 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1784 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1785 return true; 1786 } 1787 1788 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1789 // Truncate. 
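// A 32-bit constant pointer is just the low half of the full 64-bit
// constant-aperture pointer, so this cast reduces to extracting bits
// [31:0]; the opposite direction below rebuilds the high half from the
// function's known high address bits.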
1790 B.buildExtract(Dst, Src, 0); 1791 MI.eraseFromParent(); 1792 return true; 1793 } 1794 1795 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1796 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1797 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1798 1799 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1800 // another. Merge operands are required to be the same type, but creating an 1801 // extra ptrtoint would be kind of pointless. 1802 auto HighAddr = B.buildConstant( 1803 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1804 B.buildMerge(Dst, {Src, HighAddr}); 1805 MI.eraseFromParent(); 1806 return true; 1807 } 1808 1809 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1810 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1811 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1812 unsigned NullVal = TM.getNullPointerValue(DestAS); 1813 1814 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1815 auto FlatNull = B.buildConstant(SrcTy, 0); 1816 1817 // Extract low 32-bits of the pointer. 1818 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1819 1820 auto CmpRes = 1821 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1822 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1823 1824 MI.eraseFromParent(); 1825 return true; 1826 } 1827 1828 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1829 return false; 1830 1831 if (!ST.hasFlatAddressSpace()) 1832 return false; 1833 1834 auto SegmentNull = 1835 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1836 auto FlatNull = 1837 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1838 1839 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1840 if (!ApertureReg.isValid()) 1841 return false; 1842 1843 auto CmpRes = 1844 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1845 1846 // Coerce the type of the low half of the result so we can use merge_values. 1847 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1848 1849 // TODO: Should we allow mismatched types but matching sizes in merges to 1850 // avoid the ptrtoint? 1851 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1852 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1853 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 bool AMDGPULegalizerInfo::legalizeFrint( 1859 MachineInstr &MI, MachineRegisterInfo &MRI, 1860 MachineIRBuilder &B) const { 1861 Register Src = MI.getOperand(1).getReg(); 1862 LLT Ty = MRI.getType(Src); 1863 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1864 1865 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1866 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1867 1868 auto C1 = B.buildFConstant(Ty, C1Val); 1869 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1870 1871 // TODO: Should this propagate fast-math-flags? 
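// Classic add/subtract-magic-constant rounding: for |src| < 2^52 the sum
// src + copysign(2^52, src) has no fractional bits left, so subtracting
// the same constant back gives src rounded to the nearest integer, e.g.
//   rint(3.7) = (3.7 + 2^52) - 2^52 = 4.0
// Anything with |src| > C2 (just below 2^52) is already an integer and is
// passed through unchanged by the select below.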
1872 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1873 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1874 1875 auto C2 = B.buildFConstant(Ty, C2Val); 1876 auto Fabs = B.buildFAbs(Ty, Src); 1877 1878 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1879 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1880 MI.eraseFromParent(); 1881 return true; 1882 } 1883 1884 bool AMDGPULegalizerInfo::legalizeFceil( 1885 MachineInstr &MI, MachineRegisterInfo &MRI, 1886 MachineIRBuilder &B) const { 1887 1888 const LLT S1 = LLT::scalar(1); 1889 const LLT S64 = LLT::scalar(64); 1890 1891 Register Src = MI.getOperand(1).getReg(); 1892 assert(MRI.getType(Src) == S64); 1893 1894 // result = trunc(src) 1895 // if (src > 0.0 && src != result) 1896 // result += 1.0 1897 1898 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1899 1900 const auto Zero = B.buildFConstant(S64, 0.0); 1901 const auto One = B.buildFConstant(S64, 1.0); 1902 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1903 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1904 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1905 auto Add = B.buildSelect(S64, And, One, Zero); 1906 1907 // TODO: Should this propagate fast-math-flags? 1908 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1909 return true; 1910 } 1911 1912 bool AMDGPULegalizerInfo::legalizeFrem( 1913 MachineInstr &MI, MachineRegisterInfo &MRI, 1914 MachineIRBuilder &B) const { 1915 Register DstReg = MI.getOperand(0).getReg(); 1916 Register Src0Reg = MI.getOperand(1).getReg(); 1917 Register Src1Reg = MI.getOperand(2).getReg(); 1918 auto Flags = MI.getFlags(); 1919 LLT Ty = MRI.getType(DstReg); 1920 1921 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1922 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1923 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1924 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1925 MI.eraseFromParent(); 1926 return true; 1927 } 1928 1929 static MachineInstrBuilder extractF64Exponent(Register Hi, 1930 MachineIRBuilder &B) { 1931 const unsigned FractBits = 52; 1932 const unsigned ExpBits = 11; 1933 LLT S32 = LLT::scalar(32); 1934 1935 auto Const0 = B.buildConstant(S32, FractBits - 32); 1936 auto Const1 = B.buildConstant(S32, ExpBits); 1937 1938 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1939 .addUse(Hi) 1940 .addUse(Const0.getReg(0)) 1941 .addUse(Const1.getReg(0)); 1942 1943 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 const LLT S1 = LLT::scalar(1); 1950 const LLT S32 = LLT::scalar(32); 1951 const LLT S64 = LLT::scalar(64); 1952 1953 Register Src = MI.getOperand(1).getReg(); 1954 assert(MRI.getType(Src) == S64); 1955 1956 // TODO: Should this use extract since the low half is unused? 1957 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1958 Register Hi = Unmerge.getReg(1); 1959 1960 // Extract the upper half, since this is where we will find the sign and 1961 // exponent. 1962 auto Exp = extractF64Exponent(Hi, B); 1963 1964 const unsigned FractBits = 52; 1965 1966 // Extract the sign bit. 1967 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1968 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1969 1970 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1971 1972 const auto Zero32 = B.buildConstant(S32, 0); 1973 1974 // Extend back to 64-bits. 
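// Integer-only trunc for f64: clear the low (52 - exp) fraction bits of
// the source. exp < 0 means |src| < 1, so the result is just the sign bit
// (+/-0); exp > 51 means src already has no fractional part.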
1975 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1976 1977 auto Shr = B.buildAShr(S64, FractMask, Exp); 1978 auto Not = B.buildNot(S64, Shr); 1979 auto Tmp0 = B.buildAnd(S64, Src, Not); 1980 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1981 1982 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1983 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1984 1985 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1986 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1987 MI.eraseFromParent(); 1988 return true; 1989 } 1990 1991 bool AMDGPULegalizerInfo::legalizeITOFP( 1992 MachineInstr &MI, MachineRegisterInfo &MRI, 1993 MachineIRBuilder &B, bool Signed) const { 1994 1995 Register Dst = MI.getOperand(0).getReg(); 1996 Register Src = MI.getOperand(1).getReg(); 1997 1998 const LLT S64 = LLT::scalar(64); 1999 const LLT S32 = LLT::scalar(32); 2000 2001 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2002 2003 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2004 2005 auto CvtHi = Signed ? 2006 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2007 B.buildUITOFP(S64, Unmerge.getReg(1)); 2008 2009 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2010 2011 auto ThirtyTwo = B.buildConstant(S32, 32); 2012 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2013 .addUse(CvtHi.getReg(0)) 2014 .addUse(ThirtyTwo.getReg(0)); 2015 2016 // TODO: Should this propagate fast-math-flags? 2017 B.buildFAdd(Dst, LdExp, CvtLo); 2018 MI.eraseFromParent(); 2019 return true; 2020 } 2021 2022 // TODO: Copied from DAG implementation. Verify logic and document how this 2023 // actually works. 2024 bool AMDGPULegalizerInfo::legalizeFPTOI( 2025 MachineInstr &MI, MachineRegisterInfo &MRI, 2026 MachineIRBuilder &B, bool Signed) const { 2027 2028 Register Dst = MI.getOperand(0).getReg(); 2029 Register Src = MI.getOperand(1).getReg(); 2030 2031 const LLT S64 = LLT::scalar(64); 2032 const LLT S32 = LLT::scalar(32); 2033 2034 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2035 2036 unsigned Flags = MI.getFlags(); 2037 2038 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2039 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2040 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2041 2042 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2043 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2044 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2045 2046 auto Hi = Signed ? 
2047 B.buildFPTOSI(S32, FloorMul) : 2048 B.buildFPTOUI(S32, FloorMul); 2049 auto Lo = B.buildFPTOUI(S32, Fma); 2050 2051 B.buildMerge(Dst, { Lo, Hi }); 2052 MI.eraseFromParent(); 2053 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2058 MachineInstr &MI) const { 2059 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2060 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2061 2062 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2063 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2064 2065 // With ieee_mode disabled, the instructions have the correct behavior 2066 // already for G_FMINNUM/G_FMAXNUM 2067 if (!MFI->getMode().IEEE) 2068 return !IsIEEEOp; 2069 2070 if (IsIEEEOp) 2071 return true; 2072 2073 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2074 } 2075 2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2077 MachineInstr &MI, MachineRegisterInfo &MRI, 2078 MachineIRBuilder &B) const { 2079 // TODO: Should move some of this into LegalizerHelper. 2080 2081 // TODO: Promote dynamic indexing of s16 to s32 2082 2083 // FIXME: Artifact combiner probably should have replaced the truncated 2084 // constant before this, so we shouldn't need 2085 // getConstantVRegValWithLookThrough. 2086 Optional<ValueAndVReg> MaybeIdxVal = 2087 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2088 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2089 return true; 2090 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2091 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Vec = MI.getOperand(1).getReg(); 2094 2095 LLT VecTy = MRI.getType(Vec); 2096 LLT EltTy = VecTy.getElementType(); 2097 assert(EltTy == MRI.getType(Dst)); 2098 2099 if (IdxVal < VecTy.getNumElements()) 2100 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2101 else 2102 B.buildUndef(Dst); 2103 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2109 MachineInstr &MI, MachineRegisterInfo &MRI, 2110 MachineIRBuilder &B) const { 2111 // TODO: Should move some of this into LegalizerHelper. 2112 2113 // TODO: Promote dynamic indexing of s16 to s32 2114 2115 // FIXME: Artifact combiner probably should have replaced the truncated 2116 // constant before this, so we shouldn't need 2117 // getConstantVRegValWithLookThrough. 2118 Optional<ValueAndVReg> MaybeIdxVal = 2119 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2120 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2121 return true; 2122 2123 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2124 Register Dst = MI.getOperand(0).getReg(); 2125 Register Vec = MI.getOperand(1).getReg(); 2126 Register Ins = MI.getOperand(2).getReg(); 2127 2128 LLT VecTy = MRI.getType(Vec); 2129 LLT EltTy = VecTy.getElementType(); 2130 assert(EltTy == MRI.getType(Ins)); 2131 2132 if (IdxVal < VecTy.getNumElements()) 2133 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2134 else 2135 B.buildUndef(Dst); 2136 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2142 MachineInstr &MI, MachineRegisterInfo &MRI, 2143 MachineIRBuilder &B) const { 2144 const LLT V2S16 = LLT::vector(2, 16); 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src0 = MI.getOperand(1).getReg(); 2148 LLT DstTy = MRI.getType(Dst); 2149 LLT SrcTy = MRI.getType(Src0); 2150 2151 if (SrcTy == V2S16 && DstTy == V2S16 && 2152 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2153 return true; 2154 2155 MachineIRBuilder HelperBuilder(MI); 2156 GISelObserverWrapper DummyObserver; 2157 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2158 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2159 } 2160 2161 bool AMDGPULegalizerInfo::legalizeSinCos( 2162 MachineInstr &MI, MachineRegisterInfo &MRI, 2163 MachineIRBuilder &B) const { 2164 2165 Register DstReg = MI.getOperand(0).getReg(); 2166 Register SrcReg = MI.getOperand(1).getReg(); 2167 LLT Ty = MRI.getType(DstReg); 2168 unsigned Flags = MI.getFlags(); 2169 2170 Register TrigVal; 2171 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2172 if (ST.hasTrigReducedRange()) { 2173 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2174 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2175 .addUse(MulVal.getReg(0)) 2176 .setMIFlags(Flags).getReg(0); 2177 } else 2178 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2179 2180 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2181 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2182 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2183 .addUse(TrigVal) 2184 .setMIFlags(Flags); 2185 MI.eraseFromParent(); 2186 return true; 2187 } 2188 2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2190 MachineIRBuilder &B, 2191 const GlobalValue *GV, 2192 int64_t Offset, 2193 unsigned GAFlags) const { 2194 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2195 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2196 // to the following code sequence: 2197 // 2198 // For constant address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol 2201 // s_addc_u32 s1, s1, 0 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // a fixup or relocation is emitted to replace $symbol with a literal 2205 // constant, which is a pc-relative offset from the encoding of the $symbol 2206 // operand to the global variable. 
2207 // 2208 // For global address space: 2209 // s_getpc_b64 s[0:1] 2210 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2211 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2212 // 2213 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2214 // fixups or relocations are emitted to replace $symbol@*@lo and 2215 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2216 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2217 // operand to the global variable. 2218 // 2219 // What we want here is an offset from the value returned by s_getpc 2220 // (which is the address of the s_add_u32 instruction) to the global 2221 // variable, but since the encoding of $symbol starts 4 bytes after the start 2222 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2223 // small. This requires us to add 4 to the global variable offset in order to 2224 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2225 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2226 // instruction. 2227 2228 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2229 2230 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2232 2233 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2234 .addDef(PCReg); 2235 2236 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2237 if (GAFlags == SIInstrInfo::MO_NONE) 2238 MIB.addImm(0); 2239 else 2240 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2241 2242 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2243 2244 if (PtrTy.getSizeInBits() == 32) 2245 B.buildExtract(DstReg, PCReg, 0); 2246 return true; 2247 } 2248 2249 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2250 MachineInstr &MI, MachineRegisterInfo &MRI, 2251 MachineIRBuilder &B) const { 2252 Register DstReg = MI.getOperand(0).getReg(); 2253 LLT Ty = MRI.getType(DstReg); 2254 unsigned AS = Ty.getAddressSpace(); 2255 2256 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2257 MachineFunction &MF = B.getMF(); 2258 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2259 2260 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2261 if (!MFI->isModuleEntryFunction()) { 2262 const Function &Fn = MF.getFunction(); 2263 DiagnosticInfoUnsupported BadLDSDecl( 2264 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2265 DS_Warning); 2266 Fn.getContext().diagnose(BadLDSDecl); 2267 2268 // We currently don't have a way to correctly allocate LDS objects that 2269 // aren't directly associated with a kernel. We do force inlining of 2270 // functions that use local objects. However, if these dead functions are 2271 // not eliminated, we don't want a compile time error. Just emit a warning 2272 // and a trap, since there should be no callable path here. 2273 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2274 B.buildUndef(DstReg); 2275 MI.eraseFromParent(); 2276 return true; 2277 } 2278 2279 // TODO: We could emit code to handle the initialization somewhere. 
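// An LDS global without a defined initializer is lowered to its static
// offset within the kernel's LDS block (or, for the zero-sized dynamic
// shared memory case handled below, to the total static LDS size, which
// is where the dynamic portion begins).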
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2281 const SITargetLowering *TLI = ST.getTargetLowering(); 2282 if (!TLI->shouldUseLDSConstAddress(GV)) { 2283 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2284 return true; // Leave in place. 2285 } 2286 2287 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2288 Type *Ty = GV->getValueType(); 2289 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2290 // zero-sized type in other languages to declare dynamic shared 2291 // memory whose size is not known at compile time. These arrays are 2292 // allocated by the runtime and placed directly after the statically 2293 // allocated ones, so they all share the same offset. 2294 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2295 // Adjust alignment for that dynamic shared memory array. 2296 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2297 LLT S32 = LLT::scalar(32); 2298 auto Sz = 2299 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2300 B.buildIntToPtr(DstReg, Sz); 2301 MI.eraseFromParent(); 2302 return true; 2303 } 2304 } 2305 2306 B.buildConstant( 2307 DstReg, 2308 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2309 MI.eraseFromParent(); 2310 return true; 2311 } 2312 2313 const Function &Fn = MF.getFunction(); 2314 DiagnosticInfoUnsupported BadInit( 2315 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2316 Fn.getContext().diagnose(BadInit); 2317 return true; 2318 } 2319 2320 const SITargetLowering *TLI = ST.getTargetLowering(); 2321 2322 if (TLI->shouldEmitFixup(GV)) { 2323 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2324 MI.eraseFromParent(); 2325 return true; 2326 } 2327 2328 if (TLI->shouldEmitPCReloc(GV)) { 2329 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2330 MI.eraseFromParent(); 2331 return true; 2332 } 2333 2334 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2335 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2336 2337 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2338 MachinePointerInfo::getGOT(MF), 2339 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2340 MachineMemOperand::MOInvariant, 2341 8 /*Size*/, Align(8)); 2342 2343 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2344 2345 if (Ty.getSizeInBits() == 32) { 2346 // Truncate if this is a 32-bit constant address.
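// The GOT entry is always a full 64-bit constant-address pointer, so load
// it at that width first and then extract the low 32 bits into the
// narrower destination.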
2347 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2348 B.buildExtract(DstReg, Load, 0); 2349 } else 2350 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2351 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 static LLT widenToNextPowerOf2(LLT Ty) { 2357 if (Ty.isVector()) 2358 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2359 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2360 } 2361 2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2363 MachineInstr &MI) const { 2364 MachineIRBuilder &B = Helper.MIRBuilder; 2365 MachineRegisterInfo &MRI = *B.getMRI(); 2366 GISelChangeObserver &Observer = Helper.Observer; 2367 2368 Register PtrReg = MI.getOperand(1).getReg(); 2369 LLT PtrTy = MRI.getType(PtrReg); 2370 unsigned AddrSpace = PtrTy.getAddressSpace(); 2371 2372 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2373 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2374 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2375 Observer.changingInstr(MI); 2376 MI.getOperand(1).setReg(Cast.getReg(0)); 2377 Observer.changedInstr(MI); 2378 return true; 2379 } 2380 2381 Register ValReg = MI.getOperand(0).getReg(); 2382 LLT ValTy = MRI.getType(ValReg); 2383 2384 MachineMemOperand *MMO = *MI.memoperands_begin(); 2385 const unsigned ValSize = ValTy.getSizeInBits(); 2386 const unsigned MemSize = 8 * MMO->getSize(); 2387 const Align MemAlign = MMO->getAlign(); 2388 const unsigned AlignInBits = 8 * MemAlign.value(); 2389 2390 // Widen non-power-of-2 loads to the alignment if needed 2391 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2392 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2393 2394 // This was already the correct extending load result type, so just adjust 2395 // the memory type. 2396 if (WideMemSize == ValSize) { 2397 MachineFunction &MF = B.getMF(); 2398 2399 MachineMemOperand *WideMMO = 2400 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2401 Observer.changingInstr(MI); 2402 MI.setMemRefs(MF, {WideMMO}); 2403 Observer.changedInstr(MI); 2404 return true; 2405 } 2406 2407 // Don't bother handling edge case that should probably never be produced. 2408 if (ValSize > WideMemSize) 2409 return false; 2410 2411 LLT WideTy = widenToNextPowerOf2(ValTy); 2412 2413 Register WideLoad; 2414 if (!WideTy.isVector()) { 2415 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2416 B.buildTrunc(ValReg, WideLoad).getReg(0); 2417 } else { 2418 // Extract the subvector. 2419 2420 if (isRegisterType(ValTy)) { 2421 // If this a case where G_EXTRACT is legal, use it. 2422 // (e.g. <3 x s32> -> <4 x s32>) 2423 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2424 B.buildExtract(ValReg, WideLoad, 0); 2425 } else { 2426 // For cases where the widened type isn't a nice register value, unmerge 2427 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2428 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2429 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2430 B.setInsertPt(B.getMBB(), MI.getIterator()); 2431 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2432 } 2433 } 2434 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 return false; 2440 } 2441 2442 bool AMDGPULegalizerInfo::legalizeFMad( 2443 MachineInstr &MI, MachineRegisterInfo &MRI, 2444 MachineIRBuilder &B) const { 2445 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2446 assert(Ty.isScalar()); 2447 2448 MachineFunction &MF = B.getMF(); 2449 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2450 2451 // TODO: Always legal with future ftz flag. 2452 // FIXME: Do we need just output? 2453 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2454 return true; 2455 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2456 return true; 2457 2458 MachineIRBuilder HelperBuilder(MI); 2459 GISelObserverWrapper DummyObserver; 2460 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2461 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2462 } 2463 2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2465 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2466 Register DstReg = MI.getOperand(0).getReg(); 2467 Register PtrReg = MI.getOperand(1).getReg(); 2468 Register CmpVal = MI.getOperand(2).getReg(); 2469 Register NewVal = MI.getOperand(3).getReg(); 2470 2471 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2472 "this should not have been custom lowered"); 2473 2474 LLT ValTy = MRI.getType(CmpVal); 2475 LLT VecTy = LLT::vector(2, ValTy); 2476 2477 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2478 2479 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2480 .addDef(DstReg) 2481 .addUse(PtrReg) 2482 .addUse(PackedVal) 2483 .setMemRefs(MI.memoperands()); 2484 2485 MI.eraseFromParent(); 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizeFlog( 2490 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2491 Register Dst = MI.getOperand(0).getReg(); 2492 Register Src = MI.getOperand(1).getReg(); 2493 LLT Ty = B.getMRI()->getType(Dst); 2494 unsigned Flags = MI.getFlags(); 2495 2496 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2497 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2498 2499 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2505 MachineIRBuilder &B) const { 2506 Register Dst = MI.getOperand(0).getReg(); 2507 Register Src = MI.getOperand(1).getReg(); 2508 unsigned Flags = MI.getFlags(); 2509 LLT Ty = B.getMRI()->getType(Dst); 2510 2511 auto K = B.buildFConstant(Ty, numbers::log2e); 2512 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2513 B.buildFExp2(Dst, Mul, Flags); 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2519 MachineIRBuilder &B) const { 2520 Register Dst = MI.getOperand(0).getReg(); 2521 Register Src0 = MI.getOperand(1).getReg(); 2522 Register Src1 = MI.getOperand(2).getReg(); 2523 unsigned Flags = MI.getFlags(); 2524 LLT Ty = B.getMRI()->getType(Dst); 2525 const LLT S16 = LLT::scalar(16); 2526 const LLT S32 = LLT::scalar(32); 2527 2528 if (Ty == S32) { 2529 auto Log = B.buildFLog2(S32, Src0, Flags); 2530 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2531 .addUse(Log.getReg(0)) 2532 .addUse(Src1) 2533 .setMIFlags(Flags); 2534 B.buildFExp2(Dst, Mul, Flags); 2535 } else if (Ty == S16) { 2536 // There's no f16 fmul_legacy, so we need to convert for it. 2537 auto Log = B.buildFLog2(S16, Src0, Flags); 2538 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2539 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2540 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2541 .addUse(Ext0.getReg(0)) 2542 .addUse(Ext1.getReg(0)) 2543 .setMIFlags(Flags); 2544 2545 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2546 } else 2547 return false; 2548 2549 MI.eraseFromParent(); 2550 return true; 2551 } 2552 2553 // Find a source register, ignoring any possible source modifiers. 2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2555 Register ModSrc = OrigSrc; 2556 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2557 ModSrc = SrcFNeg->getOperand(1).getReg(); 2558 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2559 ModSrc = SrcFAbs->getOperand(1).getReg(); 2560 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2561 ModSrc = SrcFAbs->getOperand(1).getReg(); 2562 return ModSrc; 2563 } 2564 2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2566 MachineRegisterInfo &MRI, 2567 MachineIRBuilder &B) const { 2568 2569 const LLT S1 = LLT::scalar(1); 2570 const LLT S64 = LLT::scalar(64); 2571 Register Dst = MI.getOperand(0).getReg(); 2572 Register OrigSrc = MI.getOperand(1).getReg(); 2573 unsigned Flags = MI.getFlags(); 2574 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2575 "this should not have been custom lowered"); 2576 2577 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2578 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2579 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2580 // V_FRACT bug is: 2581 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2582 // 2583 // Convert floor(x) to (x - fract(x)) 2584 2585 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2586 .addUse(OrigSrc) 2587 .setMIFlags(Flags); 2588 2589 // Give source modifier matching some assistance before obscuring a foldable 2590 // pattern. 2591 2592 // TODO: We can avoid the neg on the fract? The input sign to fract 2593 // shouldn't matter? 2594 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2595 2596 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2597 2598 Register Min = MRI.createGenericVirtualRegister(S64); 2599 2600 // We don't need to concern ourselves with the snan handling difference, so 2601 // use the one which will directly select. 
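// The clamp constant 0x3fefffffffffffff is the largest f64 strictly below
// 1.0, i.e. the 0.99999999999999999 bound from the workaround formula
// above.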
2602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2603 if (MFI->getMode().IEEE) 2604 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2605 else 2606 B.buildFMinNum(Min, Fract, Const, Flags); 2607 2608 Register CorrectedFract = Min; 2609 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2610 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2611 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2612 } 2613 2614 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2615 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2616 2617 MI.eraseFromParent(); 2618 return true; 2619 } 2620 2621 // Turn an illegal packed v2s16 build vector into bit operations. 2622 // TODO: This should probably be a bitcast action in LegalizerHelper. 2623 bool AMDGPULegalizerInfo::legalizeBuildVector( 2624 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2625 Register Dst = MI.getOperand(0).getReg(); 2626 const LLT S32 = LLT::scalar(32); 2627 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2628 2629 Register Src0 = MI.getOperand(1).getReg(); 2630 Register Src1 = MI.getOperand(2).getReg(); 2631 assert(MRI.getType(Src0) == LLT::scalar(16)); 2632 2633 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2634 B.buildBitcast(Dst, Merge); 2635 2636 MI.eraseFromParent(); 2637 return true; 2638 } 2639 2640 // Check that this is a G_XOR x, -1 2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2642 if (MI.getOpcode() != TargetOpcode::G_XOR) 2643 return false; 2644 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2645 return ConstVal && *ConstVal == -1; 2646 } 2647 2648 // Return the use branch instruction, otherwise null if the usage is invalid. 2649 static MachineInstr * 2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2651 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2652 Register CondDef = MI.getOperand(0).getReg(); 2653 if (!MRI.hasOneNonDBGUse(CondDef)) 2654 return nullptr; 2655 2656 MachineBasicBlock *Parent = MI.getParent(); 2657 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2658 2659 if (isNot(MRI, *UseMI)) { 2660 Register NegatedCond = UseMI->getOperand(0).getReg(); 2661 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2662 return nullptr; 2663 2664 // We're deleting the def of this value, so we need to remove it. 2665 UseMI->eraseFromParent(); 2666 2667 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2668 Negated = true; 2669 } 2670 2671 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2672 return nullptr; 2673 2674 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2675 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2676 if (Next == Parent->end()) { 2677 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2678 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2679 return nullptr; 2680 UncondBrTarget = &*NextMBB; 2681 } else { 2682 if (Next->getOpcode() != AMDGPU::G_BR) 2683 return nullptr; 2684 Br = &*Next; 2685 UncondBrTarget = Br->getOperand(0).getMBB(); 2686 } 2687 2688 return UseMI; 2689 } 2690 2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2692 const ArgDescriptor *Arg, 2693 const TargetRegisterClass *ArgRC, 2694 LLT ArgTy) const { 2695 MCRegister SrcReg = Arg->getRegister(); 2696 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2697 assert(DstReg.isVirtual() && "Virtual register expected"); 2698 2699 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2700 ArgTy); 2701 if (Arg->isMasked()) { 2702 // TODO: Should we try to emit this once in the entry block? 2703 const LLT S32 = LLT::scalar(32); 2704 const unsigned Mask = Arg->getMask(); 2705 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2706 2707 Register AndMaskSrc = LiveIn; 2708 2709 if (Shift != 0) { 2710 auto ShiftAmt = B.buildConstant(S32, Shift); 2711 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2712 } 2713 2714 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2715 } else { 2716 B.buildCopy(DstReg, LiveIn); 2717 } 2718 2719 return true; 2720 } 2721 2722 bool AMDGPULegalizerInfo::loadInputValue( 2723 Register DstReg, MachineIRBuilder &B, 2724 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2725 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2726 const ArgDescriptor *Arg; 2727 const TargetRegisterClass *ArgRC; 2728 LLT ArgTy; 2729 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2730 2731 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2732 return false; // TODO: Handle these 2733 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2734 } 2735 2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2737 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2738 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2739 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2740 return false; 2741 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2747 MachineRegisterInfo &MRI, 2748 MachineIRBuilder &B) const { 2749 Register Dst = MI.getOperand(0).getReg(); 2750 LLT DstTy = MRI.getType(Dst); 2751 LLT S16 = LLT::scalar(16); 2752 LLT S32 = LLT::scalar(32); 2753 LLT S64 = LLT::scalar(64); 2754 2755 if (DstTy == S16) 2756 return legalizeFDIV16(MI, MRI, B); 2757 if (DstTy == S32) 2758 return legalizeFDIV32(MI, MRI, B); 2759 if (DstTy == S64) 2760 return legalizeFDIV64(MI, MRI, B); 2761 2762 return false; 2763 } 2764 2765 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2766 Register DstReg, 2767 Register X, 2768 Register Y, 2769 bool IsDiv) const { 2770 const LLT S1 = LLT::scalar(1); 2771 const LLT S32 = LLT::scalar(32); 2772 2773 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2774 // algorithm used here. 2775 2776 // Initial estimate of inv(y). 2777 auto FloatY = B.buildUITOFP(S32, Y); 2778 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2779 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2780 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2781 auto Z = B.buildFPTOUI(S32, ScaledY); 2782 2783 // One round of UNR. 
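// (unsigned Newton-Raphson refinement of z ~= 2^32 / y):
//   z' = z + umulh(z, -y * z)
// which roughly doubles the number of correct bits in the estimate.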
2784 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2785 auto NegYZ = B.buildMul(S32, NegY, Z); 2786 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2787 2788 // Quotient/remainder estimate. 2789 auto Q = B.buildUMulH(S32, X, Z); 2790 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2791 2792 // First quotient/remainder refinement. 2793 auto One = B.buildConstant(S32, 1); 2794 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2795 if (IsDiv) 2796 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2797 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2798 2799 // Second quotient/remainder refinement. 2800 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2801 if (IsDiv) 2802 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2803 else 2804 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2805 } 2806 2807 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2808 MachineRegisterInfo &MRI, 2809 MachineIRBuilder &B) const { 2810 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2811 Register DstReg = MI.getOperand(0).getReg(); 2812 Register Num = MI.getOperand(1).getReg(); 2813 Register Den = MI.getOperand(2).getReg(); 2814 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2815 MI.eraseFromParent(); 2816 return true; 2817 } 2818 2819 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2820 // 2821 // Return lo, hi of result 2822 // 2823 // %cvt.lo = G_UITOFP Val.lo 2824 // %cvt.hi = G_UITOFP Val.hi 2825 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2826 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2827 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2828 // %mul2 = G_FMUL %mul1, 2**(-32) 2829 // %trunc = G_INTRINSIC_TRUNC %mul2 2830 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2831 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2832 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2833 Register Val) { 2834 const LLT S32 = LLT::scalar(32); 2835 auto Unmerge = B.buildUnmerge(S32, Val); 2836 2837 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2838 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2839 2840 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2841 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2842 2843 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2844 auto Mul1 = 2845 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2846 2847 // 2**(-32) 2848 auto Mul2 = 2849 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2850 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2851 2852 // -(2**32) 2853 auto Mad2 = B.buildFMAD(S32, Trunc, 2854 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2855 2856 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2857 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2858 2859 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2860 } 2861 2862 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2863 Register DstReg, 2864 Register Numer, 2865 Register Denom, 2866 bool IsDiv) const { 2867 const LLT S32 = LLT::scalar(32); 2868 const LLT S64 = LLT::scalar(64); 2869 const LLT S1 = LLT::scalar(1); 2870 Register RcpLo, RcpHi; 2871 2872 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2873 2874 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2875 2876 auto Zero64 = B.buildConstant(S64, 0); 2877 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2878 2879 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2880 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2881 2882 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2883 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2884 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2885 2886 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2887 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2888 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2889 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2890 2891 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2892 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2893 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2894 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2895 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2896 2897 auto Zero32 = B.buildConstant(S32, 0); 2898 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2899 auto Add2_HiC = 2900 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2901 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2902 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2903 2904 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2905 Register NumerLo = UnmergeNumer.getReg(0); 2906 Register NumerHi = UnmergeNumer.getReg(1); 2907 2908 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2909 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2910 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2911 Register Mul3_Lo = UnmergeMul3.getReg(0); 2912 Register Mul3_Hi = UnmergeMul3.getReg(1); 2913 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2914 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2915 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2916 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2917 2918 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2919 Register DenomLo = UnmergeDenom.getReg(0); 2920 Register DenomHi = UnmergeDenom.getReg(1); 2921 2922 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2923 auto C1 = B.buildSExt(S32, CmpHi); 2924 2925 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2926 auto C2 = B.buildSExt(S32, CmpLo); 2927 2928 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2929 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2930 2931 // TODO: Here and below portions of the code can be enclosed into if/endif. 2932 // Currently control flow is unconditional and we have 4 selects after 2933 // potential endif to substitute PHIs. 2934 2935 // if C3 != 0 ... 
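// C3 is a sign-extended (all-ones / zero) flag for "remainder >=
// denominator", i.e. the quotient estimate is still one too small and
// another subtraction of the denominator is needed; C6 below plays the
// same role for the second correction step.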
2936 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2937 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2938 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2939 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2940 2941 auto One64 = B.buildConstant(S64, 1); 2942 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2943 2944 auto C4 = 2945 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2946 auto C5 = 2947 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2948 auto C6 = B.buildSelect( 2949 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2950 2951 // if (C6 != 0) 2952 auto Add4 = B.buildAdd(S64, Add3, One64); 2953 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2954 2955 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2956 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2957 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2958 2959 // endif C6 2960 // endif C3 2961 2962 if (IsDiv) { 2963 auto Sel1 = B.buildSelect( 2964 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2965 B.buildSelect(DstReg, 2966 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2967 } else { 2968 auto Sel2 = B.buildSelect( 2969 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2970 B.buildSelect(DstReg, 2971 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2972 } 2973 } 2974 2975 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2976 MachineRegisterInfo &MRI, 2977 MachineIRBuilder &B) const { 2978 const LLT S64 = LLT::scalar(64); 2979 const LLT S32 = LLT::scalar(32); 2980 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2981 Register DstReg = MI.getOperand(0).getReg(); 2982 Register Num = MI.getOperand(1).getReg(); 2983 Register Den = MI.getOperand(2).getReg(); 2984 LLT Ty = MRI.getType(DstReg); 2985 2986 if (Ty == S32) 2987 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2988 else if (Ty == S64) 2989 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2990 else 2991 return false; 2992 2993 MI.eraseFromParent(); 2994 return true; 2995 2996 } 2997 2998 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2999 MachineRegisterInfo &MRI, 3000 MachineIRBuilder &B) const { 3001 const LLT S64 = LLT::scalar(64); 3002 const LLT S32 = LLT::scalar(32); 3003 3004 Register DstReg = MI.getOperand(0).getReg(); 3005 const LLT Ty = MRI.getType(DstReg); 3006 if (Ty != S32 && Ty != S64) 3007 return false; 3008 3009 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3010 3011 Register LHS = MI.getOperand(1).getReg(); 3012 Register RHS = MI.getOperand(2).getReg(); 3013 3014 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3015 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3016 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3017 3018 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3019 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3020 3021 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3022 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3023 3024 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3025 if (Ty == S32) 3026 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3027 else 3028 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3029 3030 Register Sign; 3031 if (IsDiv) 3032 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3033 else 3034 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3035 3036 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3037 B.buildSub(DstReg, UDivRem, Sign); 3038 3039 MI.eraseFromParent(); 3040 return true; 3041 } 3042 3043 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3044 MachineRegisterInfo &MRI, 3045 MachineIRBuilder &B) const { 3046 Register Res = MI.getOperand(0).getReg(); 3047 Register LHS = MI.getOperand(1).getReg(); 3048 Register RHS = MI.getOperand(2).getReg(); 3049 uint16_t Flags = MI.getFlags(); 3050 LLT ResTy = MRI.getType(Res); 3051 3052 const MachineFunction &MF = B.getMF(); 3053 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3054 MI.getFlag(MachineInstr::FmAfn); 3055 3056 if (!AllowInaccurateRcp) 3057 return false; 3058 3059 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3060 // 1 / x -> RCP(x) 3061 if (CLHS->isExactlyValue(1.0)) { 3062 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3063 .addUse(RHS) 3064 .setMIFlags(Flags); 3065 3066 MI.eraseFromParent(); 3067 return true; 3068 } 3069 3070 // -1 / x -> RCP( FNEG(x) ) 3071 if (CLHS->isExactlyValue(-1.0)) { 3072 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3073 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3074 .addUse(FNeg.getReg(0)) 3075 .setMIFlags(Flags); 3076 3077 MI.eraseFromParent(); 3078 return true; 3079 } 3080 } 3081 3082 // x / y -> x * (1.0 / y) 3083 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3084 .addUse(RHS) 3085 .setMIFlags(Flags); 3086 B.buildFMul(Res, LHS, RCP, Flags); 3087 3088 MI.eraseFromParent(); 3089 return true; 3090 } 3091 3092 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3093 MachineRegisterInfo &MRI, 3094 MachineIRBuilder &B) const { 3095 Register Res = MI.getOperand(0).getReg(); 3096 Register X = MI.getOperand(1).getReg(); 3097 Register Y = MI.getOperand(2).getReg(); 3098 uint16_t Flags = MI.getFlags(); 3099 LLT ResTy = MRI.getType(Res); 3100 3101 const MachineFunction &MF = B.getMF(); 3102 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3103 MI.getFlag(MachineInstr::FmAfn); 3104 3105 if (!AllowInaccurateRcp) 3106 return false; 3107 3108 auto NegY = B.buildFNeg(ResTy, Y); 3109 auto One = B.buildFConstant(ResTy, 1.0); 3110 3111 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3112 .addUse(Y) 3113 .setMIFlags(Flags); 3114 3115 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3116 R = B.buildFMA(ResTy, Tmp0, R, R); 3117 3118 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3119 R = B.buildFMA(ResTy, Tmp1, R, R); 3120 3121 auto Ret = B.buildFMul(ResTy, X, R); 3122 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3123 3124 B.buildFMA(Res, Tmp2, R, Ret); 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B) const { 3132 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3133 return true; 3134 3135 Register Res = MI.getOperand(0).getReg(); 3136 Register LHS = MI.getOperand(1).getReg(); 3137 Register RHS = MI.getOperand(2).getReg(); 3138 3139 uint16_t Flags = MI.getFlags(); 3140 3141 LLT S16 = LLT::scalar(16); 3142 LLT S32 = LLT::scalar(32); 3143 3144 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3145 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3146 3147 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3148 .addUse(RHSExt.getReg(0)) 3149 .setMIFlags(Flags); 3150 3151 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3152 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3153 3154 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3155 
.addUse(RDst.getReg(0)) 3156 .addUse(RHS) 3157 .addUse(LHS) 3158 .setMIFlags(Flags); 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3165 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3166 static void toggleSPDenormMode(bool Enable, 3167 MachineIRBuilder &B, 3168 const GCNSubtarget &ST, 3169 AMDGPU::SIModeRegisterDefaults Mode) { 3170 // Set SP denorm mode to this value. 3171 unsigned SPDenormMode = 3172 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3173 3174 if (ST.hasDenormModeInst()) { 3175 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3176 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3177 3178 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3179 B.buildInstr(AMDGPU::S_DENORM_MODE) 3180 .addImm(NewDenormModeValue); 3181 3182 } else { 3183 // Select FP32 bit field in mode register. 3184 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3185 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3186 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3187 3188 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3189 .addImm(SPDenormMode) 3190 .addImm(SPDenormModeBitField); 3191 } 3192 } 3193 3194 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3195 MachineRegisterInfo &MRI, 3196 MachineIRBuilder &B) const { 3197 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3198 return true; 3199 3200 Register Res = MI.getOperand(0).getReg(); 3201 Register LHS = MI.getOperand(1).getReg(); 3202 Register RHS = MI.getOperand(2).getReg(); 3203 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3204 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3205 3206 uint16_t Flags = MI.getFlags(); 3207 3208 LLT S32 = LLT::scalar(32); 3209 LLT S1 = LLT::scalar(1); 3210 3211 auto One = B.buildFConstant(S32, 1.0f); 3212 3213 auto DenominatorScaled = 3214 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3215 .addUse(LHS) 3216 .addUse(RHS) 3217 .addImm(0) 3218 .setMIFlags(Flags); 3219 auto NumeratorScaled = 3220 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3221 .addUse(LHS) 3222 .addUse(RHS) 3223 .addImm(1) 3224 .setMIFlags(Flags); 3225 3226 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3227 .addUse(DenominatorScaled.getReg(0)) 3228 .setMIFlags(Flags); 3229 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3230 3231 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3232 // aren't modeled as reading it. 
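// The scaled FMA sequence below relies on FP32 denormals for accuracy, so
// when the function runs with FP32 denormals flushed we temporarily enable
// them and restore the original mode afterwards.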
3233 if (!Mode.allFP32Denormals()) 3234 toggleSPDenormMode(true, B, ST, Mode); 3235 3236 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3237 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3238 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3239 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3240 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3241 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3242 3243 if (!Mode.allFP32Denormals()) 3244 toggleSPDenormMode(false, B, ST, Mode); 3245 3246 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3247 .addUse(Fma4.getReg(0)) 3248 .addUse(Fma1.getReg(0)) 3249 .addUse(Fma3.getReg(0)) 3250 .addUse(NumeratorScaled.getReg(1)) 3251 .setMIFlags(Flags); 3252 3253 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3254 .addUse(Fmas.getReg(0)) 3255 .addUse(RHS) 3256 .addUse(LHS) 3257 .setMIFlags(Flags); 3258 3259 MI.eraseFromParent(); 3260 return true; 3261 } 3262 3263 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3264 MachineRegisterInfo &MRI, 3265 MachineIRBuilder &B) const { 3266 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3267 return true; 3268 3269 Register Res = MI.getOperand(0).getReg(); 3270 Register LHS = MI.getOperand(1).getReg(); 3271 Register RHS = MI.getOperand(2).getReg(); 3272 3273 uint16_t Flags = MI.getFlags(); 3274 3275 LLT S64 = LLT::scalar(64); 3276 LLT S1 = LLT::scalar(1); 3277 3278 auto One = B.buildFConstant(S64, 1.0); 3279 3280 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3281 .addUse(LHS) 3282 .addUse(RHS) 3283 .addImm(0) 3284 .setMIFlags(Flags); 3285 3286 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3287 3288 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3289 .addUse(DivScale0.getReg(0)) 3290 .setMIFlags(Flags); 3291 3292 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3293 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3294 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3295 3296 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3297 .addUse(LHS) 3298 .addUse(RHS) 3299 .addImm(1) 3300 .setMIFlags(Flags); 3301 3302 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3303 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3304 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3305 3306 Register Scale; 3307 if (!ST.hasUsableDivScaleConditionOutput()) { 3308 // Workaround a hardware bug on SI where the condition output from div_scale 3309 // is not usable. 
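// Recompute that flag manually: compare the high (sign/exponent) halves of
// the original operands with those of the two div_scale results and xor
// the comparisons, which effectively reconstructs the scale condition that
// div_fmas expects.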
3310 3311 LLT S32 = LLT::scalar(32); 3312 3313 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3314 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3315 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3316 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3317 3318 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3319 Scale1Unmerge.getReg(1)); 3320 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3321 Scale0Unmerge.getReg(1)); 3322 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3323 } else { 3324 Scale = DivScale1.getReg(1); 3325 } 3326 3327 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3328 .addUse(Fma4.getReg(0)) 3329 .addUse(Fma3.getReg(0)) 3330 .addUse(Mul.getReg(0)) 3331 .addUse(Scale) 3332 .setMIFlags(Flags); 3333 3334 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3335 .addUse(Fmas.getReg(0)) 3336 .addUse(RHS) 3337 .addUse(LHS) 3338 .setMIFlags(Flags); 3339 3340 MI.eraseFromParent(); 3341 return true; 3342 } 3343 3344 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3345 MachineRegisterInfo &MRI, 3346 MachineIRBuilder &B) const { 3347 Register Res = MI.getOperand(0).getReg(); 3348 Register LHS = MI.getOperand(2).getReg(); 3349 Register RHS = MI.getOperand(3).getReg(); 3350 uint16_t Flags = MI.getFlags(); 3351 3352 LLT S32 = LLT::scalar(32); 3353 LLT S1 = LLT::scalar(1); 3354 3355 auto Abs = B.buildFAbs(S32, RHS, Flags); 3356 const APFloat C0Val(1.0f); 3357 3358 auto C0 = B.buildConstant(S32, 0x6f800000); 3359 auto C1 = B.buildConstant(S32, 0x2f800000); 3360 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3361 3362 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3363 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3364 3365 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3366 3367 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3368 .addUse(Mul0.getReg(0)) 3369 .setMIFlags(Flags); 3370 3371 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3372 3373 B.buildFMul(Res, Sel, Mul1, Flags); 3374 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3380 // FIXME: Why do we handle this one but not other removed instructions? 3381 // 3382 // Reciprocal square root. The clamp prevents infinite results, clamping 3383 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3384 // +-max_float. 3385 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3386 MachineRegisterInfo &MRI, 3387 MachineIRBuilder &B) const { 3388 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3389 return true; 3390 3391 Register Dst = MI.getOperand(0).getReg(); 3392 Register Src = MI.getOperand(2).getReg(); 3393 auto Flags = MI.getFlags(); 3394 3395 LLT Ty = MRI.getType(Dst); 3396 3397 const fltSemantics *FltSemantics; 3398 if (Ty == LLT::scalar(32)) 3399 FltSemantics = &APFloat::IEEEsingle(); 3400 else if (Ty == LLT::scalar(64)) 3401 FltSemantics = &APFloat::IEEEdouble(); 3402 else 3403 return false; 3404 3405 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3406 .addUse(Src) 3407 .setMIFlags(Flags); 3408 3409 // We don't need to concern ourselves with the snan handling difference, since 3410 // the rsq quieted (or not) so use the one which will directly select. 
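// Clamp the result to +/-max_float, using the IEEE minnum/maxnum variants
// when the function runs in IEEE mode so the clamp matches the FP mode of the
// selected instructions.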
3411 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3412 const bool UseIEEE = MFI->getMode().IEEE; 3413 3414 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3415 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3416 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3417 3418 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3419 3420 if (UseIEEE) 3421 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3422 else 3423 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3424 MI.eraseFromParent(); 3425 return true; 3426 } 3427 3428 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3429 switch (IID) { 3430 case Intrinsic::amdgcn_ds_fadd: 3431 return AMDGPU::G_ATOMICRMW_FADD; 3432 case Intrinsic::amdgcn_ds_fmin: 3433 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3434 case Intrinsic::amdgcn_ds_fmax: 3435 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3436 default: 3437 llvm_unreachable("not a DS FP intrinsic"); 3438 } 3439 } 3440 3441 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3442 MachineInstr &MI, 3443 Intrinsic::ID IID) const { 3444 GISelChangeObserver &Observer = Helper.Observer; 3445 Observer.changingInstr(MI); 3446 3447 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3448 3449 // The remaining operands were used to set fields in the MemOperand on 3450 // construction. 3451 for (int I = 6; I > 3; --I) 3452 MI.RemoveOperand(I); 3453 3454 MI.RemoveOperand(1); // Remove the intrinsic ID. 3455 Observer.changedInstr(MI); 3456 return true; 3457 } 3458 3459 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3460 MachineRegisterInfo &MRI, 3461 MachineIRBuilder &B) const { 3462 uint64_t Offset = 3463 ST.getTargetLowering()->getImplicitParameterOffset( 3464 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3465 LLT DstTy = MRI.getType(DstReg); 3466 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3467 3468 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3469 if (!loadInputValue(KernargPtrReg, B, 3470 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3471 return false; 3472 3473 // FIXME: This should be nuw 3474 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3475 return true; 3476 } 3477 3478 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3479 MachineRegisterInfo &MRI, 3480 MachineIRBuilder &B) const { 3481 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3482 if (!MFI->isEntryFunction()) { 3483 return legalizePreloadedArgIntrin(MI, MRI, B, 3484 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3485 } 3486 3487 Register DstReg = MI.getOperand(0).getReg(); 3488 if (!getImplicitArgPtr(DstReg, MRI, B)) 3489 return false; 3490 3491 MI.eraseFromParent(); 3492 return true; 3493 } 3494 3495 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3496 MachineRegisterInfo &MRI, 3497 MachineIRBuilder &B, 3498 unsigned AddrSpace) const { 3499 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3500 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3501 Register Hi32 = Unmerge.getReg(1); 3502 3503 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3504 MI.eraseFromParent(); 3505 return true; 3506 } 3507 3508 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3509 // offset (the offset that is included in bounds checking and swizzling, to be 3510 // split between the instruction's voffset and immoffset fields) 
and soffset 3511 // (the offset that is excluded from bounds checking and swizzling, to go in 3512 // the instruction's soffset field). This function takes the first kind of 3513 // offset and figures out how to split it between voffset and immoffset. 3514 std::tuple<Register, unsigned, unsigned> 3515 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3516 Register OrigOffset) const { 3517 const unsigned MaxImm = 4095; 3518 Register BaseReg; 3519 unsigned TotalConstOffset; 3520 const LLT S32 = LLT::scalar(32); 3521 MachineRegisterInfo &MRI = *B.getMRI(); 3522 3523 std::tie(BaseReg, TotalConstOffset) = 3524 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3525 3526 unsigned ImmOffset = TotalConstOffset; 3527 3528 // If BaseReg is a pointer, convert it to int. 3529 if (MRI.getType(BaseReg).isPointer()) 3530 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3531 3532 // If the immediate value is too big for the immoffset field, put the value 3533 // and -4096 into the immoffset field so that the value that is copied/added 3534 // for the voffset field is a multiple of 4096, and it stands more chance 3535 // of being CSEd with the copy/add for another similar load/store. 3536 // However, do not do that rounding down to a multiple of 4096 if that is a 3537 // negative number, as it appears to be illegal to have a negative offset 3538 // in the vgpr, even if adding the immediate offset makes it positive. 3539 unsigned Overflow = ImmOffset & ~MaxImm; 3540 ImmOffset -= Overflow; 3541 if ((int32_t)Overflow < 0) { 3542 Overflow += ImmOffset; 3543 ImmOffset = 0; 3544 } 3545 3546 if (Overflow != 0) { 3547 if (!BaseReg) { 3548 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3549 } else { 3550 auto OverflowVal = B.buildConstant(S32, Overflow); 3551 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3552 } 3553 } 3554 3555 if (!BaseReg) 3556 BaseReg = B.buildConstant(S32, 0).getReg(0); 3557 3558 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3559 } 3560 3561 /// Handle register layout difference for f16 images for some subtargets. 
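/// Returns the original register if no repacking is required, or a new
/// register holding the data in the layout the subtarget expects.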
3562 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3563 MachineRegisterInfo &MRI, 3564 Register Reg, 3565 bool ImageStore) const { 3566 const LLT S16 = LLT::scalar(16); 3567 const LLT S32 = LLT::scalar(32); 3568 LLT StoreVT = MRI.getType(Reg); 3569 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3570 3571 if (ST.hasUnpackedD16VMem()) { 3572 auto Unmerge = B.buildUnmerge(S16, Reg); 3573 3574 SmallVector<Register, 4> WideRegs; 3575 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3576 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3577 3578 int NumElts = StoreVT.getNumElements(); 3579 3580 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3581 } 3582 3583 if (ImageStore && ST.hasImageStoreD16Bug()) { 3584 if (StoreVT.getNumElements() == 2) { 3585 SmallVector<Register, 4> PackedRegs; 3586 Reg = B.buildBitcast(S32, Reg).getReg(0); 3587 PackedRegs.push_back(Reg); 3588 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3589 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3590 } 3591 3592 if (StoreVT.getNumElements() == 3) { 3593 SmallVector<Register, 4> PackedRegs; 3594 auto Unmerge = B.buildUnmerge(S16, Reg); 3595 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3596 PackedRegs.push_back(Unmerge.getReg(I)); 3597 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3598 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3599 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3600 } 3601 3602 if (StoreVT.getNumElements() == 4) { 3603 SmallVector<Register, 4> PackedRegs; 3604 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3605 auto Unmerge = B.buildUnmerge(S32, Reg); 3606 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3607 PackedRegs.push_back(Unmerge.getReg(I)); 3608 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3609 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3610 } 3611 3612 llvm_unreachable("invalid data type"); 3613 } 3614 3615 return Reg; 3616 } 3617 3618 Register AMDGPULegalizerInfo::fixStoreSourceType( 3619 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3620 MachineRegisterInfo *MRI = B.getMRI(); 3621 LLT Ty = MRI->getType(VData); 3622 3623 const LLT S16 = LLT::scalar(16); 3624 3625 // Fixup illegal register types for i8 stores. 3626 if (Ty == LLT::scalar(8) || Ty == S16) { 3627 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3628 return AnyExt; 3629 } 3630 3631 if (Ty.isVector()) { 3632 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3633 if (IsFormat) 3634 return handleD16VData(B, *MRI, VData); 3635 } 3636 } 3637 3638 return VData; 3639 } 3640 3641 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3642 MachineRegisterInfo &MRI, 3643 MachineIRBuilder &B, 3644 bool IsTyped, 3645 bool IsFormat) const { 3646 Register VData = MI.getOperand(1).getReg(); 3647 LLT Ty = MRI.getType(VData); 3648 LLT EltTy = Ty.getScalarType(); 3649 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3650 const LLT S32 = LLT::scalar(32); 3651 3652 VData = fixStoreSourceType(B, VData, IsFormat); 3653 Register RSrc = MI.getOperand(2).getReg(); 3654 3655 MachineMemOperand *MMO = *MI.memoperands_begin(); 3656 const int MemSize = MMO->getSize(); 3657 3658 unsigned ImmOffset; 3659 unsigned TotalOffset; 3660 3661 // The typed intrinsics add an immediate after the registers. 3662 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3663 3664 // The struct intrinsic variants add one additional operand over raw. 3665 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3666 Register VIndex; 3667 int OpOffset = 0; 3668 if (HasVIndex) { 3669 VIndex = MI.getOperand(3).getReg(); 3670 OpOffset = 1; 3671 } 3672 3673 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3674 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3675 3676 unsigned Format = 0; 3677 if (IsTyped) { 3678 Format = MI.getOperand(5 + OpOffset).getImm(); 3679 ++OpOffset; 3680 } 3681 3682 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3683 3684 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3685 if (TotalOffset != 0) 3686 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3687 3688 unsigned Opc; 3689 if (IsTyped) { 3690 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3691 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3692 } else if (IsFormat) { 3693 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3694 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3695 } else { 3696 switch (MemSize) { 3697 case 1: 3698 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3699 break; 3700 case 2: 3701 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3702 break; 3703 default: 3704 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3705 break; 3706 } 3707 } 3708 3709 if (!VIndex) 3710 VIndex = B.buildConstant(S32, 0).getReg(0); 3711 3712 auto MIB = B.buildInstr(Opc) 3713 .addUse(VData) // vdata 3714 .addUse(RSrc) // rsrc 3715 .addUse(VIndex) // vindex 3716 .addUse(VOffset) // voffset 3717 .addUse(SOffset) // soffset 3718 .addImm(ImmOffset); // offset(imm) 3719 3720 if (IsTyped) 3721 MIB.addImm(Format); 3722 3723 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3724 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3725 .addMemOperand(MMO); 3726 3727 MI.eraseFromParent(); 3728 return true; 3729 } 3730 3731 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3732 MachineRegisterInfo &MRI, 3733 MachineIRBuilder &B, 3734 bool IsFormat, 3735 bool IsTyped) const { 3736 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3737 MachineMemOperand *MMO = *MI.memoperands_begin(); 3738 const int MemSize = MMO->getSize(); 3739 const LLT S32 = LLT::scalar(32); 3740 3741 Register Dst = MI.getOperand(0).getReg(); 3742 Register RSrc = MI.getOperand(2).getReg(); 3743 3744 // The typed intrinsics add an immediate after the registers. 3745 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3746 3747 // The struct intrinsic variants add one additional operand over raw. 
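// When the vindex operand is present, every later operand index shifts by
// one; OpOffset tracks that adjustment.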
3748 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3749 Register VIndex; 3750 int OpOffset = 0; 3751 if (HasVIndex) { 3752 VIndex = MI.getOperand(3).getReg(); 3753 OpOffset = 1; 3754 } 3755 3756 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3757 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3758 3759 unsigned Format = 0; 3760 if (IsTyped) { 3761 Format = MI.getOperand(5 + OpOffset).getImm(); 3762 ++OpOffset; 3763 } 3764 3765 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3766 unsigned ImmOffset; 3767 unsigned TotalOffset; 3768 3769 LLT Ty = MRI.getType(Dst); 3770 LLT EltTy = Ty.getScalarType(); 3771 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3772 const bool Unpacked = ST.hasUnpackedD16VMem(); 3773 3774 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3775 if (TotalOffset != 0) 3776 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3777 3778 unsigned Opc; 3779 3780 if (IsTyped) { 3781 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3782 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3783 } else if (IsFormat) { 3784 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3785 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3786 } else { 3787 switch (MemSize) { 3788 case 1: 3789 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3790 break; 3791 case 2: 3792 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3793 break; 3794 default: 3795 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3796 break; 3797 } 3798 } 3799 3800 Register LoadDstReg; 3801 3802 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3803 LLT UnpackedTy = Ty.changeElementSize(32); 3804 3805 if (IsExtLoad) 3806 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3807 else if (Unpacked && IsD16 && Ty.isVector()) 3808 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3809 else 3810 LoadDstReg = Dst; 3811 3812 if (!VIndex) 3813 VIndex = B.buildConstant(S32, 0).getReg(0); 3814 3815 auto MIB = B.buildInstr(Opc) 3816 .addDef(LoadDstReg) // vdata 3817 .addUse(RSrc) // rsrc 3818 .addUse(VIndex) // vindex 3819 .addUse(VOffset) // voffset 3820 .addUse(SOffset) // soffset 3821 .addImm(ImmOffset); // offset(imm) 3822 3823 if (IsTyped) 3824 MIB.addImm(Format); 3825 3826 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3827 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3828 .addMemOperand(MMO); 3829 3830 if (LoadDstReg != Dst) { 3831 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3832 3833 // Widen result for extending loads was widened. 3834 if (IsExtLoad) 3835 B.buildTrunc(Dst, LoadDstReg); 3836 else { 3837 // Repack to original 16-bit vector result 3838 // FIXME: G_TRUNC should work, but legalization currently fails 3839 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3840 SmallVector<Register, 4> Repack; 3841 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3842 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3843 B.buildMerge(Dst, Repack); 3844 } 3845 } 3846 3847 MI.eraseFromParent(); 3848 return true; 3849 } 3850 3851 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3852 MachineIRBuilder &B, 3853 bool IsInc) const { 3854 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 3855 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3856 B.buildInstr(Opc) 3857 .addDef(MI.getOperand(0).getReg()) 3858 .addUse(MI.getOperand(2).getReg()) 3859 .addUse(MI.getOperand(3).getReg()) 3860 .cloneMemRefs(MI); 3861 MI.eraseFromParent(); 3862 return true; 3863 } 3864 3865 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3866 switch (IntrID) { 3867 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3868 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3869 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3870 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3871 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3872 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3873 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3874 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3876 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3877 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3878 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3879 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3880 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3881 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3882 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3883 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3884 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3885 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3886 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3888 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3889 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3891 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3892 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3893 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3894 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3895 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3896 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3897 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3898 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3899 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3900 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3901 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3903 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3904 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3906 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3907 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3908 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3909 default: 3910 llvm_unreachable("unhandled atomic opcode"); 3911 } 3912 } 3913 3914 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3915 MachineIRBuilder &B, 3916 Intrinsic::ID IID) const { 3917 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3918 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3919 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3920 3921 Register Dst; 3922 3923 int OpOffset = 0; 3924 if (HasReturn) { 3925 // A few FP atomics do not support return values. 3926 Dst = MI.getOperand(0).getReg(); 3927 } else { 3928 OpOffset = -1; 3929 } 3930 3931 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3932 Register CmpVal; 3933 3934 if (IsCmpSwap) { 3935 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3936 ++OpOffset; 3937 } 3938 3939 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3940 const unsigned NumVIndexOps = (IsCmpSwap ? 
8 : 7) + HasReturn; 3941 3942 // The struct intrinsic variants add one additional operand over raw. 3943 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3944 Register VIndex; 3945 if (HasVIndex) { 3946 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3947 ++OpOffset; 3948 } 3949 3950 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3951 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3952 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3953 3954 MachineMemOperand *MMO = *MI.memoperands_begin(); 3955 3956 unsigned ImmOffset; 3957 unsigned TotalOffset; 3958 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3959 if (TotalOffset != 0) 3960 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3961 3962 if (!VIndex) 3963 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3964 3965 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3966 3967 if (HasReturn) 3968 MIB.addDef(Dst); 3969 3970 MIB.addUse(VData); // vdata 3971 3972 if (IsCmpSwap) 3973 MIB.addReg(CmpVal); 3974 3975 MIB.addUse(RSrc) // rsrc 3976 .addUse(VIndex) // vindex 3977 .addUse(VOffset) // voffset 3978 .addUse(SOffset) // soffset 3979 .addImm(ImmOffset) // offset(imm) 3980 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3981 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3982 .addMemOperand(MMO); 3983 3984 MI.eraseFromParent(); 3985 return true; 3986 } 3987 3988 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3989 /// vector with s16 typed elements. 3990 static void packImageA16AddressToDwords( 3991 MachineIRBuilder &B, MachineInstr &MI, 3992 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 3993 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 3994 const LLT S16 = LLT::scalar(16); 3995 const LLT V2S16 = LLT::vector(2, 16); 3996 3997 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 3998 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 3999 if (!SrcOp.isReg()) 4000 continue; // _L to _LZ may have eliminated this. 4001 4002 Register AddrReg = SrcOp.getReg(); 4003 4004 if (I < Intr->GradientStart) { 4005 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4006 PackedAddrs.push_back(AddrReg); 4007 } else { 4008 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4009 // derivatives dx/dh and dx/dv are packed with undef. 4010 if (((I + 1) >= EndIdx) || 4011 ((Intr->NumGradients / 2) % 2 == 1 && 4012 (I == static_cast<unsigned>(Intr->GradientStart + 4013 (Intr->NumGradients / 2) - 1) || 4014 I == static_cast<unsigned>(Intr->GradientStart + 4015 Intr->NumGradients - 1))) || 4016 // Check for _L to _LZ optimization 4017 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4018 PackedAddrs.push_back( 4019 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4020 .getReg(0)); 4021 } else { 4022 PackedAddrs.push_back( 4023 B.buildBuildVector( 4024 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4025 .getReg(0)); 4026 ++I; 4027 } 4028 } 4029 } 4030 } 4031 4032 /// Convert from separate vaddr components to a single vector address register, 4033 /// and replace the remaining operands with $noreg. 
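/// Address counts of 5 to 7 are padded with undef elements up to 8, since no
/// intermediate sized register classes are available.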
4034 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4035 int DimIdx, int NumVAddrs) {
4036 const LLT S32 = LLT::scalar(32);
4037
4038 SmallVector<Register, 8> AddrRegs;
4039 for (int I = 0; I != NumVAddrs; ++I) {
4040 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4041 if (SrcOp.isReg()) {
4042 AddrRegs.push_back(SrcOp.getReg());
4043 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4044 }
4045 }
4046
4047 int NumAddrRegs = AddrRegs.size();
4048 if (NumAddrRegs != 1) {
4049 // Round up to 8 elements for v5-v7
4050 // FIXME: Missing intermediate sized register classes and instructions.
4051 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4052 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4053 auto Undef = B.buildUndef(S32);
4054 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4055 NumAddrRegs = RoundedNumRegs;
4056 }
4057
4058 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4059 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4060 }
4061
4062 for (int I = 1; I != NumVAddrs; ++I) {
4063 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4064 if (SrcOp.isReg())
4065 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4066 }
4067 }
4068
4069 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4070 ///
4071 /// Depending on the subtarget, load/store with 16-bit element data need to be
4072 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4073 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4074 /// registers.
4075 ///
4076 /// We don't want to directly select image instructions just yet, but we also
4077 /// want to expose all register repacking to the legalizer/combiners. We also
4078 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4079 /// defining a multitude of intermediate image instructions, directly hack on
4080 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4081 /// padding the now unnecessary arguments with $noreg.
4082 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4083 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4084 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4085
4086 const unsigned NumDefs = MI.getNumExplicitDefs();
4087 const unsigned ArgOffset = NumDefs + 1;
4088 bool IsTFE = NumDefs == 2;
4089 // We are only processing the operands of d16 image operations on subtargets
4090 // that use the unpacked register layout, or need to repack the TFE result.
4091
4092 // TODO: Do we need to guard against already legalized intrinsics?
4093 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4094 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4095
4096 MachineRegisterInfo *MRI = B.getMRI();
4097 const LLT S32 = LLT::scalar(32);
4098 const LLT S16 = LLT::scalar(16);
4099 const LLT V2S16 = LLT::vector(2, 16);
4100
4101 unsigned DMask = 0;
4102
4103 // Check for 16 bit addresses and pack if true.
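// The gradient and coordinate operands may use different widths, so check
// both to distinguish the A16 and G16 cases.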
4104 LLT GradTy = 4105 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4106 LLT AddrTy = 4107 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4108 const bool IsG16 = GradTy == S16; 4109 const bool IsA16 = AddrTy == S16; 4110 4111 int DMaskLanes = 0; 4112 if (!BaseOpcode->Atomic) { 4113 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4114 if (BaseOpcode->Gather4) { 4115 DMaskLanes = 4; 4116 } else if (DMask != 0) { 4117 DMaskLanes = countPopulation(DMask); 4118 } else if (!IsTFE && !BaseOpcode->Store) { 4119 // If dmask is 0, this is a no-op load. This can be eliminated. 4120 B.buildUndef(MI.getOperand(0)); 4121 MI.eraseFromParent(); 4122 return true; 4123 } 4124 } 4125 4126 Observer.changingInstr(MI); 4127 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4128 4129 unsigned NewOpcode = NumDefs == 0 ? 4130 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4131 4132 // Track that we legalized this 4133 MI.setDesc(B.getTII().get(NewOpcode)); 4134 4135 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4136 // dmask to be at least 1 otherwise the instruction will fail 4137 if (IsTFE && DMask == 0) { 4138 DMask = 0x1; 4139 DMaskLanes = 1; 4140 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4141 } 4142 4143 if (BaseOpcode->Atomic) { 4144 Register VData0 = MI.getOperand(2).getReg(); 4145 LLT Ty = MRI->getType(VData0); 4146 4147 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4148 if (Ty.isVector()) 4149 return false; 4150 4151 if (BaseOpcode->AtomicX2) { 4152 Register VData1 = MI.getOperand(3).getReg(); 4153 // The two values are packed in one register. 4154 LLT PackedTy = LLT::vector(2, Ty); 4155 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4156 MI.getOperand(2).setReg(Concat.getReg(0)); 4157 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4158 } 4159 } 4160 4161 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4162 4163 // Optimize _L to _LZ when _L is zero 4164 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4165 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4166 const ConstantFP *ConstantLod; 4167 4168 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4169 m_GFCst(ConstantLod))) { 4170 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4171 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4172 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4173 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4174 Intr->Dim); 4175 4176 // The starting indexes should remain in the same place. 4177 --CorrectedNumVAddrs; 4178 4179 MI.getOperand(MI.getNumExplicitDefs()) 4180 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4181 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4182 Intr = NewImageDimIntr; 4183 } 4184 } 4185 } 4186 4187 // Optimize _mip away, when 'lod' is zero 4188 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4189 int64_t ConstantLod; 4190 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4191 m_ICst(ConstantLod))) { 4192 if (ConstantLod == 0) { 4193 // TODO: Change intrinsic opcode and remove operand instead or replacing 4194 // it with 0, as the _L to _LZ handling is done above. 4195 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4196 --CorrectedNumVAddrs; 4197 } 4198 } 4199 } 4200 4201 // Rewrite the addressing register layout before doing anything else. 
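// 16-bit addresses and gradients are packed into dword-sized pieces here; the
// 32-bit path below only needs to choose between the NSA encoding and a single
// packed vector address.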
4202 if (IsA16 || IsG16) { 4203 if (IsA16) { 4204 // Target must support the feature and gradients need to be 16 bit too 4205 if (!ST.hasA16() || !IsG16) 4206 return false; 4207 } else if (!ST.hasG16()) 4208 return false; 4209 4210 if (Intr->NumVAddrs > 1) { 4211 SmallVector<Register, 4> PackedRegs; 4212 // Don't compress addresses for G16 4213 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4214 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4215 PackEndIdx); 4216 4217 if (!IsA16) { 4218 // Add uncompressed address 4219 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4220 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4221 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4222 PackedRegs.push_back(AddrReg); 4223 } 4224 } 4225 4226 // See also below in the non-a16 branch 4227 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4228 4229 if (!UseNSA && PackedRegs.size() > 1) { 4230 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4231 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4232 PackedRegs[0] = Concat.getReg(0); 4233 PackedRegs.resize(1); 4234 } 4235 4236 const unsigned NumPacked = PackedRegs.size(); 4237 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4238 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4239 if (!SrcOp.isReg()) { 4240 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4241 continue; 4242 } 4243 4244 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4245 4246 if (I - Intr->VAddrStart < NumPacked) 4247 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4248 else 4249 SrcOp.setReg(AMDGPU::NoRegister); 4250 } 4251 } 4252 } else { 4253 // If the register allocator cannot place the address registers contiguously 4254 // without introducing moves, then using the non-sequential address encoding 4255 // is always preferable, since it saves VALU instructions and is usually a 4256 // wash in terms of code size or even better. 4257 // 4258 // However, we currently have no way of hinting to the register allocator 4259 // that MIMG addresses should be placed contiguously when it is possible to 4260 // do so, so force non-NSA for the common 2-address case as a heuristic. 4261 // 4262 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4263 // allocation when possible. 4264 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4265 4266 if (!UseNSA && Intr->NumVAddrs > 1) 4267 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4268 Intr->NumVAddrs); 4269 } 4270 4271 int Flags = 0; 4272 if (IsA16) 4273 Flags |= 1; 4274 if (IsG16) 4275 Flags |= 2; 4276 MI.addOperand(MachineOperand::CreateImm(Flags)); 4277 4278 if (BaseOpcode->Store) { // No TFE for stores? 4279 // TODO: Handle dmask trim 4280 Register VData = MI.getOperand(1).getReg(); 4281 LLT Ty = MRI->getType(VData); 4282 if (!Ty.isVector() || Ty.getElementType() != S16) 4283 return true; 4284 4285 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4286 if (RepackedReg != VData) { 4287 MI.getOperand(1).setReg(RepackedReg); 4288 } 4289 4290 return true; 4291 } 4292 4293 Register DstReg = MI.getOperand(0).getReg(); 4294 LLT Ty = MRI->getType(DstReg); 4295 const LLT EltTy = Ty.getScalarType(); 4296 const bool IsD16 = Ty.getScalarType() == S16; 4297 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1; 4298 4299 // Confirm that the return type is large enough for the dmask specified 4300 if (NumElts < DMaskLanes) 4301 return false; 4302 4303 if (NumElts > 4 || DMaskLanes > 4) 4304 return false; 4305 4306 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4307 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4308 4309 // The raw dword aligned data component of the load. The only legal cases 4310 // where this matters should be when using the packed D16 format, for 4311 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4312 LLT RoundedTy; 4313 4314 // S32 vector to to cover all data, plus TFE result element. 4315 LLT TFETy; 4316 4317 // Register type to use for each loaded component. Will be S32 or V2S16. 4318 LLT RegTy; 4319 4320 if (IsD16 && ST.hasUnpackedD16VMem()) { 4321 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4322 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4323 RegTy = S32; 4324 } else { 4325 unsigned EltSize = EltTy.getSizeInBits(); 4326 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4327 unsigned RoundedSize = 32 * RoundedElts; 4328 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4329 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4330 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4331 } 4332 4333 // The return type does not need adjustment. 4334 // TODO: Should we change s16 case to s32 or <2 x s16>? 4335 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4336 return true; 4337 4338 Register Dst1Reg; 4339 4340 // Insert after the instruction. 4341 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4342 4343 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4344 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4345 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4346 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4347 4348 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4349 4350 MI.getOperand(0).setReg(NewResultReg); 4351 4352 // In the IR, TFE is supposed to be used with a 2 element struct return 4353 // type. The intruction really returns these two values in one contiguous 4354 // register, with one additional dword beyond the loaded data. Rewrite the 4355 // return type to use a single register result. 4356 4357 if (IsTFE) { 4358 Dst1Reg = MI.getOperand(1).getReg(); 4359 if (MRI->getType(Dst1Reg) != S32) 4360 return false; 4361 4362 // TODO: Make sure the TFE operand bit is set. 4363 MI.RemoveOperand(1); 4364 4365 // Handle the easy case that requires no repack instructions. 4366 if (Ty == S32) { 4367 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4368 return true; 4369 } 4370 } 4371 4372 // Now figure out how to copy the new result register back into the old 4373 // result. 4374 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4375 4376 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4377 4378 if (ResultNumRegs == 1) { 4379 assert(!IsTFE); 4380 ResultRegs[0] = NewResultReg; 4381 } else { 4382 // We have to repack into a new vector of some kind. 4383 for (int I = 0; I != NumDataRegs; ++I) 4384 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4385 B.buildUnmerge(ResultRegs, NewResultReg); 4386 4387 // Drop the final TFE element to get the data part. The TFE result is 4388 // directly written to the right place already. 
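// The remaining data registers are repacked below (bitcast or truncated for
// D16) to reform the original result type.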
4389 if (IsTFE) 4390 ResultRegs.resize(NumDataRegs); 4391 } 4392 4393 // For an s16 scalar result, we form an s32 result with a truncate regardless 4394 // of packed vs. unpacked. 4395 if (IsD16 && !Ty.isVector()) { 4396 B.buildTrunc(DstReg, ResultRegs[0]); 4397 return true; 4398 } 4399 4400 // Avoid a build/concat_vector of 1 entry. 4401 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4402 B.buildBitcast(DstReg, ResultRegs[0]); 4403 return true; 4404 } 4405 4406 assert(Ty.isVector()); 4407 4408 if (IsD16) { 4409 // For packed D16 results with TFE enabled, all the data components are 4410 // S32. Cast back to the expected type. 4411 // 4412 // TODO: We don't really need to use load s32 elements. We would only need one 4413 // cast for the TFE result if a multiple of v2s16 was used. 4414 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4415 for (Register &Reg : ResultRegs) 4416 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4417 } else if (ST.hasUnpackedD16VMem()) { 4418 for (Register &Reg : ResultRegs) 4419 Reg = B.buildTrunc(S16, Reg).getReg(0); 4420 } 4421 } 4422 4423 auto padWithUndef = [&](LLT Ty, int NumElts) { 4424 if (NumElts == 0) 4425 return; 4426 Register Undef = B.buildUndef(Ty).getReg(0); 4427 for (int I = 0; I != NumElts; ++I) 4428 ResultRegs.push_back(Undef); 4429 }; 4430 4431 // Pad out any elements eliminated due to the dmask. 4432 LLT ResTy = MRI->getType(ResultRegs[0]); 4433 if (!ResTy.isVector()) { 4434 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4435 B.buildBuildVector(DstReg, ResultRegs); 4436 return true; 4437 } 4438 4439 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4440 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4441 4442 // Deal with the one annoying legal case. 4443 const LLT V3S16 = LLT::vector(3, 16); 4444 if (Ty == V3S16) { 4445 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4446 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4447 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4448 return true; 4449 } 4450 4451 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4452 B.buildConcatVectors(DstReg, ResultRegs); 4453 return true; 4454 } 4455 4456 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4457 LegalizerHelper &Helper, MachineInstr &MI) const { 4458 MachineIRBuilder &B = Helper.MIRBuilder; 4459 GISelChangeObserver &Observer = Helper.Observer; 4460 4461 Register Dst = MI.getOperand(0).getReg(); 4462 LLT Ty = B.getMRI()->getType(Dst); 4463 unsigned Size = Ty.getSizeInBits(); 4464 MachineFunction &MF = B.getMF(); 4465 4466 Observer.changingInstr(MI); 4467 4468 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4469 Ty = getBitcastRegisterType(Ty); 4470 Helper.bitcastDst(MI, Ty, 0); 4471 Dst = MI.getOperand(0).getReg(); 4472 B.setInsertPt(B.getMBB(), MI); 4473 } 4474 4475 // FIXME: We don't really need this intermediate instruction. The intrinsic 4476 // should be fixed to have a memory operand. Since it's readnone, we're not 4477 // allowed to add one. 4478 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4479 MI.RemoveOperand(1); // Remove intrinsic ID 4480 4481 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4482 // TODO: Should this use datalayout alignment? 
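// Manually attach a load memory operand: invariant, dereferenceable, sized to
// the result, with a conservative 4 byte alignment.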
4483 const unsigned MemSize = (Size + 7) / 8; 4484 const Align MemAlign(4); 4485 MachineMemOperand *MMO = MF.getMachineMemOperand( 4486 MachinePointerInfo(), 4487 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4488 MachineMemOperand::MOInvariant, 4489 MemSize, MemAlign); 4490 MI.addMemOperand(MF, MMO); 4491 4492 // There are no 96-bit result scalar loads, but widening to 128-bit should 4493 // always be legal. We may need to restore this to a 96-bit result if it turns 4494 // out this needs to be converted to a vector load during RegBankSelect. 4495 if (!isPowerOf2_32(Size)) { 4496 if (Ty.isVector()) 4497 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4498 else 4499 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4500 } 4501 4502 Observer.changedInstr(MI); 4503 return true; 4504 } 4505 4506 // TODO: Move to selection 4507 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4508 MachineRegisterInfo &MRI, 4509 MachineIRBuilder &B) const { 4510 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4511 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4512 !ST.isTrapHandlerEnabled()) { 4513 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4514 } else { 4515 // Pass queue pointer to trap handler as input, and insert trap instruction 4516 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4517 MachineRegisterInfo &MRI = *B.getMRI(); 4518 4519 Register LiveIn = 4520 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4521 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4522 return false; 4523 4524 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4525 B.buildCopy(SGPR01, LiveIn); 4526 B.buildInstr(AMDGPU::S_TRAP) 4527 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4528 .addReg(SGPR01, RegState::Implicit); 4529 } 4530 4531 MI.eraseFromParent(); 4532 return true; 4533 } 4534 4535 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4536 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4537 // Is non-HSA path or trap-handler disabled? then, report a warning 4538 // accordingly 4539 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4540 !ST.isTrapHandlerEnabled()) { 4541 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4542 "debugtrap handler not supported", 4543 MI.getDebugLoc(), DS_Warning); 4544 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4545 Ctx.diagnose(NoTrap); 4546 } else { 4547 // Insert debug-trap instruction 4548 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4549 } 4550 4551 MI.eraseFromParent(); 4552 return true; 4553 } 4554 4555 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4556 MachineIRBuilder &B) const { 4557 MachineRegisterInfo &MRI = *B.getMRI(); 4558 const LLT S16 = LLT::scalar(16); 4559 const LLT S32 = LLT::scalar(32); 4560 4561 Register DstReg = MI.getOperand(0).getReg(); 4562 Register NodePtr = MI.getOperand(2).getReg(); 4563 Register RayExtent = MI.getOperand(3).getReg(); 4564 Register RayOrigin = MI.getOperand(4).getReg(); 4565 Register RayDir = MI.getOperand(5).getReg(); 4566 Register RayInvDir = MI.getOperand(6).getReg(); 4567 Register TDescr = MI.getOperand(7).getReg(); 4568 4569 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4570 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4571 unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4572 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4573 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4574 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4575 4576 SmallVector<Register, 12> Ops; 4577 if (Is64) { 4578 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4579 Ops.push_back(Unmerge.getReg(0)); 4580 Ops.push_back(Unmerge.getReg(1)); 4581 } else { 4582 Ops.push_back(NodePtr); 4583 } 4584 Ops.push_back(RayExtent); 4585 4586 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4587 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4588 Ops.push_back(Unmerge.getReg(0)); 4589 Ops.push_back(Unmerge.getReg(1)); 4590 Ops.push_back(Unmerge.getReg(2)); 4591 }; 4592 4593 packLanes(RayOrigin); 4594 if (IsA16) { 4595 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4596 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4597 Register R1 = MRI.createGenericVirtualRegister(S32); 4598 Register R2 = MRI.createGenericVirtualRegister(S32); 4599 Register R3 = MRI.createGenericVirtualRegister(S32); 4600 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4601 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4602 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4603 Ops.push_back(R1); 4604 Ops.push_back(R2); 4605 Ops.push_back(R3); 4606 } else { 4607 packLanes(RayDir); 4608 packLanes(RayInvDir); 4609 } 4610 4611 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4612 .addDef(DstReg) 4613 .addImm(Opcode); 4614 4615 for (Register R : Ops) { 4616 MIB.addUse(R); 4617 } 4618 4619 MIB.addUse(TDescr) 4620 .addImm(IsA16 ? 1 : 0) 4621 .cloneMemRefs(MI); 4622 4623 MI.eraseFromParent(); 4624 return true; 4625 } 4626 4627 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4628 MachineInstr &MI) const { 4629 MachineIRBuilder &B = Helper.MIRBuilder; 4630 MachineRegisterInfo &MRI = *B.getMRI(); 4631 4632 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4633 auto IntrID = MI.getIntrinsicID(); 4634 switch (IntrID) { 4635 case Intrinsic::amdgcn_if: 4636 case Intrinsic::amdgcn_else: { 4637 MachineInstr *Br = nullptr; 4638 MachineBasicBlock *UncondBrTarget = nullptr; 4639 bool Negated = false; 4640 if (MachineInstr *BrCond = 4641 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4642 const SIRegisterInfo *TRI 4643 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4644 4645 Register Def = MI.getOperand(1).getReg(); 4646 Register Use = MI.getOperand(3).getReg(); 4647 4648 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4649 4650 if (Negated) 4651 std::swap(CondBrTarget, UncondBrTarget); 4652 4653 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4654 if (IntrID == Intrinsic::amdgcn_if) { 4655 B.buildInstr(AMDGPU::SI_IF) 4656 .addDef(Def) 4657 .addUse(Use) 4658 .addMBB(UncondBrTarget); 4659 } else { 4660 B.buildInstr(AMDGPU::SI_ELSE) 4661 .addDef(Def) 4662 .addUse(Use) 4663 .addMBB(UncondBrTarget); 4664 } 4665 4666 if (Br) { 4667 Br->getOperand(0).setMBB(CondBrTarget); 4668 } else { 4669 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4670 // since we're swapping branch targets it needs to be reinserted. 
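// Without the explicit G_BR the block would now fall through to the wrong
// successor.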
4671 // FIXME: IRTranslator should probably not do this 4672 B.buildBr(*CondBrTarget); 4673 } 4674 4675 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4676 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4677 MI.eraseFromParent(); 4678 BrCond->eraseFromParent(); 4679 return true; 4680 } 4681 4682 return false; 4683 } 4684 case Intrinsic::amdgcn_loop: { 4685 MachineInstr *Br = nullptr; 4686 MachineBasicBlock *UncondBrTarget = nullptr; 4687 bool Negated = false; 4688 if (MachineInstr *BrCond = 4689 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4690 const SIRegisterInfo *TRI 4691 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4692 4693 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4694 Register Reg = MI.getOperand(2).getReg(); 4695 4696 if (Negated) 4697 std::swap(CondBrTarget, UncondBrTarget); 4698 4699 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4700 B.buildInstr(AMDGPU::SI_LOOP) 4701 .addUse(Reg) 4702 .addMBB(UncondBrTarget); 4703 4704 if (Br) 4705 Br->getOperand(0).setMBB(CondBrTarget); 4706 else 4707 B.buildBr(*CondBrTarget); 4708 4709 MI.eraseFromParent(); 4710 BrCond->eraseFromParent(); 4711 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4712 return true; 4713 } 4714 4715 return false; 4716 } 4717 case Intrinsic::amdgcn_kernarg_segment_ptr: 4718 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4719 // This only makes sense to call in a kernel, so just lower to null. 4720 B.buildConstant(MI.getOperand(0).getReg(), 0); 4721 MI.eraseFromParent(); 4722 return true; 4723 } 4724 4725 return legalizePreloadedArgIntrin( 4726 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4727 case Intrinsic::amdgcn_implicitarg_ptr: 4728 return legalizeImplicitArgPtr(MI, MRI, B); 4729 case Intrinsic::amdgcn_workitem_id_x: 4730 return legalizePreloadedArgIntrin(MI, MRI, B, 4731 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4732 case Intrinsic::amdgcn_workitem_id_y: 4733 return legalizePreloadedArgIntrin(MI, MRI, B, 4734 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4735 case Intrinsic::amdgcn_workitem_id_z: 4736 return legalizePreloadedArgIntrin(MI, MRI, B, 4737 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4738 case Intrinsic::amdgcn_workgroup_id_x: 4739 return legalizePreloadedArgIntrin(MI, MRI, B, 4740 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4741 case Intrinsic::amdgcn_workgroup_id_y: 4742 return legalizePreloadedArgIntrin(MI, MRI, B, 4743 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4744 case Intrinsic::amdgcn_workgroup_id_z: 4745 return legalizePreloadedArgIntrin(MI, MRI, B, 4746 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4747 case Intrinsic::amdgcn_dispatch_ptr: 4748 return legalizePreloadedArgIntrin(MI, MRI, B, 4749 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4750 case Intrinsic::amdgcn_queue_ptr: 4751 return legalizePreloadedArgIntrin(MI, MRI, B, 4752 AMDGPUFunctionArgInfo::QUEUE_PTR); 4753 case Intrinsic::amdgcn_implicit_buffer_ptr: 4754 return legalizePreloadedArgIntrin( 4755 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4756 case Intrinsic::amdgcn_dispatch_id: 4757 return legalizePreloadedArgIntrin(MI, MRI, B, 4758 AMDGPUFunctionArgInfo::DISPATCH_ID); 4759 case Intrinsic::amdgcn_fdiv_fast: 4760 return legalizeFDIVFastIntrin(MI, MRI, B); 4761 case Intrinsic::amdgcn_is_shared: 4762 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4763 case Intrinsic::amdgcn_is_private: 4764 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4765 case Intrinsic::amdgcn_wavefrontsize: { 4766 
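// The wavefront size is a subtarget constant, so fold it directly.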
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4767 MI.eraseFromParent(); 4768 return true; 4769 } 4770 case Intrinsic::amdgcn_s_buffer_load: 4771 return legalizeSBufferLoad(Helper, MI); 4772 case Intrinsic::amdgcn_raw_buffer_store: 4773 case Intrinsic::amdgcn_struct_buffer_store: 4774 return legalizeBufferStore(MI, MRI, B, false, false); 4775 case Intrinsic::amdgcn_raw_buffer_store_format: 4776 case Intrinsic::amdgcn_struct_buffer_store_format: 4777 return legalizeBufferStore(MI, MRI, B, false, true); 4778 case Intrinsic::amdgcn_raw_tbuffer_store: 4779 case Intrinsic::amdgcn_struct_tbuffer_store: 4780 return legalizeBufferStore(MI, MRI, B, true, true); 4781 case Intrinsic::amdgcn_raw_buffer_load: 4782 case Intrinsic::amdgcn_struct_buffer_load: 4783 return legalizeBufferLoad(MI, MRI, B, false, false); 4784 case Intrinsic::amdgcn_raw_buffer_load_format: 4785 case Intrinsic::amdgcn_struct_buffer_load_format: 4786 return legalizeBufferLoad(MI, MRI, B, true, false); 4787 case Intrinsic::amdgcn_raw_tbuffer_load: 4788 case Intrinsic::amdgcn_struct_tbuffer_load: 4789 return legalizeBufferLoad(MI, MRI, B, true, true); 4790 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4791 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4792 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4793 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4794 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4795 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4796 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4797 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4798 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4799 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4800 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4801 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4802 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4803 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4804 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4805 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4806 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4807 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4808 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4809 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4810 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4811 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4812 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4813 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4814 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4815 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4816 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4817 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4818 return legalizeBufferAtomic(MI, B, IntrID); 4819 case Intrinsic::amdgcn_atomic_inc: 4820 return legalizeAtomicIncDec(MI, B, true); 4821 case Intrinsic::amdgcn_atomic_dec: 4822 return legalizeAtomicIncDec(MI, B, false); 4823 case Intrinsic::trap: 4824 return legalizeTrapIntrinsic(MI, MRI, B); 4825 case Intrinsic::debugtrap: 4826 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4827 case Intrinsic::amdgcn_rsq_clamp: 4828 return legalizeRsqClampIntrinsic(MI, MRI, B); 4829 case Intrinsic::amdgcn_ds_fadd: 4830 case Intrinsic::amdgcn_ds_fmin: 4831 case Intrinsic::amdgcn_ds_fmax: 4832 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4833 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4834 return legalizeBVHIntrinsic(MI, B); 4835 default: { 4836 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4837 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4838 return legalizeImageIntrinsic(MI, B, Helper.Observer, 
ImageDimIntr); 4839 return true; 4840 } 4841 } 4842 4843 return true; 4844 } 4845