//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
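/// Typically paired with the oneMoreElement() mutation defined below.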
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
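/// For example, on subtargets without 96-bit memory operations, a 96-bit load
/// with 128-bit alignment can be widened to a 128-bit load.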
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                 .legalFor({S32})
                 .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
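  // legalizeFMad() keeps G_FMAD only when the corresponding denormals are
  // flushed for the function, and otherwise falls back to the generic
  // lowering.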
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
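      // shouldWidenLoad() requires the alignment to cover the rounded-up
      // size, so the extra bytes read are known to be dereferenceable.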
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .lowerIfMemSizeNotPow2()
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
      .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                {S32, GlobalPtr, 16, 2 * 8},
                                                {S32, LocalPtr, 8, 8},
                                                {S32, LocalPtr, 16, 16},
                                                {S32, PrivatePtr, 8, 8},
                                                {S32, PrivatePtr, 16, 16},
                                                {S32, ConstantPtr, 8, 8},
                                                {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
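  // (4/5/6 bits correspond to the 16/32/64-bit shifted types.)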
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
      1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
legalizeUDIV_UREM(MI, MRI, B); 1702 case TargetOpcode::G_SDIV: 1703 case TargetOpcode::G_SREM: 1704 return legalizeSDIV_SREM(MI, MRI, B); 1705 case TargetOpcode::G_ATOMIC_CMPXCHG: 1706 return legalizeAtomicCmpXChg(MI, MRI, B); 1707 case TargetOpcode::G_FLOG: 1708 return legalizeFlog(MI, B, numbers::ln2f); 1709 case TargetOpcode::G_FLOG10: 1710 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1711 case TargetOpcode::G_FEXP: 1712 return legalizeFExp(MI, B); 1713 case TargetOpcode::G_FPOW: 1714 return legalizeFPow(MI, B); 1715 case TargetOpcode::G_FFLOOR: 1716 return legalizeFFloor(MI, MRI, B); 1717 case TargetOpcode::G_BUILD_VECTOR: 1718 return legalizeBuildVector(MI, MRI, B); 1719 default: 1720 return false; 1721 } 1722 1723 llvm_unreachable("expected switch to return"); 1724 } 1725 1726 Register AMDGPULegalizerInfo::getSegmentAperture( 1727 unsigned AS, 1728 MachineRegisterInfo &MRI, 1729 MachineIRBuilder &B) const { 1730 MachineFunction &MF = B.getMF(); 1731 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1732 const LLT S32 = LLT::scalar(32); 1733 1734 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1735 1736 if (ST.hasApertureRegs()) { 1737 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1738 // getreg. 1739 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1740 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1741 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1742 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1743 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1744 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1745 unsigned Encoding = 1746 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1747 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1748 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1749 1750 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1751 1752 B.buildInstr(AMDGPU::S_GETREG_B32) 1753 .addDef(GetReg) 1754 .addImm(Encoding); 1755 MRI.setType(GetReg, S32); 1756 1757 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1758 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1759 } 1760 1761 Register QueuePtr = MRI.createGenericVirtualRegister( 1762 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1763 1764 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1765 return Register(); 1766 1767 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1768 // private_segment_aperture_base_hi. 1769 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1770 1771 // TODO: can we be smarter about machine pointer info? 
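  // Illustrative sketch of what this queue-pointer path builds (register
  // names are placeholders, not generated output):
  //   %queue.ptr:_(p4) = <QUEUE_PTR kernel argument>
  //   %addr:_(p4)      = G_PTR_ADD %queue.ptr, 0x40   ; 0x44 for private
  //   %aperture:_(s32) = G_LOAD %addr :: (dereferenceable invariant load 4)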
1772 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1773 MachineMemOperand *MMO = MF.getMachineMemOperand( 1774 PtrInfo, 1775 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1776 MachineMemOperand::MOInvariant, 1777 4, commonAlignment(Align(64), StructOffset)); 1778 1779 Register LoadAddr; 1780 1781 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1782 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1783 } 1784 1785 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1786 MachineInstr &MI, MachineRegisterInfo &MRI, 1787 MachineIRBuilder &B) const { 1788 MachineFunction &MF = B.getMF(); 1789 1790 const LLT S32 = LLT::scalar(32); 1791 Register Dst = MI.getOperand(0).getReg(); 1792 Register Src = MI.getOperand(1).getReg(); 1793 1794 LLT DstTy = MRI.getType(Dst); 1795 LLT SrcTy = MRI.getType(Src); 1796 unsigned DestAS = DstTy.getAddressSpace(); 1797 unsigned SrcAS = SrcTy.getAddressSpace(); 1798 1799 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1800 // vector element. 1801 assert(!DstTy.isVector()); 1802 1803 const AMDGPUTargetMachine &TM 1804 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1805 1806 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1807 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1808 return true; 1809 } 1810 1811 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1812 // Truncate. 1813 B.buildExtract(Dst, Src, 0); 1814 MI.eraseFromParent(); 1815 return true; 1816 } 1817 1818 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1819 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1820 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1821 1822 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1823 // another. Merge operands are required to be the same type, but creating an 1824 // extra ptrtoint would be kind of pointless. 1825 auto HighAddr = B.buildConstant( 1826 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1827 B.buildMerge(Dst, {Src, HighAddr}); 1828 MI.eraseFromParent(); 1829 return true; 1830 } 1831 1832 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1833 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1834 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1835 unsigned NullVal = TM.getNullPointerValue(DestAS); 1836 1837 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1838 auto FlatNull = B.buildConstant(SrcTy, 0); 1839 1840 // Extract low 32-bits of the pointer. 1841 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1842 1843 auto CmpRes = 1844 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1845 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1846 1847 MI.eraseFromParent(); 1848 return true; 1849 } 1850 1851 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1852 return false; 1853 1854 if (!ST.hasFlatAddressSpace()) 1855 return false; 1856 1857 auto SegmentNull = 1858 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1859 auto FlatNull = 1860 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1861 1862 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1863 if (!ApertureReg.isValid()) 1864 return false; 1865 1866 auto CmpRes = 1867 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1868 1869 // Coerce the type of the low half of the result so we can use merge_values. 
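  // For example, a p3 -> p0 cast is built roughly as (placeholder names):
  //   %lo:_(s32)  = G_PTRTOINT %src:_(p3)
  //   %hi:_(s32)  = <aperture base from getSegmentAperture>
  //   %flat:_(p0) = G_MERGE_VALUES %lo, %hi
  //   %dst:_(p0)  = G_SELECT (%src != segment null), %flat, flat null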
1870 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1871 1872 // TODO: Should we allow mismatched types but matching sizes in merges to 1873 // avoid the ptrtoint? 1874 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1875 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1876 1877 MI.eraseFromParent(); 1878 return true; 1879 } 1880 1881 bool AMDGPULegalizerInfo::legalizeFrint( 1882 MachineInstr &MI, MachineRegisterInfo &MRI, 1883 MachineIRBuilder &B) const { 1884 Register Src = MI.getOperand(1).getReg(); 1885 LLT Ty = MRI.getType(Src); 1886 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1887 1888 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1889 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1890 1891 auto C1 = B.buildFConstant(Ty, C1Val); 1892 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1893 1894 // TODO: Should this propagate fast-math-flags? 1895 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1896 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1897 1898 auto C2 = B.buildFConstant(Ty, C2Val); 1899 auto Fabs = B.buildFAbs(Ty, Src); 1900 1901 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1902 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 bool AMDGPULegalizerInfo::legalizeFceil( 1908 MachineInstr &MI, MachineRegisterInfo &MRI, 1909 MachineIRBuilder &B) const { 1910 1911 const LLT S1 = LLT::scalar(1); 1912 const LLT S64 = LLT::scalar(64); 1913 1914 Register Src = MI.getOperand(1).getReg(); 1915 assert(MRI.getType(Src) == S64); 1916 1917 // result = trunc(src) 1918 // if (src > 0.0 && src != result) 1919 // result += 1.0 1920 1921 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1922 1923 const auto Zero = B.buildFConstant(S64, 0.0); 1924 const auto One = B.buildFConstant(S64, 1.0); 1925 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1926 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1927 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1928 auto Add = B.buildSelect(S64, And, One, Zero); 1929 1930 // TODO: Should this propagate fast-math-flags? 
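  // Worked example (for illustration only): ceil(2.3) gives Trunc = 2.0 and
  // Add = 1.0, so the fadd below produces 3.0; ceil(-1.5) gives Trunc = -1.0
  // and Add = 0.0, producing -1.0.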
1931 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1932 return true; 1933 } 1934 1935 bool AMDGPULegalizerInfo::legalizeFrem( 1936 MachineInstr &MI, MachineRegisterInfo &MRI, 1937 MachineIRBuilder &B) const { 1938 Register DstReg = MI.getOperand(0).getReg(); 1939 Register Src0Reg = MI.getOperand(1).getReg(); 1940 Register Src1Reg = MI.getOperand(2).getReg(); 1941 auto Flags = MI.getFlags(); 1942 LLT Ty = MRI.getType(DstReg); 1943 1944 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1945 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1946 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1947 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1948 MI.eraseFromParent(); 1949 return true; 1950 } 1951 1952 static MachineInstrBuilder extractF64Exponent(Register Hi, 1953 MachineIRBuilder &B) { 1954 const unsigned FractBits = 52; 1955 const unsigned ExpBits = 11; 1956 LLT S32 = LLT::scalar(32); 1957 1958 auto Const0 = B.buildConstant(S32, FractBits - 32); 1959 auto Const1 = B.buildConstant(S32, ExpBits); 1960 1961 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1962 .addUse(Hi) 1963 .addUse(Const0.getReg(0)) 1964 .addUse(Const1.getReg(0)); 1965 1966 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1967 } 1968 1969 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1970 MachineInstr &MI, MachineRegisterInfo &MRI, 1971 MachineIRBuilder &B) const { 1972 const LLT S1 = LLT::scalar(1); 1973 const LLT S32 = LLT::scalar(32); 1974 const LLT S64 = LLT::scalar(64); 1975 1976 Register Src = MI.getOperand(1).getReg(); 1977 assert(MRI.getType(Src) == S64); 1978 1979 // TODO: Should this use extract since the low half is unused? 1980 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1981 Register Hi = Unmerge.getReg(1); 1982 1983 // Extract the upper half, since this is where we will find the sign and 1984 // exponent. 1985 auto Exp = extractF64Exponent(Hi, B); 1986 1987 const unsigned FractBits = 52; 1988 1989 // Extract the sign bit. 1990 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1991 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1992 1993 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1994 1995 const auto Zero32 = B.buildConstant(S32, 0); 1996 1997 // Extend back to 64-bits. 1998 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1999 2000 auto Shr = B.buildAShr(S64, FractMask, Exp); 2001 auto Not = B.buildNot(S64, Shr); 2002 auto Tmp0 = B.buildAnd(S64, Src, Not); 2003 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2004 2005 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2006 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2007 2008 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2009 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 bool AMDGPULegalizerInfo::legalizeITOFP( 2015 MachineInstr &MI, MachineRegisterInfo &MRI, 2016 MachineIRBuilder &B, bool Signed) const { 2017 2018 Register Dst = MI.getOperand(0).getReg(); 2019 Register Src = MI.getOperand(1).getReg(); 2020 2021 const LLT S64 = LLT::scalar(64); 2022 const LLT S32 = LLT::scalar(32); 2023 2024 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2025 2026 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2027 2028 auto CvtHi = Signed ?
2029 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2030 B.buildUITOFP(S64, Unmerge.getReg(1)); 2031 2032 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2033 2034 auto ThirtyTwo = B.buildConstant(S32, 32); 2035 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2036 .addUse(CvtHi.getReg(0)) 2037 .addUse(ThirtyTwo.getReg(0)); 2038 2039 // TODO: Should this propagate fast-math-flags? 2040 B.buildFAdd(Dst, LdExp, CvtLo); 2041 MI.eraseFromParent(); 2042 return true; 2043 } 2044 2045 // TODO: Copied from DAG implementation. Verify logic and document how this 2046 // actually works. 2047 bool AMDGPULegalizerInfo::legalizeFPTOI( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 MachineIRBuilder &B, bool Signed) const { 2050 2051 Register Dst = MI.getOperand(0).getReg(); 2052 Register Src = MI.getOperand(1).getReg(); 2053 2054 const LLT S64 = LLT::scalar(64); 2055 const LLT S32 = LLT::scalar(32); 2056 2057 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2058 2059 unsigned Flags = MI.getFlags(); 2060 2061 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2062 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2063 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2064 2065 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2066 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2067 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2068 2069 auto Hi = Signed ? 2070 B.buildFPTOSI(S32, FloorMul) : 2071 B.buildFPTOUI(S32, FloorMul); 2072 auto Lo = B.buildFPTOUI(S32, Fma); 2073 2074 B.buildMerge(Dst, { Lo, Hi }); 2075 MI.eraseFromParent(); 2076 2077 return true; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2081 MachineInstr &MI) const { 2082 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2083 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2084 2085 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2086 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2087 2088 // With ieee_mode disabled, the instructions have the correct behavior 2089 // already for G_FMINNUM/G_FMAXNUM 2090 if (!MFI->getMode().IEEE) 2091 return !IsIEEEOp; 2092 2093 if (IsIEEEOp) 2094 return true; 2095 2096 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2097 } 2098 2099 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2100 MachineInstr &MI, MachineRegisterInfo &MRI, 2101 MachineIRBuilder &B) const { 2102 // TODO: Should move some of this into LegalizerHelper. 2103 2104 // TODO: Promote dynamic indexing of s16 to s32 2105 2106 // FIXME: Artifact combiner probably should have replaced the truncated 2107 // constant before this, so we shouldn't need 2108 // getConstantVRegValWithLookThrough. 2109 Optional<ValueAndVReg> MaybeIdxVal = 2110 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2111 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2112 return true; 2113 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2114 2115 Register Dst = MI.getOperand(0).getReg(); 2116 Register Vec = MI.getOperand(1).getReg(); 2117 2118 LLT VecTy = MRI.getType(Vec); 2119 LLT EltTy = VecTy.getElementType(); 2120 assert(EltTy == MRI.getType(Dst)); 2121 2122 if (IdxVal < VecTy.getNumElements()) 2123 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2124 else 2125 B.buildUndef(Dst); 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 2131 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2132 MachineInstr &MI, MachineRegisterInfo &MRI, 2133 MachineIRBuilder &B) const { 2134 // TODO: Should move some of this into LegalizerHelper. 2135 2136 // TODO: Promote dynamic indexing of s16 to s32 2137 2138 // FIXME: Artifact combiner probably should have replaced the truncated 2139 // constant before this, so we shouldn't need 2140 // getConstantVRegValWithLookThrough. 2141 Optional<ValueAndVReg> MaybeIdxVal = 2142 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2143 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2144 return true; 2145 2146 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2147 Register Dst = MI.getOperand(0).getReg(); 2148 Register Vec = MI.getOperand(1).getReg(); 2149 Register Ins = MI.getOperand(2).getReg(); 2150 2151 LLT VecTy = MRI.getType(Vec); 2152 LLT EltTy = VecTy.getElementType(); 2153 assert(EltTy == MRI.getType(Ins)); 2154 2155 if (IdxVal < VecTy.getNumElements()) 2156 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2157 else 2158 B.buildUndef(Dst); 2159 2160 MI.eraseFromParent(); 2161 return true; 2162 } 2163 2164 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2165 MachineInstr &MI, MachineRegisterInfo &MRI, 2166 MachineIRBuilder &B) const { 2167 const LLT V2S16 = LLT::vector(2, 16); 2168 2169 Register Dst = MI.getOperand(0).getReg(); 2170 Register Src0 = MI.getOperand(1).getReg(); 2171 LLT DstTy = MRI.getType(Dst); 2172 LLT SrcTy = MRI.getType(Src0); 2173 2174 if (SrcTy == V2S16 && DstTy == V2S16 && 2175 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2176 return true; 2177 2178 MachineIRBuilder HelperBuilder(MI); 2179 GISelObserverWrapper DummyObserver; 2180 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2181 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2182 } 2183 2184 bool AMDGPULegalizerInfo::legalizeSinCos( 2185 MachineInstr &MI, MachineRegisterInfo &MRI, 2186 MachineIRBuilder &B) const { 2187 2188 Register DstReg = MI.getOperand(0).getReg(); 2189 Register SrcReg = MI.getOperand(1).getReg(); 2190 LLT Ty = MRI.getType(DstReg); 2191 unsigned Flags = MI.getFlags(); 2192 2193 Register TrigVal; 2194 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2195 if (ST.hasTrigReducedRange()) { 2196 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2197 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2198 .addUse(MulVal.getReg(0)) 2199 .setMIFlags(Flags).getReg(0); 2200 } else 2201 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2202 2203 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2204 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2205 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2206 .addUse(TrigVal) 2207 .setMIFlags(Flags); 2208 MI.eraseFromParent(); 2209 return true; 2210 } 2211 2212 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2213 MachineIRBuilder &B, 2214 const GlobalValue *GV, 2215 int64_t Offset, 2216 unsigned GAFlags) const { 2217 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2218 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2219 // to the following code sequence: 2220 // 2221 // For constant address space: 2222 // s_getpc_b64 s[0:1] 2223 // s_add_u32 s0, s0, $symbol 2224 // s_addc_u32 s1, s1, 0 2225 // 2226 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2227 // a fixup or relocation is emitted to replace $symbol with a literal 2228 // constant, which is a pc-relative offset from the encoding of the $symbol 2229 // operand to the global variable. 2230 // 2231 // For global address space: 2232 // s_getpc_b64 s[0:1] 2233 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2234 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2235 // 2236 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2237 // fixups or relocations are emitted to replace $symbol@*@lo and 2238 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2239 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2240 // operand to the global variable. 2241 // 2242 // What we want here is an offset from the value returned by s_getpc 2243 // (which is the address of the s_add_u32 instruction) to the global 2244 // variable, but since the encoding of $symbol starts 4 bytes after the start 2245 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2246 // small. This requires us to add 4 to the global variable offset in order to 2247 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2248 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2249 // instruction. 2250 2251 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2252 2253 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2254 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2255 2256 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2257 .addDef(PCReg); 2258 2259 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2260 if (GAFlags == SIInstrInfo::MO_NONE) 2261 MIB.addImm(0); 2262 else 2263 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2264 2265 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2266 2267 if (PtrTy.getSizeInBits() == 32) 2268 B.buildExtract(DstReg, PCReg, 0); 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2273 MachineInstr &MI, MachineRegisterInfo &MRI, 2274 MachineIRBuilder &B) const { 2275 Register DstReg = MI.getOperand(0).getReg(); 2276 LLT Ty = MRI.getType(DstReg); 2277 unsigned AS = Ty.getAddressSpace(); 2278 2279 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2280 MachineFunction &MF = B.getMF(); 2281 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2282 2283 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2284 if (!MFI->isModuleEntryFunction()) { 2285 const Function &Fn = MF.getFunction(); 2286 DiagnosticInfoUnsupported BadLDSDecl( 2287 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2288 DS_Warning); 2289 Fn.getContext().diagnose(BadLDSDecl); 2290 2291 // We currently don't have a way to correctly allocate LDS objects that 2292 // aren't directly associated with a kernel. We do force inlining of 2293 // functions that use local objects. However, if these dead functions are 2294 // not eliminated, we don't want a compile time error. Just emit a warning 2295 // and a trap, since there should be no callable path here. 2296 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2297 B.buildUndef(DstReg); 2298 MI.eraseFromParent(); 2299 return true; 2300 } 2301 2302 // TODO: We could emit code to handle the initialization somewhere. 2303 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2304 const SITargetLowering *TLI = ST.getTargetLowering(); 2305 if (!TLI->shouldUseLDSConstAddress(GV)) { 2306 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2307 return true; // Leave in place; 2308 } 2309 2310 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2311 Type *Ty = GV->getValueType(); 2312 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2313 // zero-sized type in other languages to declare the dynamic shared 2314 // memory which size is not known at the compile time. They will be 2315 // allocated by the runtime and placed directly after the static 2316 // allocated ones. They all share the same offset. 2317 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2318 // Adjust alignment for that dynamic shared memory array. 
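  // The address is materialized roughly as (placeholder register names):
  //   %size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize)
  //   %dst:_(p3)   = G_INTTOPTR %size
  // so the dynamic array starts right after all static LDS allocations.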
2319 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2320 LLT S32 = LLT::scalar(32); 2321 auto Sz = 2322 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2323 B.buildIntToPtr(DstReg, Sz); 2324 MI.eraseFromParent(); 2325 return true; 2326 } 2327 } 2328 2329 B.buildConstant( 2330 DstReg, 2331 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2332 MI.eraseFromParent(); 2333 return true; 2334 } 2335 2336 const Function &Fn = MF.getFunction(); 2337 DiagnosticInfoUnsupported BadInit( 2338 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2339 Fn.getContext().diagnose(BadInit); 2340 return true; 2341 } 2342 2343 const SITargetLowering *TLI = ST.getTargetLowering(); 2344 2345 if (TLI->shouldEmitFixup(GV)) { 2346 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 if (TLI->shouldEmitPCReloc(GV)) { 2352 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2353 MI.eraseFromParent(); 2354 return true; 2355 } 2356 2357 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2358 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2359 2360 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2361 MachinePointerInfo::getGOT(MF), 2362 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2363 MachineMemOperand::MOInvariant, 2364 8 /*Size*/, Align(8)); 2365 2366 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2367 2368 if (Ty.getSizeInBits() == 32) { 2369 // Truncate if this is a 32-bit constant address. 2370 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2371 B.buildExtract(DstReg, Load, 0); 2372 } else 2373 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2374 2375 MI.eraseFromParent(); 2376 return true; 2377 } 2378 2379 static LLT widenToNextPowerOf2(LLT Ty) { 2380 if (Ty.isVector()) 2381 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2382 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2383 } 2384 2385 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2386 MachineInstr &MI) const { 2387 MachineIRBuilder &B = Helper.MIRBuilder; 2388 MachineRegisterInfo &MRI = *B.getMRI(); 2389 GISelChangeObserver &Observer = Helper.Observer; 2390 2391 Register PtrReg = MI.getOperand(1).getReg(); 2392 LLT PtrTy = MRI.getType(PtrReg); 2393 unsigned AddrSpace = PtrTy.getAddressSpace(); 2394 2395 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2396 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2397 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2398 Observer.changingInstr(MI); 2399 MI.getOperand(1).setReg(Cast.getReg(0)); 2400 Observer.changedInstr(MI); 2401 return true; 2402 } 2403 2404 Register ValReg = MI.getOperand(0).getReg(); 2405 LLT ValTy = MRI.getType(ValReg); 2406 2407 MachineMemOperand *MMO = *MI.memoperands_begin(); 2408 const unsigned ValSize = ValTy.getSizeInBits(); 2409 const unsigned MemSize = 8 * MMO->getSize(); 2410 const Align MemAlign = MMO->getAlign(); 2411 const unsigned AlignInBits = 8 * MemAlign.value(); 2412 2413 // Widen non-power-of-2 loads to the alignment if needed 2414 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2415 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2416 2417 // This was already the correct extending load result type, so just adjust 2418 // the memory type.
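  // Example: an s32 result backed by a 3-byte (24-bit) extending load has
  // WideMemSize == ValSize == 32, so only the MMO is rewritten below, while
  // something like a <3 x s32> load falls through to the widen-and-extract
  // handling further down.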
2419 if (WideMemSize == ValSize) { 2420 MachineFunction &MF = B.getMF(); 2421 2422 MachineMemOperand *WideMMO = 2423 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2424 Observer.changingInstr(MI); 2425 MI.setMemRefs(MF, {WideMMO}); 2426 Observer.changedInstr(MI); 2427 return true; 2428 } 2429 2430 // Don't bother handling edge case that should probably never be produced. 2431 if (ValSize > WideMemSize) 2432 return false; 2433 2434 LLT WideTy = widenToNextPowerOf2(ValTy); 2435 2436 Register WideLoad; 2437 if (!WideTy.isVector()) { 2438 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2439 B.buildTrunc(ValReg, WideLoad).getReg(0); 2440 } else { 2441 // Extract the subvector. 2442 2443 if (isRegisterType(ValTy)) { 2444 // If this a case where G_EXTRACT is legal, use it. 2445 // (e.g. <3 x s32> -> <4 x s32>) 2446 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2447 B.buildExtract(ValReg, WideLoad, 0); 2448 } else { 2449 // For cases where the widened type isn't a nice register value, unmerge 2450 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2451 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2452 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2453 B.setInsertPt(B.getMBB(), MI.getIterator()); 2454 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2455 } 2456 } 2457 2458 MI.eraseFromParent(); 2459 return true; 2460 } 2461 2462 return false; 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeFMad( 2466 MachineInstr &MI, MachineRegisterInfo &MRI, 2467 MachineIRBuilder &B) const { 2468 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2469 assert(Ty.isScalar()); 2470 2471 MachineFunction &MF = B.getMF(); 2472 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2473 2474 // TODO: Always legal with future ftz flag. 2475 // FIXME: Do we need just output? 
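  // G_FMAD is kept only when the corresponding denormal mode flushes, since
  // the mad/mac instructions it selects to do not respect denormals;
  // otherwise it is expanded to fmul + fadd by the helper below.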
2476 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2477 return true; 2478 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2479 return true; 2480 2481 MachineIRBuilder HelperBuilder(MI); 2482 GISelObserverWrapper DummyObserver; 2483 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2484 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2485 } 2486 2487 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2488 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2489 Register DstReg = MI.getOperand(0).getReg(); 2490 Register PtrReg = MI.getOperand(1).getReg(); 2491 Register CmpVal = MI.getOperand(2).getReg(); 2492 Register NewVal = MI.getOperand(3).getReg(); 2493 2494 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2495 "this should not have been custom lowered"); 2496 2497 LLT ValTy = MRI.getType(CmpVal); 2498 LLT VecTy = LLT::vector(2, ValTy); 2499 2500 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2501 2502 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2503 .addDef(DstReg) 2504 .addUse(PtrReg) 2505 .addUse(PackedVal) 2506 .setMemRefs(MI.memoperands()); 2507 2508 MI.eraseFromParent(); 2509 return true; 2510 } 2511 2512 bool AMDGPULegalizerInfo::legalizeFlog( 2513 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2514 Register Dst = MI.getOperand(0).getReg(); 2515 Register Src = MI.getOperand(1).getReg(); 2516 LLT Ty = B.getMRI()->getType(Dst); 2517 unsigned Flags = MI.getFlags(); 2518 2519 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2520 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2521 2522 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2523 MI.eraseFromParent(); 2524 return true; 2525 } 2526 2527 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2528 MachineIRBuilder &B) const { 2529 Register Dst = MI.getOperand(0).getReg(); 2530 Register Src = MI.getOperand(1).getReg(); 2531 unsigned Flags = MI.getFlags(); 2532 LLT Ty = B.getMRI()->getType(Dst); 2533 2534 auto K = B.buildFConstant(Ty, numbers::log2e); 2535 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2536 B.buildFExp2(Dst, Mul, Flags); 2537 MI.eraseFromParent(); 2538 return true; 2539 } 2540 2541 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2542 MachineIRBuilder &B) const { 2543 Register Dst = MI.getOperand(0).getReg(); 2544 Register Src0 = MI.getOperand(1).getReg(); 2545 Register Src1 = MI.getOperand(2).getReg(); 2546 unsigned Flags = MI.getFlags(); 2547 LLT Ty = B.getMRI()->getType(Dst); 2548 const LLT S16 = LLT::scalar(16); 2549 const LLT S32 = LLT::scalar(32); 2550 2551 if (Ty == S32) { 2552 auto Log = B.buildFLog2(S32, Src0, Flags); 2553 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2554 .addUse(Log.getReg(0)) 2555 .addUse(Src1) 2556 .setMIFlags(Flags); 2557 B.buildFExp2(Dst, Mul, Flags); 2558 } else if (Ty == S16) { 2559 // There's no f16 fmul_legacy, so we need to convert for it. 
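  // i.e. pow(x, y) is expanded as exp2(y * log2(x)); for f16 the multiply
  // is done in f32 through fmul_legacy and the result truncated back.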
2560 auto Log = B.buildFLog2(S16, Src0, Flags); 2561 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2562 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2563 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2564 .addUse(Ext0.getReg(0)) 2565 .addUse(Ext1.getReg(0)) 2566 .setMIFlags(Flags); 2567 2568 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2569 } else 2570 return false; 2571 2572 MI.eraseFromParent(); 2573 return true; 2574 } 2575 2576 // Find a source register, ignoring any possible source modifiers. 2577 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2578 Register ModSrc = OrigSrc; 2579 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2580 ModSrc = SrcFNeg->getOperand(1).getReg(); 2581 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2582 ModSrc = SrcFAbs->getOperand(1).getReg(); 2583 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2584 ModSrc = SrcFAbs->getOperand(1).getReg(); 2585 return ModSrc; 2586 } 2587 2588 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2589 MachineRegisterInfo &MRI, 2590 MachineIRBuilder &B) const { 2591 2592 const LLT S1 = LLT::scalar(1); 2593 const LLT S64 = LLT::scalar(64); 2594 Register Dst = MI.getOperand(0).getReg(); 2595 Register OrigSrc = MI.getOperand(1).getReg(); 2596 unsigned Flags = MI.getFlags(); 2597 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2598 "this should not have been custom lowered"); 2599 2600 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2601 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2602 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2603 // V_FRACT bug is: 2604 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2605 // 2606 // Convert floor(x) to (x - fract(x)) 2607 2608 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2609 .addUse(OrigSrc) 2610 .setMIFlags(Flags); 2611 2612 // Give source modifier matching some assistance before obscuring a foldable 2613 // pattern. 2614 2615 // TODO: We can avoid the neg on the fract? The input sign to fract 2616 // shouldn't matter? 2617 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2618 2619 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2620 2621 Register Min = MRI.createGenericVirtualRegister(S64); 2622 2623 // We don't need to concern ourselves with the snan handling difference, so 2624 // use the one which will directly select. 2625 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2626 if (MFI->getMode().IEEE) 2627 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2628 else 2629 B.buildFMinNum(Min, Fract, Const, Flags); 2630 2631 Register CorrectedFract = Min; 2632 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2633 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2634 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2635 } 2636 2637 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2638 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2639 2640 MI.eraseFromParent(); 2641 return true; 2642 } 2643 2644 // Turn an illegal packed v2s16 build vector into bit operations. 2645 // TODO: This should probably be a bitcast action in LegalizerHelper. 
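// The rewrite is effectively (placeholder register names):
//   %merge:_(s32)     = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge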
2646 bool AMDGPULegalizerInfo::legalizeBuildVector( 2647 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2648 Register Dst = MI.getOperand(0).getReg(); 2649 const LLT S32 = LLT::scalar(32); 2650 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2651 2652 Register Src0 = MI.getOperand(1).getReg(); 2653 Register Src1 = MI.getOperand(2).getReg(); 2654 assert(MRI.getType(Src0) == LLT::scalar(16)); 2655 2656 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2657 B.buildBitcast(Dst, Merge); 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 // Check that this is a G_XOR x, -1 2664 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2665 if (MI.getOpcode() != TargetOpcode::G_XOR) 2666 return false; 2667 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2668 return ConstVal && *ConstVal == -1; 2669 } 2670 2671 // Return the use branch instruction, otherwise null if the usage is invalid. 2672 static MachineInstr * 2673 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2674 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2675 Register CondDef = MI.getOperand(0).getReg(); 2676 if (!MRI.hasOneNonDBGUse(CondDef)) 2677 return nullptr; 2678 2679 MachineBasicBlock *Parent = MI.getParent(); 2680 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2681 2682 if (isNot(MRI, *UseMI)) { 2683 Register NegatedCond = UseMI->getOperand(0).getReg(); 2684 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2685 return nullptr; 2686 2687 // We're deleting the def of this value, so we need to remove it. 2688 UseMI->eraseFromParent(); 2689 2690 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2691 Negated = true; 2692 } 2693 2694 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2695 return nullptr; 2696 2697 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2698 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2699 if (Next == Parent->end()) { 2700 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2701 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2702 return nullptr; 2703 UncondBrTarget = &*NextMBB; 2704 } else { 2705 if (Next->getOpcode() != AMDGPU::G_BR) 2706 return nullptr; 2707 Br = &*Next; 2708 UncondBrTarget = Br->getOperand(0).getMBB(); 2709 } 2710 2711 return UseMI; 2712 } 2713 2714 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2715 const ArgDescriptor *Arg, 2716 const TargetRegisterClass *ArgRC, 2717 LLT ArgTy) const { 2718 MCRegister SrcReg = Arg->getRegister(); 2719 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2720 assert(DstReg.isVirtual() && "Virtual register expected"); 2721 2722 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2723 ArgTy); 2724 if (Arg->isMasked()) { 2725 // TODO: Should we try to emit this once in the entry block? 
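  // e.g. with packed work-item IDs the X/Y/Z components share one register
  // (masks of roughly 0x3ff, 0x3ff << 10, 0x3ff << 20), and the value is
  // recovered as (reg >> Shift) & (Mask >> Shift), which is what the
  // shift-and-mask below builds.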
2726 const LLT S32 = LLT::scalar(32); 2727 const unsigned Mask = Arg->getMask(); 2728 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2729 2730 Register AndMaskSrc = LiveIn; 2731 2732 if (Shift != 0) { 2733 auto ShiftAmt = B.buildConstant(S32, Shift); 2734 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2735 } 2736 2737 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2738 } else { 2739 B.buildCopy(DstReg, LiveIn); 2740 } 2741 2742 return true; 2743 } 2744 2745 bool AMDGPULegalizerInfo::loadInputValue( 2746 Register DstReg, MachineIRBuilder &B, 2747 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2748 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2749 const ArgDescriptor *Arg; 2750 const TargetRegisterClass *ArgRC; 2751 LLT ArgTy; 2752 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2753 2754 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2755 return false; // TODO: Handle these 2756 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2757 } 2758 2759 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2760 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2761 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2762 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2763 return false; 2764 2765 MI.eraseFromParent(); 2766 return true; 2767 } 2768 2769 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2770 MachineRegisterInfo &MRI, 2771 MachineIRBuilder &B) const { 2772 Register Dst = MI.getOperand(0).getReg(); 2773 LLT DstTy = MRI.getType(Dst); 2774 LLT S16 = LLT::scalar(16); 2775 LLT S32 = LLT::scalar(32); 2776 LLT S64 = LLT::scalar(64); 2777 2778 if (DstTy == S16) 2779 return legalizeFDIV16(MI, MRI, B); 2780 if (DstTy == S32) 2781 return legalizeFDIV32(MI, MRI, B); 2782 if (DstTy == S64) 2783 return legalizeFDIV64(MI, MRI, B); 2784 2785 return false; 2786 } 2787 2788 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2789 Register DstReg, 2790 Register X, 2791 Register Y, 2792 bool IsDiv) const { 2793 const LLT S1 = LLT::scalar(1); 2794 const LLT S32 = LLT::scalar(32); 2795 2796 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2797 // algorithm used here. 2798 2799 // Initial estimate of inv(y). 2800 auto FloatY = B.buildUITOFP(S32, Y); 2801 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2802 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2803 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2804 auto Z = B.buildFPTOUI(S32, ScaledY); 2805 2806 // One round of UNR. 2807 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2808 auto NegYZ = B.buildMul(S32, NegY, Z); 2809 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2810 2811 // Quotient/remainder estimate. 2812 auto Q = B.buildUMulH(S32, X, Z); 2813 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2814 2815 // First quotient/remainder refinement. 2816 auto One = B.buildConstant(S32, 1); 2817 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2818 if (IsDiv) 2819 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2820 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2821 2822 // Second quotient/remainder refinement. 
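  // The estimate refined above may still undershoot the exact quotient
  // (by no more than these correction steps can absorb), so one more
  // conditional add/subtract is applied here; see the expandDivRem32
  // reference at the top of this function for the derivation.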
2823 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2824 if (IsDiv) 2825 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2826 else 2827 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2828 } 2829 2830 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2831 MachineRegisterInfo &MRI, 2832 MachineIRBuilder &B) const { 2833 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2834 Register DstReg = MI.getOperand(0).getReg(); 2835 Register Num = MI.getOperand(1).getReg(); 2836 Register Den = MI.getOperand(2).getReg(); 2837 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2838 MI.eraseFromParent(); 2839 return true; 2840 } 2841 2842 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2843 // 2844 // Return lo, hi of result 2845 // 2846 // %cvt.lo = G_UITOFP Val.lo 2847 // %cvt.hi = G_UITOFP Val.hi 2848 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2849 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2850 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2851 // %mul2 = G_FMUL %mul1, 2**(-32) 2852 // %trunc = G_INTRINSIC_TRUNC %mul2 2853 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2854 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2855 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2856 Register Val) { 2857 const LLT S32 = LLT::scalar(32); 2858 auto Unmerge = B.buildUnmerge(S32, Val); 2859 2860 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2861 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2862 2863 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2864 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2865 2866 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2867 auto Mul1 = 2868 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2869 2870 // 2**(-32) 2871 auto Mul2 = 2872 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2873 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2874 2875 // -(2**32) 2876 auto Mad2 = B.buildFMAD(S32, Trunc, 2877 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2878 2879 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2880 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2881 2882 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2883 } 2884 2885 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2886 Register DstReg, 2887 Register Numer, 2888 Register Denom, 2889 bool IsDiv) const { 2890 const LLT S32 = LLT::scalar(32); 2891 const LLT S64 = LLT::scalar(64); 2892 const LLT S1 = LLT::scalar(1); 2893 Register RcpLo, RcpHi; 2894 2895 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2896 2897 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2898 2899 auto Zero64 = B.buildConstant(S64, 0); 2900 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2901 2902 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2903 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2904 2905 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2906 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2907 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2908 2909 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2910 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2911 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2912 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2913 2914 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2915 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2916 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2917 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2918 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2919 2920 auto Zero32 =
B.buildConstant(S32, 0); 2921 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2922 auto Add2_HiC = 2923 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2924 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2925 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2926 2927 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2928 Register NumerLo = UnmergeNumer.getReg(0); 2929 Register NumerHi = UnmergeNumer.getReg(1); 2930 2931 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2932 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2933 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2934 Register Mul3_Lo = UnmergeMul3.getReg(0); 2935 Register Mul3_Hi = UnmergeMul3.getReg(1); 2936 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2937 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2938 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2939 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2940 2941 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2942 Register DenomLo = UnmergeDenom.getReg(0); 2943 Register DenomHi = UnmergeDenom.getReg(1); 2944 2945 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2946 auto C1 = B.buildSExt(S32, CmpHi); 2947 2948 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2949 auto C2 = B.buildSExt(S32, CmpLo); 2950 2951 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2952 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2953 2954 // TODO: Here and below portions of the code can be enclosed into if/endif. 2955 // Currently control flow is unconditional and we have 4 selects after 2956 // potential endif to substitute PHIs. 2957 2958 // if C3 != 0 ... 2959 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2960 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2961 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2962 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2963 2964 auto One64 = B.buildConstant(S64, 1); 2965 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2966 2967 auto C4 = 2968 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2969 auto C5 = 2970 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2971 auto C6 = B.buildSelect( 2972 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2973 2974 // if (C6 != 0) 2975 auto Add4 = B.buildAdd(S64, Add3, One64); 2976 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2977 2978 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2979 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2980 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2981 2982 // endif C6 2983 // endif C3 2984 2985 if (IsDiv) { 2986 auto Sel1 = B.buildSelect( 2987 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2988 B.buildSelect(DstReg, 2989 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2990 } else { 2991 auto Sel2 = B.buildSelect( 2992 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2993 B.buildSelect(DstReg, 2994 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2995 } 2996 } 2997 2998 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2999 MachineRegisterInfo &MRI, 3000 MachineIRBuilder &B) const { 3001 const LLT S64 = LLT::scalar(64); 3002 const LLT S32 = LLT::scalar(32); 3003 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 3004 Register DstReg = MI.getOperand(0).getReg(); 3005 Register Num 
= MI.getOperand(1).getReg(); 3006 Register Den = MI.getOperand(2).getReg(); 3007 LLT Ty = MRI.getType(DstReg); 3008 3009 if (Ty == S32) 3010 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3011 else if (Ty == S64) 3012 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3013 else 3014 return false; 3015 3016 MI.eraseFromParent(); 3017 return true; 3018 3019 } 3020 3021 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3022 MachineRegisterInfo &MRI, 3023 MachineIRBuilder &B) const { 3024 const LLT S64 = LLT::scalar(64); 3025 const LLT S32 = LLT::scalar(32); 3026 3027 Register DstReg = MI.getOperand(0).getReg(); 3028 const LLT Ty = MRI.getType(DstReg); 3029 if (Ty != S32 && Ty != S64) 3030 return false; 3031 3032 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3033 3034 Register LHS = MI.getOperand(1).getReg(); 3035 Register RHS = MI.getOperand(2).getReg(); 3036 3037 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3038 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3039 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3040 3041 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3042 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3043 3044 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3045 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3046 3047 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3048 if (Ty == S32) 3049 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3050 else 3051 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3052 3053 Register Sign; 3054 if (IsDiv) 3055 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3056 else 3057 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3058 3059 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3060 B.buildSub(DstReg, UDivRem, Sign); 3061 3062 MI.eraseFromParent(); 3063 return true; 3064 } 3065 3066 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3067 MachineRegisterInfo &MRI, 3068 MachineIRBuilder &B) const { 3069 Register Res = MI.getOperand(0).getReg(); 3070 Register LHS = MI.getOperand(1).getReg(); 3071 Register RHS = MI.getOperand(2).getReg(); 3072 uint16_t Flags = MI.getFlags(); 3073 LLT ResTy = MRI.getType(Res); 3074 3075 const MachineFunction &MF = B.getMF(); 3076 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3077 MI.getFlag(MachineInstr::FmAfn); 3078 3079 if (!AllowInaccurateRcp) 3080 return false; 3081 3082 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3083 // 1 / x -> RCP(x) 3084 if (CLHS->isExactlyValue(1.0)) { 3085 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3086 .addUse(RHS) 3087 .setMIFlags(Flags); 3088 3089 MI.eraseFromParent(); 3090 return true; 3091 } 3092 3093 // -1 / x -> RCP( FNEG(x) ) 3094 if (CLHS->isExactlyValue(-1.0)) { 3095 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3096 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3097 .addUse(FNeg.getReg(0)) 3098 .setMIFlags(Flags); 3099 3100 MI.eraseFromParent(); 3101 return true; 3102 } 3103 } 3104 3105 // x / y -> x * (1.0 / y) 3106 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3107 .addUse(RHS) 3108 .setMIFlags(Flags); 3109 B.buildFMul(Res, LHS, RCP, Flags); 3110 3111 MI.eraseFromParent(); 3112 return true; 3113 } 3114 3115 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3116 MachineRegisterInfo &MRI, 3117 MachineIRBuilder &B) const { 3118 Register Res = MI.getOperand(0).getReg(); 3119 Register X = MI.getOperand(1).getReg(); 3120 Register Y = MI.getOperand(2).getReg(); 3121 uint16_t Flags = MI.getFlags(); 3122 
LLT ResTy = MRI.getType(Res); 3123 3124 const MachineFunction &MF = B.getMF(); 3125 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3126 MI.getFlag(MachineInstr::FmAfn); 3127 3128 if (!AllowInaccurateRcp) 3129 return false; 3130 3131 auto NegY = B.buildFNeg(ResTy, Y); 3132 auto One = B.buildFConstant(ResTy, 1.0); 3133 3134 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3135 .addUse(Y) 3136 .setMIFlags(Flags); 3137 3138 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3139 R = B.buildFMA(ResTy, Tmp0, R, R); 3140 3141 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3142 R = B.buildFMA(ResTy, Tmp1, R, R); 3143 3144 auto Ret = B.buildFMul(ResTy, X, R); 3145 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3146 3147 B.buildFMA(Res, Tmp2, R, Ret); 3148 MI.eraseFromParent(); 3149 return true; 3150 } 3151 3152 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3153 MachineRegisterInfo &MRI, 3154 MachineIRBuilder &B) const { 3155 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3156 return true; 3157 3158 Register Res = MI.getOperand(0).getReg(); 3159 Register LHS = MI.getOperand(1).getReg(); 3160 Register RHS = MI.getOperand(2).getReg(); 3161 3162 uint16_t Flags = MI.getFlags(); 3163 3164 LLT S16 = LLT::scalar(16); 3165 LLT S32 = LLT::scalar(32); 3166 3167 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3168 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3169 3170 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3171 .addUse(RHSExt.getReg(0)) 3172 .setMIFlags(Flags); 3173 3174 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3175 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3176 3177 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3178 .addUse(RDst.getReg(0)) 3179 .addUse(RHS) 3180 .addUse(LHS) 3181 .setMIFlags(Flags); 3182 3183 MI.eraseFromParent(); 3184 return true; 3185 } 3186 3187 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3188 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3189 static void toggleSPDenormMode(bool Enable, 3190 MachineIRBuilder &B, 3191 const GCNSubtarget &ST, 3192 AMDGPU::SIModeRegisterDefaults Mode) { 3193 // Set SP denorm mode to this value. 3194 unsigned SPDenormMode = 3195 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3196 3197 if (ST.hasDenormModeInst()) { 3198 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3199 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3200 3201 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3202 B.buildInstr(AMDGPU::S_DENORM_MODE) 3203 .addImm(NewDenormModeValue); 3204 3205 } else { 3206 // Select FP32 bit field in mode register. 
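  // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm controls sit in
  // bits [5:4] of the MODE register, hence the offset of 4 and width-1 of 1.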
3207 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3208 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3209 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3210 3211 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3212 .addImm(SPDenormMode) 3213 .addImm(SPDenormModeBitField); 3214 } 3215 } 3216 3217 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3218 MachineRegisterInfo &MRI, 3219 MachineIRBuilder &B) const { 3220 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3221 return true; 3222 3223 Register Res = MI.getOperand(0).getReg(); 3224 Register LHS = MI.getOperand(1).getReg(); 3225 Register RHS = MI.getOperand(2).getReg(); 3226 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3227 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3228 3229 uint16_t Flags = MI.getFlags(); 3230 3231 LLT S32 = LLT::scalar(32); 3232 LLT S1 = LLT::scalar(1); 3233 3234 auto One = B.buildFConstant(S32, 1.0f); 3235 3236 auto DenominatorScaled = 3237 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3238 .addUse(LHS) 3239 .addUse(RHS) 3240 .addImm(0) 3241 .setMIFlags(Flags); 3242 auto NumeratorScaled = 3243 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3244 .addUse(LHS) 3245 .addUse(RHS) 3246 .addImm(1) 3247 .setMIFlags(Flags); 3248 3249 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3250 .addUse(DenominatorScaled.getReg(0)) 3251 .setMIFlags(Flags); 3252 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3253 3254 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3255 // aren't modeled as reading it. 3256 if (!Mode.allFP32Denormals()) 3257 toggleSPDenormMode(true, B, ST, Mode); 3258 3259 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3260 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3261 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3262 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3263 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3264 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3265 3266 if (!Mode.allFP32Denormals()) 3267 toggleSPDenormMode(false, B, ST, Mode); 3268 3269 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3270 .addUse(Fma4.getReg(0)) 3271 .addUse(Fma1.getReg(0)) 3272 .addUse(Fma3.getReg(0)) 3273 .addUse(NumeratorScaled.getReg(1)) 3274 .setMIFlags(Flags); 3275 3276 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3277 .addUse(Fmas.getReg(0)) 3278 .addUse(RHS) 3279 .addUse(LHS) 3280 .setMIFlags(Flags); 3281 3282 MI.eraseFromParent(); 3283 return true; 3284 } 3285 3286 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3287 MachineRegisterInfo &MRI, 3288 MachineIRBuilder &B) const { 3289 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3290 return true; 3291 3292 Register Res = MI.getOperand(0).getReg(); 3293 Register LHS = MI.getOperand(1).getReg(); 3294 Register RHS = MI.getOperand(2).getReg(); 3295 3296 uint16_t Flags = MI.getFlags(); 3297 3298 LLT S64 = LLT::scalar(64); 3299 LLT S1 = LLT::scalar(1); 3300 3301 auto One = B.buildFConstant(S64, 1.0); 3302 3303 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3304 .addUse(LHS) 3305 .addUse(RHS) 3306 .addImm(0) 3307 .setMIFlags(Flags); 3308 3309 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3310 3311 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3312 .addUse(DivScale0.getReg(0)) 3313 .setMIFlags(Flags); 3314 3315 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3316 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3317 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3318 3319 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3320 .addUse(LHS) 3321 .addUse(RHS) 3322 .addImm(1) 3323 .setMIFlags(Flags); 3324 3325 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3326 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3327 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3328 3329 Register Scale; 3330 if (!ST.hasUsableDivScaleConditionOutput()) { 3331 // Workaround a hardware bug on SI where the condition output from div_scale 3332 // is not usable. 3333 3334 LLT S32 = LLT::scalar(32); 3335 3336 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3337 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3338 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3339 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3340 3341 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3342 Scale1Unmerge.getReg(1)); 3343 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3344 Scale0Unmerge.getReg(1)); 3345 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3346 } else { 3347 Scale = DivScale1.getReg(1); 3348 } 3349 3350 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3351 .addUse(Fma4.getReg(0)) 3352 .addUse(Fma3.getReg(0)) 3353 .addUse(Mul.getReg(0)) 3354 .addUse(Scale) 3355 .setMIFlags(Flags); 3356 3357 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3358 .addUse(Fmas.getReg(0)) 3359 .addUse(RHS) 3360 .addUse(LHS) 3361 .setMIFlags(Flags); 3362 3363 MI.eraseFromParent(); 3364 return true; 3365 } 3366 3367 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3368 MachineRegisterInfo &MRI, 3369 MachineIRBuilder &B) const { 3370 Register Res = MI.getOperand(0).getReg(); 3371 Register LHS = MI.getOperand(2).getReg(); 3372 Register RHS = MI.getOperand(3).getReg(); 3373 uint16_t Flags = MI.getFlags(); 3374 3375 LLT S32 = LLT::scalar(32); 3376 LLT S1 = LLT::scalar(1); 3377 3378 auto Abs = B.buildFAbs(S32, RHS, Flags); 3379 const APFloat C0Val(1.0f); 3380 3381 auto C0 = B.buildConstant(S32, 0x6f800000); 3382 auto C1 = B.buildConstant(S32, 0x2f800000); 3383 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3384 3385 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3386 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3387 3388 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3389 3390 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3391 .addUse(Mul0.getReg(0)) 3392 .setMIFlags(Flags); 3393 3394 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3395 3396 B.buildFMul(Res, Sel, Mul1, Flags); 3397 3398 MI.eraseFromParent(); 3399 return true; 3400 } 3401 3402 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3403 // FIXME: Why do we handle this one but not other removed instructions? 3404 // 3405 // Reciprocal square root. The clamp prevents infinite results, clamping 3406 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3407 // +-max_float. 
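//
// The expansion below follows that definition directly (sketch for
// illustration only): compute rsq(x), then clamp the result into
// [-max_float, +max_float] with an fmin/fmax pair, using the IEEE variants
// when the function runs in IEEE mode. Roughly:
//   %rsq   = llvm.amdgcn.rsq(%src)
//   %low   = fmin(%rsq, +max_float)
//   %clamp = fmax(%low, -max_float)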
3408 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3409 MachineRegisterInfo &MRI, 3410 MachineIRBuilder &B) const { 3411 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3412 return true; 3413 3414 Register Dst = MI.getOperand(0).getReg(); 3415 Register Src = MI.getOperand(2).getReg(); 3416 auto Flags = MI.getFlags(); 3417 3418 LLT Ty = MRI.getType(Dst); 3419 3420 const fltSemantics *FltSemantics; 3421 if (Ty == LLT::scalar(32)) 3422 FltSemantics = &APFloat::IEEEsingle(); 3423 else if (Ty == LLT::scalar(64)) 3424 FltSemantics = &APFloat::IEEEdouble(); 3425 else 3426 return false; 3427 3428 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3429 .addUse(Src) 3430 .setMIFlags(Flags); 3431 3432 // We don't need to concern ourselves with the snan handling difference, since 3433 // the rsq quieted (or not) so use the one which will directly select. 3434 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3435 const bool UseIEEE = MFI->getMode().IEEE; 3436 3437 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3438 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3439 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3440 3441 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3442 3443 if (UseIEEE) 3444 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3445 else 3446 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3447 MI.eraseFromParent(); 3448 return true; 3449 } 3450 3451 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3452 switch (IID) { 3453 case Intrinsic::amdgcn_ds_fadd: 3454 return AMDGPU::G_ATOMICRMW_FADD; 3455 case Intrinsic::amdgcn_ds_fmin: 3456 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3457 case Intrinsic::amdgcn_ds_fmax: 3458 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3459 default: 3460 llvm_unreachable("not a DS FP intrinsic"); 3461 } 3462 } 3463 3464 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3465 MachineInstr &MI, 3466 Intrinsic::ID IID) const { 3467 GISelChangeObserver &Observer = Helper.Observer; 3468 Observer.changingInstr(MI); 3469 3470 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3471 3472 // The remaining operands were used to set fields in the MemOperand on 3473 // construction. 3474 for (int I = 6; I > 3; --I) 3475 MI.RemoveOperand(I); 3476 3477 MI.RemoveOperand(1); // Remove the intrinsic ID. 
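// At this point only (dst, ptr, value) remain, which matches the generic
// atomicrmw form. For example (illustrative, assuming the usual ds.fadd
// signature of ptr, value, ordering, scope, isVolatile), the intrinsic call
// collapses to a G_ATOMICRMW_FADD of (ptr, value), with the memory
// properties preserved in the instruction's MachineMemOperand.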
3478 Observer.changedInstr(MI); 3479 return true; 3480 } 3481 3482 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3483 MachineRegisterInfo &MRI, 3484 MachineIRBuilder &B) const { 3485 uint64_t Offset = 3486 ST.getTargetLowering()->getImplicitParameterOffset( 3487 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3488 LLT DstTy = MRI.getType(DstReg); 3489 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3490 3491 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3492 if (!loadInputValue(KernargPtrReg, B, 3493 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3494 return false; 3495 3496 // FIXME: This should be nuw 3497 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3498 return true; 3499 } 3500 3501 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3502 MachineRegisterInfo &MRI, 3503 MachineIRBuilder &B) const { 3504 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3505 if (!MFI->isEntryFunction()) { 3506 return legalizePreloadedArgIntrin(MI, MRI, B, 3507 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3508 } 3509 3510 Register DstReg = MI.getOperand(0).getReg(); 3511 if (!getImplicitArgPtr(DstReg, MRI, B)) 3512 return false; 3513 3514 MI.eraseFromParent(); 3515 return true; 3516 } 3517 3518 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3519 MachineRegisterInfo &MRI, 3520 MachineIRBuilder &B, 3521 unsigned AddrSpace) const { 3522 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3523 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3524 Register Hi32 = Unmerge.getReg(1); 3525 3526 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3527 MI.eraseFromParent(); 3528 return true; 3529 } 3530 3531 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3532 // offset (the offset that is included in bounds checking and swizzling, to be 3533 // split between the instruction's voffset and immoffset fields) and soffset 3534 // (the offset that is excluded from bounds checking and swizzling, to go in 3535 // the instruction's soffset field). This function takes the first kind of 3536 // offset and figures out how to split it between voffset and immoffset. 3537 std::tuple<Register, unsigned, unsigned> 3538 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3539 Register OrigOffset) const { 3540 const unsigned MaxImm = 4095; 3541 Register BaseReg; 3542 unsigned TotalConstOffset; 3543 const LLT S32 = LLT::scalar(32); 3544 MachineRegisterInfo &MRI = *B.getMRI(); 3545 3546 std::tie(BaseReg, TotalConstOffset) = 3547 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3548 3549 unsigned ImmOffset = TotalConstOffset; 3550 3551 // If BaseReg is a pointer, convert it to int. 3552 if (MRI.getType(BaseReg).isPointer()) 3553 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3554 3555 // If the immediate value is too big for the immoffset field, put the value 3556 // and -4096 into the immoffset field so that the value that is copied/added 3557 // for the voffset field is a multiple of 4096, and it stands more chance 3558 // of being CSEd with the copy/add for another similar load/store. 3559 // However, do not do that rounding down to a multiple of 4096 if that is a 3560 // negative number, as it appears to be illegal to have a negative offset 3561 // in the vgpr, even if adding the immediate offset makes it positive. 
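// As a concrete example (for illustration only): a constant offset of 5000
// splits as 5000 & ~4095 = 4096 into the voffset add and 5000 - 4096 = 904
// into the immoffset field, so nearby accesses off the same base have a
// chance to share the voffset-producing add.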
3562 unsigned Overflow = ImmOffset & ~MaxImm; 3563 ImmOffset -= Overflow; 3564 if ((int32_t)Overflow < 0) { 3565 Overflow += ImmOffset; 3566 ImmOffset = 0; 3567 } 3568 3569 if (Overflow != 0) { 3570 if (!BaseReg) { 3571 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3572 } else { 3573 auto OverflowVal = B.buildConstant(S32, Overflow); 3574 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3575 } 3576 } 3577 3578 if (!BaseReg) 3579 BaseReg = B.buildConstant(S32, 0).getReg(0); 3580 3581 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3582 } 3583 3584 /// Handle register layout difference for f16 images for some subtargets. 3585 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3586 MachineRegisterInfo &MRI, 3587 Register Reg, 3588 bool ImageStore) const { 3589 const LLT S16 = LLT::scalar(16); 3590 const LLT S32 = LLT::scalar(32); 3591 LLT StoreVT = MRI.getType(Reg); 3592 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3593 3594 if (ST.hasUnpackedD16VMem()) { 3595 auto Unmerge = B.buildUnmerge(S16, Reg); 3596 3597 SmallVector<Register, 4> WideRegs; 3598 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3599 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3600 3601 int NumElts = StoreVT.getNumElements(); 3602 3603 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3604 } 3605 3606 if (ImageStore && ST.hasImageStoreD16Bug()) { 3607 if (StoreVT.getNumElements() == 2) { 3608 SmallVector<Register, 4> PackedRegs; 3609 Reg = B.buildBitcast(S32, Reg).getReg(0); 3610 PackedRegs.push_back(Reg); 3611 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3612 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3613 } 3614 3615 if (StoreVT.getNumElements() == 3) { 3616 SmallVector<Register, 4> PackedRegs; 3617 auto Unmerge = B.buildUnmerge(S16, Reg); 3618 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3619 PackedRegs.push_back(Unmerge.getReg(I)); 3620 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3621 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3622 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3623 } 3624 3625 if (StoreVT.getNumElements() == 4) { 3626 SmallVector<Register, 4> PackedRegs; 3627 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3628 auto Unmerge = B.buildUnmerge(S32, Reg); 3629 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3630 PackedRegs.push_back(Unmerge.getReg(I)); 3631 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3632 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3633 } 3634 3635 llvm_unreachable("invalid data type"); 3636 } 3637 3638 return Reg; 3639 } 3640 3641 Register AMDGPULegalizerInfo::fixStoreSourceType( 3642 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3643 MachineRegisterInfo *MRI = B.getMRI(); 3644 LLT Ty = MRI->getType(VData); 3645 3646 const LLT S16 = LLT::scalar(16); 3647 3648 // Fixup illegal register types for i8 stores. 
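// Note: both s8 and s16 store sources are any-extended to s32 here; the
// byte/short buffer store variants selected later only consume the low bits
// of the 32-bit source register, so the extended high bits are don't-care.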
3649 if (Ty == LLT::scalar(8) || Ty == S16) { 3650 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3651 return AnyExt; 3652 } 3653 3654 if (Ty.isVector()) { 3655 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3656 if (IsFormat) 3657 return handleD16VData(B, *MRI, VData); 3658 } 3659 } 3660 3661 return VData; 3662 } 3663 3664 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3665 MachineRegisterInfo &MRI, 3666 MachineIRBuilder &B, 3667 bool IsTyped, 3668 bool IsFormat) const { 3669 Register VData = MI.getOperand(1).getReg(); 3670 LLT Ty = MRI.getType(VData); 3671 LLT EltTy = Ty.getScalarType(); 3672 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3673 const LLT S32 = LLT::scalar(32); 3674 3675 VData = fixStoreSourceType(B, VData, IsFormat); 3676 Register RSrc = MI.getOperand(2).getReg(); 3677 3678 MachineMemOperand *MMO = *MI.memoperands_begin(); 3679 const int MemSize = MMO->getSize(); 3680 3681 unsigned ImmOffset; 3682 unsigned TotalOffset; 3683 3684 // The typed intrinsics add an immediate after the registers. 3685 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3686 3687 // The struct intrinsic variants add one additional operand over raw. 3688 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3689 Register VIndex; 3690 int OpOffset = 0; 3691 if (HasVIndex) { 3692 VIndex = MI.getOperand(3).getReg(); 3693 OpOffset = 1; 3694 } 3695 3696 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3697 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3698 3699 unsigned Format = 0; 3700 if (IsTyped) { 3701 Format = MI.getOperand(5 + OpOffset).getImm(); 3702 ++OpOffset; 3703 } 3704 3705 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3706 3707 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3708 if (TotalOffset != 0) 3709 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3710 3711 unsigned Opc; 3712 if (IsTyped) { 3713 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3714 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3715 } else if (IsFormat) { 3716 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3717 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3718 } else { 3719 switch (MemSize) { 3720 case 1: 3721 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3722 break; 3723 case 2: 3724 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3725 break; 3726 default: 3727 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3728 break; 3729 } 3730 } 3731 3732 if (!VIndex) 3733 VIndex = B.buildConstant(S32, 0).getReg(0); 3734 3735 auto MIB = B.buildInstr(Opc) 3736 .addUse(VData) // vdata 3737 .addUse(RSrc) // rsrc 3738 .addUse(VIndex) // vindex 3739 .addUse(VOffset) // voffset 3740 .addUse(SOffset) // soffset 3741 .addImm(ImmOffset); // offset(imm) 3742 3743 if (IsTyped) 3744 MIB.addImm(Format); 3745 3746 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3747 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3748 .addMemOperand(MMO); 3749 3750 MI.eraseFromParent(); 3751 return true; 3752 } 3753 3754 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3755 MachineRegisterInfo &MRI, 3756 MachineIRBuilder &B, 3757 bool IsFormat, 3758 bool IsTyped) const { 3759 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
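// The operand layout decoded below is, roughly (raw form shown; the struct
// form adds a vindex operand after rsrc, and the typed form adds a format
// immediate before the aux immediate):
//   dst, intrinsic_id, rsrc, [vindex,] voffset, soffset, [format,] aux
// which is why the operand indices below are adjusted by OpOffset.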
3760 MachineMemOperand *MMO = *MI.memoperands_begin();
3761 const int MemSize = MMO->getSize();
3762 const LLT S32 = LLT::scalar(32);
3763
3764 Register Dst = MI.getOperand(0).getReg();
3765 Register RSrc = MI.getOperand(2).getReg();
3766
3767 // The typed intrinsics add an immediate after the registers.
3768 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3769
3770 // The struct intrinsic variants add one additional operand over raw.
3771 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3772 Register VIndex;
3773 int OpOffset = 0;
3774 if (HasVIndex) {
3775 VIndex = MI.getOperand(3).getReg();
3776 OpOffset = 1;
3777 }
3778
3779 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3780 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3781
3782 unsigned Format = 0;
3783 if (IsTyped) {
3784 Format = MI.getOperand(5 + OpOffset).getImm();
3785 ++OpOffset;
3786 }
3787
3788 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3789 unsigned ImmOffset;
3790 unsigned TotalOffset;
3791
3792 LLT Ty = MRI.getType(Dst);
3793 LLT EltTy = Ty.getScalarType();
3794 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3795 const bool Unpacked = ST.hasUnpackedD16VMem();
3796
3797 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3798 if (TotalOffset != 0)
3799 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3800
3801 unsigned Opc;
3802
3803 if (IsTyped) {
3804 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3805 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3806 } else if (IsFormat) {
3807 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3808 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3809 } else {
3810 switch (MemSize) {
3811 case 1:
3812 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3813 break;
3814 case 2:
3815 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3816 break;
3817 default:
3818 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3819 break;
3820 }
3821 }
3822
3823 Register LoadDstReg;
3824
3825 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3826 LLT UnpackedTy = Ty.changeElementSize(32);
3827
3828 if (IsExtLoad)
3829 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3830 else if (Unpacked && IsD16 && Ty.isVector())
3831 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3832 else
3833 LoadDstReg = Dst;
3834
3835 if (!VIndex)
3836 VIndex = B.buildConstant(S32, 0).getReg(0);
3837
3838 auto MIB = B.buildInstr(Opc)
3839 .addDef(LoadDstReg) // vdata
3840 .addUse(RSrc) // rsrc
3841 .addUse(VIndex) // vindex
3842 .addUse(VOffset) // voffset
3843 .addUse(SOffset) // soffset
3844 .addImm(ImmOffset); // offset(imm)
3845
3846 if (IsTyped)
3847 MIB.addImm(Format);
3848
3849 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3850 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3851 .addMemOperand(MMO);
3852
3853 if (LoadDstReg != Dst) {
3854 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3855
3856 // The result register was widened for the extending load; truncate it back.
3857 if (IsExtLoad) 3858 B.buildTrunc(Dst, LoadDstReg); 3859 else { 3860 // Repack to original 16-bit vector result 3861 // FIXME: G_TRUNC should work, but legalization currently fails 3862 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3863 SmallVector<Register, 4> Repack; 3864 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3865 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3866 B.buildMerge(Dst, Repack); 3867 } 3868 } 3869 3870 MI.eraseFromParent(); 3871 return true; 3872 } 3873 3874 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3875 MachineIRBuilder &B, 3876 bool IsInc) const { 3877 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3878 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3879 B.buildInstr(Opc) 3880 .addDef(MI.getOperand(0).getReg()) 3881 .addUse(MI.getOperand(2).getReg()) 3882 .addUse(MI.getOperand(3).getReg()) 3883 .cloneMemRefs(MI); 3884 MI.eraseFromParent(); 3885 return true; 3886 } 3887 3888 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3889 switch (IntrID) { 3890 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3891 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3893 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3894 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3896 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3897 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3899 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3900 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3902 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3903 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3905 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3906 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3908 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3909 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3911 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3912 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3913 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3914 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3915 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3916 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3917 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3918 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3920 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3921 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3923 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3924 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3926 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3927 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3929 case Intrinsic::amdgcn_buffer_atomic_fadd: 3930 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3931 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3933 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3934 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3936 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3937 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3938 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3939 default: 3940 llvm_unreachable("unhandled atomic opcode"); 3941 } 3942 } 3943 3944 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3945 MachineIRBuilder &B, 3946 Intrinsic::ID IID) const { 3947 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3948 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3949 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3950 3951 Register Dst; 3952 3953 int OpOffset = 0; 3954 if (HasReturn) { 3955 // A few FP atomics do not support return values. 3956 Dst = MI.getOperand(0).getReg(); 3957 } else { 3958 OpOffset = -1; 3959 } 3960 3961 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3962 Register CmpVal; 3963 3964 if (IsCmpSwap) { 3965 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3966 ++OpOffset; 3967 } 3968 3969 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3970 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3971 3972 // The struct intrinsic variants add one additional operand over raw. 3973 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3974 Register VIndex; 3975 if (HasVIndex) { 3976 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3977 ++OpOffset; 3978 } 3979 3980 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3981 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3982 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3983 3984 MachineMemOperand *MMO = *MI.memoperands_begin(); 3985 3986 unsigned ImmOffset; 3987 unsigned TotalOffset; 3988 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3989 if (TotalOffset != 0) 3990 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3991 3992 if (!VIndex) 3993 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3994 3995 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3996 3997 if (HasReturn) 3998 MIB.addDef(Dst); 3999 4000 MIB.addUse(VData); // vdata 4001 4002 if (IsCmpSwap) 4003 MIB.addReg(CmpVal); 4004 4005 MIB.addUse(RSrc) // rsrc 4006 .addUse(VIndex) // vindex 4007 .addUse(VOffset) // voffset 4008 .addUse(SOffset) // soffset 4009 .addImm(ImmOffset) // offset(imm) 4010 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4011 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4012 .addMemOperand(MMO); 4013 4014 MI.eraseFromParent(); 4015 return true; 4016 } 4017 4018 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4019 /// vector with s16 typed elements. 4020 static void packImageA16AddressToDwords( 4021 MachineIRBuilder &B, MachineInstr &MI, 4022 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4023 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4024 const LLT S16 = LLT::scalar(16); 4025 const LLT V2S16 = LLT::vector(2, 16); 4026 4027 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4028 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4029 if (!SrcOp.isReg()) 4030 continue; // _L to _LZ may have eliminated this. 4031 4032 Register AddrReg = SrcOp.getReg(); 4033 4034 if (I < Intr->GradientStart) { 4035 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4036 PackedAddrs.push_back(AddrReg); 4037 } else { 4038 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4039 // derivatives dx/dh and dx/dv are packed with undef. 
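// For example (illustrative only), a 3-coordinate a16 address (s, t, r) with
// no gradients becomes two dwords: <2 x s16> {s, t} and <2 x s16> {r, undef}.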
4040 if (((I + 1) >= EndIdx) ||
4041 ((Intr->NumGradients / 2) % 2 == 1 &&
4042 (I == static_cast<unsigned>(Intr->GradientStart +
4043 (Intr->NumGradients / 2) - 1) ||
4044 I == static_cast<unsigned>(Intr->GradientStart +
4045 Intr->NumGradients - 1))) ||
4046 // Check for _L to _LZ optimization
4047 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4048 PackedAddrs.push_back(
4049 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4050 .getReg(0));
4051 } else {
4052 PackedAddrs.push_back(
4053 B.buildBuildVector(
4054 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4055 .getReg(0));
4056 ++I;
4057 }
4058 }
4059 }
4060 }
4061
4062 /// Convert from separate vaddr components to a single vector address register,
4063 /// and replace the remaining operands with $noreg.
4064 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4065 int DimIdx, int NumVAddrs) {
4066 const LLT S32 = LLT::scalar(32);
4067
4068 SmallVector<Register, 8> AddrRegs;
4069 for (int I = 0; I != NumVAddrs; ++I) {
4070 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4071 if (SrcOp.isReg()) {
4072 AddrRegs.push_back(SrcOp.getReg());
4073 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4074 }
4075 }
4076
4077 int NumAddrRegs = AddrRegs.size();
4078 if (NumAddrRegs != 1) {
4079 // Round up to 8 elements for v5-v7
4080 // FIXME: Missing intermediate sized register classes and instructions.
4081 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4082 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4083 auto Undef = B.buildUndef(S32);
4084 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4085 NumAddrRegs = RoundedNumRegs;
4086 }
4087
4088 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4089 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4090 }
4091
4092 for (int I = 1; I != NumVAddrs; ++I) {
4093 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4094 if (SrcOp.isReg())
4095 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4096 }
4097 }
4098
4099 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4100 ///
4101 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4102 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4103 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4104 /// registers.
4105 ///
4106 /// We don't want to directly select image instructions just yet, but also want
4107 /// to expose all register repacking to the legalizer/combiners. We also don't
4108 /// want a selected instruction entering RegBankSelect. In order to avoid
4109 /// defining a multitude of intermediate image instructions, directly hack on
4110 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4111 /// now unnecessary arguments with $noreg.
4112 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4113 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4114 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4115
4116 const unsigned NumDefs = MI.getNumExplicitDefs();
4117 const unsigned ArgOffset = NumDefs + 1;
4118 bool IsTFE = NumDefs == 2;
4119 // We are only processing the operands of d16 image operations on subtargets
4120 // that use the unpacked register layout, or need to repack the TFE result.
4121
4122 // TODO: Do we need to guard against already legalized intrinsics?
4123 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4124 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4125 4126 MachineRegisterInfo *MRI = B.getMRI(); 4127 const LLT S32 = LLT::scalar(32); 4128 const LLT S16 = LLT::scalar(16); 4129 const LLT V2S16 = LLT::vector(2, 16); 4130 4131 unsigned DMask = 0; 4132 4133 // Check for 16 bit addresses and pack if true. 4134 LLT GradTy = 4135 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4136 LLT AddrTy = 4137 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4138 const bool IsG16 = GradTy == S16; 4139 const bool IsA16 = AddrTy == S16; 4140 4141 int DMaskLanes = 0; 4142 if (!BaseOpcode->Atomic) { 4143 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4144 if (BaseOpcode->Gather4) { 4145 DMaskLanes = 4; 4146 } else if (DMask != 0) { 4147 DMaskLanes = countPopulation(DMask); 4148 } else if (!IsTFE && !BaseOpcode->Store) { 4149 // If dmask is 0, this is a no-op load. This can be eliminated. 4150 B.buildUndef(MI.getOperand(0)); 4151 MI.eraseFromParent(); 4152 return true; 4153 } 4154 } 4155 4156 Observer.changingInstr(MI); 4157 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4158 4159 unsigned NewOpcode = NumDefs == 0 ? 4160 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4161 4162 // Track that we legalized this 4163 MI.setDesc(B.getTII().get(NewOpcode)); 4164 4165 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4166 // dmask to be at least 1 otherwise the instruction will fail 4167 if (IsTFE && DMask == 0) { 4168 DMask = 0x1; 4169 DMaskLanes = 1; 4170 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4171 } 4172 4173 if (BaseOpcode->Atomic) { 4174 Register VData0 = MI.getOperand(2).getReg(); 4175 LLT Ty = MRI->getType(VData0); 4176 4177 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4178 if (Ty.isVector()) 4179 return false; 4180 4181 if (BaseOpcode->AtomicX2) { 4182 Register VData1 = MI.getOperand(3).getReg(); 4183 // The two values are packed in one register. 4184 LLT PackedTy = LLT::vector(2, Ty); 4185 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4186 MI.getOperand(2).setReg(Concat.getReg(0)); 4187 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4188 } 4189 } 4190 4191 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4192 4193 // Optimize _L to _LZ when _L is zero 4194 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4195 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4196 const ConstantFP *ConstantLod; 4197 4198 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4199 m_GFCst(ConstantLod))) { 4200 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4201 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4202 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4203 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4204 Intr->Dim); 4205 4206 // The starting indexes should remain in the same place. 
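// (For instance, an image.sample.l with a constant lod of 0.0, or a negative
// constant lod, is rewritten here to the corresponding image.sample.lz
// variant, and the now-redundant lod operand is dropped.)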
4207 --CorrectedNumVAddrs; 4208 4209 MI.getOperand(MI.getNumExplicitDefs()) 4210 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4211 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4212 Intr = NewImageDimIntr; 4213 } 4214 } 4215 } 4216 4217 // Optimize _mip away, when 'lod' is zero 4218 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4219 int64_t ConstantLod; 4220 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4221 m_ICst(ConstantLod))) { 4222 if (ConstantLod == 0) { 4223 // TODO: Change intrinsic opcode and remove operand instead or replacing 4224 // it with 0, as the _L to _LZ handling is done above. 4225 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4226 --CorrectedNumVAddrs; 4227 } 4228 } 4229 } 4230 4231 // Rewrite the addressing register layout before doing anything else. 4232 if (IsA16 || IsG16) { 4233 if (IsA16) { 4234 // Target must support the feature and gradients need to be 16 bit too 4235 if (!ST.hasA16() || !IsG16) 4236 return false; 4237 } else if (!ST.hasG16()) 4238 return false; 4239 4240 if (Intr->NumVAddrs > 1) { 4241 SmallVector<Register, 4> PackedRegs; 4242 // Don't compress addresses for G16 4243 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4244 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4245 PackEndIdx); 4246 4247 if (!IsA16) { 4248 // Add uncompressed address 4249 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4250 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4251 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4252 PackedRegs.push_back(AddrReg); 4253 } 4254 } 4255 4256 // See also below in the non-a16 branch 4257 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4258 4259 if (!UseNSA && PackedRegs.size() > 1) { 4260 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4261 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4262 PackedRegs[0] = Concat.getReg(0); 4263 PackedRegs.resize(1); 4264 } 4265 4266 const unsigned NumPacked = PackedRegs.size(); 4267 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4268 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4269 if (!SrcOp.isReg()) { 4270 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4271 continue; 4272 } 4273 4274 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4275 4276 if (I - Intr->VAddrStart < NumPacked) 4277 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4278 else 4279 SrcOp.setReg(AMDGPU::NoRegister); 4280 } 4281 } 4282 } else { 4283 // If the register allocator cannot place the address registers contiguously 4284 // without introducing moves, then using the non-sequential address encoding 4285 // is always preferable, since it saves VALU instructions and is usually a 4286 // wash in terms of code size or even better. 4287 // 4288 // However, we currently have no way of hinting to the register allocator 4289 // that MIMG addresses should be placed contiguously when it is possible to 4290 // do so, so force non-NSA for the common 2-address case as a heuristic. 4291 // 4292 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4293 // allocation when possible. 
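// As applied below, this means (for example) a 2-address sample keeps a
// single packed vector vaddr, while 3 or more addresses use the NSA form
// when the subtarget supports it.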
4294 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4295
4296 if (!UseNSA && Intr->NumVAddrs > 1)
4297 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4298 Intr->NumVAddrs);
4299 }
4300
4301 int Flags = 0;
4302 if (IsA16)
4303 Flags |= 1;
4304 if (IsG16)
4305 Flags |= 2;
4306 MI.addOperand(MachineOperand::CreateImm(Flags));
4307
4308 if (BaseOpcode->Store) { // No TFE for stores?
4309 // TODO: Handle dmask trim
4310 Register VData = MI.getOperand(1).getReg();
4311 LLT Ty = MRI->getType(VData);
4312 if (!Ty.isVector() || Ty.getElementType() != S16)
4313 return true;
4314
4315 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4316 if (RepackedReg != VData) {
4317 MI.getOperand(1).setReg(RepackedReg);
4318 }
4319
4320 return true;
4321 }
4322
4323 Register DstReg = MI.getOperand(0).getReg();
4324 LLT Ty = MRI->getType(DstReg);
4325 const LLT EltTy = Ty.getScalarType();
4326 const bool IsD16 = Ty.getScalarType() == S16;
4327 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4328
4329 // Confirm that the return type is large enough for the dmask specified
4330 if (NumElts < DMaskLanes)
4331 return false;
4332
4333 if (NumElts > 4 || DMaskLanes > 4)
4334 return false;
4335
4336 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4337 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4338
4339 // The raw dword aligned data component of the load. The only legal cases
4340 // where this matters should be when using the packed D16 format, for
4341 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4342 LLT RoundedTy;
4343
4344 // S32 vector to cover all data, plus TFE result element.
4345 LLT TFETy;
4346
4347 // Register type to use for each loaded component. Will be S32 or V2S16.
4348 LLT RegTy;
4349
4350 if (IsD16 && ST.hasUnpackedD16VMem()) {
4351 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4352 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4353 RegTy = S32;
4354 } else {
4355 unsigned EltSize = EltTy.getSizeInBits();
4356 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4357 unsigned RoundedSize = 32 * RoundedElts;
4358 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4359 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4360 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4361 }
4362
4363 // The return type does not need adjustment.
4364 // TODO: Should we change s16 case to s32 or <2 x s16>?
4365 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4366 return true;
4367
4368 Register Dst1Reg;
4369
4370 // Insert after the instruction.
4371 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4372
4373 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4374 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4375 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4376 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4377
4378 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4379
4380 MI.getOperand(0).setReg(NewResultReg);
4381
4382 // In the IR, TFE is supposed to be used with a 2 element struct return
4383 // type. The instruction really returns these two values in one contiguous
4384 // register, with one additional dword beyond the loaded data. Rewrite the
4385 // return type to use a single register result.
4386
4387 if (IsTFE) {
4388 Dst1Reg = MI.getOperand(1).getReg();
4389 if (MRI->getType(Dst1Reg) != S32)
4390 return false;
4391
4392 // TODO: Make sure the TFE operand bit is set.
4393 MI.RemoveOperand(1); 4394 4395 // Handle the easy case that requires no repack instructions. 4396 if (Ty == S32) { 4397 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4398 return true; 4399 } 4400 } 4401 4402 // Now figure out how to copy the new result register back into the old 4403 // result. 4404 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4405 4406 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4407 4408 if (ResultNumRegs == 1) { 4409 assert(!IsTFE); 4410 ResultRegs[0] = NewResultReg; 4411 } else { 4412 // We have to repack into a new vector of some kind. 4413 for (int I = 0; I != NumDataRegs; ++I) 4414 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4415 B.buildUnmerge(ResultRegs, NewResultReg); 4416 4417 // Drop the final TFE element to get the data part. The TFE result is 4418 // directly written to the right place already. 4419 if (IsTFE) 4420 ResultRegs.resize(NumDataRegs); 4421 } 4422 4423 // For an s16 scalar result, we form an s32 result with a truncate regardless 4424 // of packed vs. unpacked. 4425 if (IsD16 && !Ty.isVector()) { 4426 B.buildTrunc(DstReg, ResultRegs[0]); 4427 return true; 4428 } 4429 4430 // Avoid a build/concat_vector of 1 entry. 4431 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4432 B.buildBitcast(DstReg, ResultRegs[0]); 4433 return true; 4434 } 4435 4436 assert(Ty.isVector()); 4437 4438 if (IsD16) { 4439 // For packed D16 results with TFE enabled, all the data components are 4440 // S32. Cast back to the expected type. 4441 // 4442 // TODO: We don't really need to use load s32 elements. We would only need one 4443 // cast for the TFE result if a multiple of v2s16 was used. 4444 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4445 for (Register &Reg : ResultRegs) 4446 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4447 } else if (ST.hasUnpackedD16VMem()) { 4448 for (Register &Reg : ResultRegs) 4449 Reg = B.buildTrunc(S16, Reg).getReg(0); 4450 } 4451 } 4452 4453 auto padWithUndef = [&](LLT Ty, int NumElts) { 4454 if (NumElts == 0) 4455 return; 4456 Register Undef = B.buildUndef(Ty).getReg(0); 4457 for (int I = 0; I != NumElts; ++I) 4458 ResultRegs.push_back(Undef); 4459 }; 4460 4461 // Pad out any elements eliminated due to the dmask. 4462 LLT ResTy = MRI->getType(ResultRegs[0]); 4463 if (!ResTy.isVector()) { 4464 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4465 B.buildBuildVector(DstReg, ResultRegs); 4466 return true; 4467 } 4468 4469 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4470 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4471 4472 // Deal with the one annoying legal case. 
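// A <3 x s16> destination does not map cleanly onto whole <2 x s16>
// registers, so the repacked pieces are padded with an extra undef piece,
// concatenated into a <6 x s16> value, and then unmerged into the real
// <3 x s16> destination plus a dead <3 x s16> remainder.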
4473 const LLT V3S16 = LLT::vector(3, 16); 4474 if (Ty == V3S16) { 4475 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4476 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4477 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4478 return true; 4479 } 4480 4481 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4482 B.buildConcatVectors(DstReg, ResultRegs); 4483 return true; 4484 } 4485 4486 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4487 LegalizerHelper &Helper, MachineInstr &MI) const { 4488 MachineIRBuilder &B = Helper.MIRBuilder; 4489 GISelChangeObserver &Observer = Helper.Observer; 4490 4491 Register Dst = MI.getOperand(0).getReg(); 4492 LLT Ty = B.getMRI()->getType(Dst); 4493 unsigned Size = Ty.getSizeInBits(); 4494 MachineFunction &MF = B.getMF(); 4495 4496 Observer.changingInstr(MI); 4497 4498 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4499 Ty = getBitcastRegisterType(Ty); 4500 Helper.bitcastDst(MI, Ty, 0); 4501 Dst = MI.getOperand(0).getReg(); 4502 B.setInsertPt(B.getMBB(), MI); 4503 } 4504 4505 // FIXME: We don't really need this intermediate instruction. The intrinsic 4506 // should be fixed to have a memory operand. Since it's readnone, we're not 4507 // allowed to add one. 4508 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4509 MI.RemoveOperand(1); // Remove intrinsic ID 4510 4511 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4512 // TODO: Should this use datalayout alignment? 4513 const unsigned MemSize = (Size + 7) / 8; 4514 const Align MemAlign(4); 4515 MachineMemOperand *MMO = MF.getMachineMemOperand( 4516 MachinePointerInfo(), 4517 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4518 MachineMemOperand::MOInvariant, 4519 MemSize, MemAlign); 4520 MI.addMemOperand(MF, MMO); 4521 4522 // There are no 96-bit result scalar loads, but widening to 128-bit should 4523 // always be legal. We may need to restore this to a 96-bit result if it turns 4524 // out this needs to be converted to a vector load during RegBankSelect. 4525 if (!isPowerOf2_32(Size)) { 4526 if (Ty.isVector()) 4527 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4528 else 4529 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4530 } 4531 4532 Observer.changedInstr(MI); 4533 return true; 4534 } 4535 4536 // TODO: Move to selection 4537 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4538 MachineRegisterInfo &MRI, 4539 MachineIRBuilder &B) const { 4540 // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction 4541 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4542 !ST.isTrapHandlerEnabled()) { 4543 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4544 } else { 4545 // Pass queue pointer to trap handler as input, and insert trap instruction 4546 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4547 MachineRegisterInfo &MRI = *B.getMRI(); 4548 4549 Register LiveIn = 4550 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4551 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4552 return false; 4553 4554 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4555 B.buildCopy(SGPR01, LiveIn); 4556 B.buildInstr(AMDGPU::S_TRAP) 4557 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4558 .addReg(SGPR01, RegState::Implicit); 4559 } 4560 4561 MI.eraseFromParent(); 4562 return true; 4563 } 4564 4565 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4566 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4567 // Is non-HSA path or trap-handler disabled? then, report a warning 4568 // accordingly 4569 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4570 !ST.isTrapHandlerEnabled()) { 4571 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4572 "debugtrap handler not supported", 4573 MI.getDebugLoc(), DS_Warning); 4574 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4575 Ctx.diagnose(NoTrap); 4576 } else { 4577 // Insert debug-trap instruction 4578 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4579 } 4580 4581 MI.eraseFromParent(); 4582 return true; 4583 } 4584 4585 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4586 MachineIRBuilder &B) const { 4587 MachineRegisterInfo &MRI = *B.getMRI(); 4588 const LLT S16 = LLT::scalar(16); 4589 const LLT S32 = LLT::scalar(32); 4590 4591 Register DstReg = MI.getOperand(0).getReg(); 4592 Register NodePtr = MI.getOperand(2).getReg(); 4593 Register RayExtent = MI.getOperand(3).getReg(); 4594 Register RayOrigin = MI.getOperand(4).getReg(); 4595 Register RayDir = MI.getOperand(5).getReg(); 4596 Register RayInvDir = MI.getOperand(6).getReg(); 4597 Register TDescr = MI.getOperand(7).getReg(); 4598 4599 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4600 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4601 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4602 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4603 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4604 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4605 4606 SmallVector<Register, 12> Ops; 4607 if (Is64) { 4608 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4609 Ops.push_back(Unmerge.getReg(0)); 4610 Ops.push_back(Unmerge.getReg(1)); 4611 } else { 4612 Ops.push_back(NodePtr); 4613 } 4614 Ops.push_back(RayExtent); 4615 4616 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4617 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4618 Ops.push_back(Unmerge.getReg(0)); 4619 Ops.push_back(Unmerge.getReg(1)); 4620 Ops.push_back(Unmerge.getReg(2)); 4621 }; 4622 4623 packLanes(RayOrigin); 4624 if (IsA16) { 4625 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4626 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4627 Register R1 = MRI.createGenericVirtualRegister(S32); 4628 Register R2 = MRI.createGenericVirtualRegister(S32); 4629 Register R3 = MRI.createGenericVirtualRegister(S32); 4630 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4631 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4632 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4633 Ops.push_back(R1); 4634 Ops.push_back(R2); 4635 Ops.push_back(R3); 4636 } else { 4637 packLanes(RayDir); 4638 packLanes(RayInvDir); 4639 } 4640 4641 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4642 .addDef(DstReg) 4643 .addImm(Opcode); 4644 4645 for (Register R : Ops) { 4646 MIB.addUse(R); 4647 } 4648 4649 MIB.addUse(TDescr) 4650 .addImm(IsA16 ? 1 : 0) 4651 .cloneMemRefs(MI); 4652 4653 MI.eraseFromParent(); 4654 return true; 4655 } 4656 4657 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4658 MachineInstr &MI) const { 4659 MachineIRBuilder &B = Helper.MIRBuilder; 4660 MachineRegisterInfo &MRI = *B.getMRI(); 4661 4662 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4663 auto IntrID = MI.getIntrinsicID(); 4664 switch (IntrID) { 4665 case Intrinsic::amdgcn_if: 4666 case Intrinsic::amdgcn_else: { 4667 MachineInstr *Br = nullptr; 4668 MachineBasicBlock *UncondBrTarget = nullptr; 4669 bool Negated = false; 4670 if (MachineInstr *BrCond = 4671 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4672 const SIRegisterInfo *TRI 4673 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4674 4675 Register Def = MI.getOperand(1).getReg(); 4676 Register Use = MI.getOperand(3).getReg(); 4677 4678 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4679 4680 if (Negated) 4681 std::swap(CondBrTarget, UncondBrTarget); 4682 4683 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4684 if (IntrID == Intrinsic::amdgcn_if) { 4685 B.buildInstr(AMDGPU::SI_IF) 4686 .addDef(Def) 4687 .addUse(Use) 4688 .addMBB(UncondBrTarget); 4689 } else { 4690 B.buildInstr(AMDGPU::SI_ELSE) 4691 .addDef(Def) 4692 .addUse(Use) 4693 .addMBB(UncondBrTarget); 4694 } 4695 4696 if (Br) { 4697 Br->getOperand(0).setMBB(CondBrTarget); 4698 } else { 4699 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4700 // since we're swapping branch targets it needs to be reinserted. 
4701 // FIXME: IRTranslator should probably not do this 4702 B.buildBr(*CondBrTarget); 4703 } 4704 4705 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4706 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4707 MI.eraseFromParent(); 4708 BrCond->eraseFromParent(); 4709 return true; 4710 } 4711 4712 return false; 4713 } 4714 case Intrinsic::amdgcn_loop: { 4715 MachineInstr *Br = nullptr; 4716 MachineBasicBlock *UncondBrTarget = nullptr; 4717 bool Negated = false; 4718 if (MachineInstr *BrCond = 4719 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4720 const SIRegisterInfo *TRI 4721 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4722 4723 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4724 Register Reg = MI.getOperand(2).getReg(); 4725 4726 if (Negated) 4727 std::swap(CondBrTarget, UncondBrTarget); 4728 4729 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4730 B.buildInstr(AMDGPU::SI_LOOP) 4731 .addUse(Reg) 4732 .addMBB(UncondBrTarget); 4733 4734 if (Br) 4735 Br->getOperand(0).setMBB(CondBrTarget); 4736 else 4737 B.buildBr(*CondBrTarget); 4738 4739 MI.eraseFromParent(); 4740 BrCond->eraseFromParent(); 4741 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4742 return true; 4743 } 4744 4745 return false; 4746 } 4747 case Intrinsic::amdgcn_kernarg_segment_ptr: 4748 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4749 // This only makes sense to call in a kernel, so just lower to null. 4750 B.buildConstant(MI.getOperand(0).getReg(), 0); 4751 MI.eraseFromParent(); 4752 return true; 4753 } 4754 4755 return legalizePreloadedArgIntrin( 4756 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4757 case Intrinsic::amdgcn_implicitarg_ptr: 4758 return legalizeImplicitArgPtr(MI, MRI, B); 4759 case Intrinsic::amdgcn_workitem_id_x: 4760 return legalizePreloadedArgIntrin(MI, MRI, B, 4761 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4762 case Intrinsic::amdgcn_workitem_id_y: 4763 return legalizePreloadedArgIntrin(MI, MRI, B, 4764 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4765 case Intrinsic::amdgcn_workitem_id_z: 4766 return legalizePreloadedArgIntrin(MI, MRI, B, 4767 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4768 case Intrinsic::amdgcn_workgroup_id_x: 4769 return legalizePreloadedArgIntrin(MI, MRI, B, 4770 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4771 case Intrinsic::amdgcn_workgroup_id_y: 4772 return legalizePreloadedArgIntrin(MI, MRI, B, 4773 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4774 case Intrinsic::amdgcn_workgroup_id_z: 4775 return legalizePreloadedArgIntrin(MI, MRI, B, 4776 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4777 case Intrinsic::amdgcn_dispatch_ptr: 4778 return legalizePreloadedArgIntrin(MI, MRI, B, 4779 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4780 case Intrinsic::amdgcn_queue_ptr: 4781 return legalizePreloadedArgIntrin(MI, MRI, B, 4782 AMDGPUFunctionArgInfo::QUEUE_PTR); 4783 case Intrinsic::amdgcn_implicit_buffer_ptr: 4784 return legalizePreloadedArgIntrin( 4785 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4786 case Intrinsic::amdgcn_dispatch_id: 4787 return legalizePreloadedArgIntrin(MI, MRI, B, 4788 AMDGPUFunctionArgInfo::DISPATCH_ID); 4789 case Intrinsic::amdgcn_fdiv_fast: 4790 return legalizeFDIVFastIntrin(MI, MRI, B); 4791 case Intrinsic::amdgcn_is_shared: 4792 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4793 case Intrinsic::amdgcn_is_private: 4794 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4795 case Intrinsic::amdgcn_wavefrontsize: { 4796 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4797 MI.eraseFromParent(); 4798 return true; 4799 } 4800 case Intrinsic::amdgcn_s_buffer_load: 4801 return legalizeSBufferLoad(Helper, MI); 4802 case Intrinsic::amdgcn_raw_buffer_store: 4803 case Intrinsic::amdgcn_struct_buffer_store: 4804 return legalizeBufferStore(MI, MRI, B, false, false); 4805 case Intrinsic::amdgcn_raw_buffer_store_format: 4806 case Intrinsic::amdgcn_struct_buffer_store_format: 4807 return legalizeBufferStore(MI, MRI, B, false, true); 4808 case Intrinsic::amdgcn_raw_tbuffer_store: 4809 case Intrinsic::amdgcn_struct_tbuffer_store: 4810 return legalizeBufferStore(MI, MRI, B, true, true); 4811 case Intrinsic::amdgcn_raw_buffer_load: 4812 case Intrinsic::amdgcn_struct_buffer_load: 4813 return legalizeBufferLoad(MI, MRI, B, false, false); 4814 case Intrinsic::amdgcn_raw_buffer_load_format: 4815 case Intrinsic::amdgcn_struct_buffer_load_format: 4816 return legalizeBufferLoad(MI, MRI, B, true, false); 4817 case Intrinsic::amdgcn_raw_tbuffer_load: 4818 case Intrinsic::amdgcn_struct_tbuffer_load: 4819 return legalizeBufferLoad(MI, MRI, B, true, true); 4820 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4821 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4822 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4823 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4824 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4825 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4826 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4827 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4828 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4829 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4830 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4831 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4832 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4833 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4834 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4835 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4836 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4837 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4838 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4839 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4840 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4841 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4842 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4843 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4844 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4845 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4846 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4847 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4848 case Intrinsic::amdgcn_buffer_atomic_fadd: 4849 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4850 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4851 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4852 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4853 return legalizeBufferAtomic(MI, B, IntrID); 4854 case Intrinsic::amdgcn_atomic_inc: 4855 return legalizeAtomicIncDec(MI, B, true); 4856 case Intrinsic::amdgcn_atomic_dec: 4857 return legalizeAtomicIncDec(MI, B, false); 4858 case Intrinsic::trap: 4859 return legalizeTrapIntrinsic(MI, MRI, B); 4860 case Intrinsic::debugtrap: 4861 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4862 case Intrinsic::amdgcn_rsq_clamp: 4863 return legalizeRsqClampIntrinsic(MI, MRI, B); 4864 case Intrinsic::amdgcn_ds_fadd: 4865 case Intrinsic::amdgcn_ds_fmin: 4866 case Intrinsic::amdgcn_ds_fmax: 4867 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4868 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4869 return legalizeBVHIntrinsic(MI, B); 4870 default: { 4871 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4872 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4873 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4874 return true; 4875 } 4876 } 4877 4878 return true; 4879 } 4880