//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
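
// Worked examples for the mutations above (illustrative only, derived from the
// helpers themselves rather than from any particular target rule):
//   oneMoreElement:          <3 x s16> -> <4 x s16>
//   fewerEltsToSize64Vector: <4 x s32> -> <2 x s32> pieces
//   moreEltsToNext32Bit:     <3 x s8>  -> <4 x s8>
//   getBitcastRegisterType:  <4 x s8>  -> s32, <6 x s16> -> <3 x s32>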

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
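
  // Note (descriptive only): the pointer LLTs below take their sizes from the
  // target via GetAddrSpacePtr; the AddrSpaces64/AddrSpaces32 groupings
  // further down reflect which address spaces use 64-bit vs. 32-bit pointers
  // on AMDGPU.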

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));
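
    // For example (illustrative, per shouldBitcastLoadStoreType and
    // bitcastToRegisterType above): a <6 x s16> access is rewritten here as a
    // <3 x s32> access, and an <8 x s16> access as <4 x s32>, which the
    // remaining rules and the selector already know how to handle.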

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
        GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
        LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1696 case TargetOpcode::G_BUILD_VECTOR: 1697 return legalizeBuildVector(MI, MRI, B); 1698 default: 1699 return false; 1700 } 1701 1702 llvm_unreachable("expected switch to return"); 1703 } 1704 1705 Register AMDGPULegalizerInfo::getSegmentAperture( 1706 unsigned AS, 1707 MachineRegisterInfo &MRI, 1708 MachineIRBuilder &B) const { 1709 MachineFunction &MF = B.getMF(); 1710 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1711 const LLT S32 = LLT::scalar(32); 1712 1713 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1714 1715 if (ST.hasApertureRegs()) { 1716 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1717 // getreg. 1718 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1719 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1720 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1721 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1722 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1723 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1724 unsigned Encoding = 1725 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1726 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1727 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1728 1729 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1730 1731 B.buildInstr(AMDGPU::S_GETREG_B32) 1732 .addDef(GetReg) 1733 .addImm(Encoding); 1734 MRI.setType(GetReg, S32); 1735 1736 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1737 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1738 } 1739 1740 Register QueuePtr = MRI.createGenericVirtualRegister( 1741 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1742 1743 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1744 return Register(); 1745 1746 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1747 // private_segment_aperture_base_hi. 1748 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1749 1750 // TODO: can we be smarter about machine pointer info? 1751 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1752 MachineMemOperand *MMO = MF.getMachineMemOperand( 1753 PtrInfo, 1754 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1755 MachineMemOperand::MOInvariant, 1756 4, commonAlignment(Align(64), StructOffset)); 1757 1758 Register LoadAddr; 1759 1760 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1761 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1762 } 1763 1764 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1765 MachineInstr &MI, MachineRegisterInfo &MRI, 1766 MachineIRBuilder &B) const { 1767 MachineFunction &MF = B.getMF(); 1768 1769 const LLT S32 = LLT::scalar(32); 1770 Register Dst = MI.getOperand(0).getReg(); 1771 Register Src = MI.getOperand(1).getReg(); 1772 1773 LLT DstTy = MRI.getType(Dst); 1774 LLT SrcTy = MRI.getType(Src); 1775 unsigned DestAS = DstTy.getAddressSpace(); 1776 unsigned SrcAS = SrcTy.getAddressSpace(); 1777 1778 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1779 // vector element. 1780 assert(!DstTy.isVector()); 1781 1782 const AMDGPUTargetMachine &TM 1783 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1784 1785 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1786 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1787 return true; 1788 } 1789 1790 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1791 // Truncate. 
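    // Illustrative note: this cast just keeps the low 32 bits of the 64-bit
    // source pointer, which is what the G_EXTRACT at bit offset 0 below
    // implements (e.g. a source pointer 0x00007fff00001000 becomes the
    // 32-bit pointer 0x00001000; values purely for illustration).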
1792 B.buildExtract(Dst, Src, 0); 1793 MI.eraseFromParent(); 1794 return true; 1795 } 1796 1797 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1798 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1799 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1800 1801 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1802 // another. Merge operands are required to be the same type, but creating an 1803 // extra ptrtoint would be kind of pointless. 1804 auto HighAddr = B.buildConstant( 1805 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1806 B.buildMerge(Dst, {Src, HighAddr}); 1807 MI.eraseFromParent(); 1808 return true; 1809 } 1810 1811 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1812 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1813 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1814 unsigned NullVal = TM.getNullPointerValue(DestAS); 1815 1816 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1817 auto FlatNull = B.buildConstant(SrcTy, 0); 1818 1819 // Extract low 32-bits of the pointer. 1820 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1821 1822 auto CmpRes = 1823 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1824 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1825 1826 MI.eraseFromParent(); 1827 return true; 1828 } 1829 1830 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1831 return false; 1832 1833 if (!ST.hasFlatAddressSpace()) 1834 return false; 1835 1836 auto SegmentNull = 1837 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1838 auto FlatNull = 1839 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1840 1841 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1842 if (!ApertureReg.isValid()) 1843 return false; 1844 1845 auto CmpRes = 1846 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1847 1848 // Coerce the type of the low half of the result so we can use merge_values. 1849 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1850 1851 // TODO: Should we allow mismatched types but matching sizes in merges to 1852 // avoid the ptrtoint? 1853 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1854 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1855 1856 MI.eraseFromParent(); 1857 return true; 1858 } 1859 1860 bool AMDGPULegalizerInfo::legalizeFrint( 1861 MachineInstr &MI, MachineRegisterInfo &MRI, 1862 MachineIRBuilder &B) const { 1863 Register Src = MI.getOperand(1).getReg(); 1864 LLT Ty = MRI.getType(Src); 1865 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1866 1867 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1868 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1869 1870 auto C1 = B.buildFConstant(Ty, C1Val); 1871 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1872 1873 // TODO: Should this propagate fast-math-flags? 
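  // Sketch of the trick (assuming the default round-to-nearest-even mode):
  // for |x| <= C2 the ulp of x + copysign(2^52, x) is 1.0, so the add rounds
  // x to the nearest integer and the subtract recovers it, e.g.
  //   x = 3.7:  3.7 + 2^52 -> 2^52 + 4,  (2^52 + 4) - 2^52 -> 4.0
  // Anything with |x| > C2 (= 2^52 - 0.5) already has no fractional bits,
  // so the final select returns such inputs unchanged.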
1874 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1875 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1876 1877 auto C2 = B.buildFConstant(Ty, C2Val); 1878 auto Fabs = B.buildFAbs(Ty, Src); 1879 1880 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1881 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1882 MI.eraseFromParent(); 1883 return true; 1884 } 1885 1886 bool AMDGPULegalizerInfo::legalizeFceil( 1887 MachineInstr &MI, MachineRegisterInfo &MRI, 1888 MachineIRBuilder &B) const { 1889 1890 const LLT S1 = LLT::scalar(1); 1891 const LLT S64 = LLT::scalar(64); 1892 1893 Register Src = MI.getOperand(1).getReg(); 1894 assert(MRI.getType(Src) == S64); 1895 1896 // result = trunc(src) 1897 // if (src > 0.0 && src != result) 1898 // result += 1.0 1899 1900 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1901 1902 const auto Zero = B.buildFConstant(S64, 0.0); 1903 const auto One = B.buildFConstant(S64, 1.0); 1904 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1905 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1906 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1907 auto Add = B.buildSelect(S64, And, One, Zero); 1908 1909 // TODO: Should this propagate fast-math-flags? 1910 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1911 return true; 1912 } 1913 1914 bool AMDGPULegalizerInfo::legalizeFrem( 1915 MachineInstr &MI, MachineRegisterInfo &MRI, 1916 MachineIRBuilder &B) const { 1917 Register DstReg = MI.getOperand(0).getReg(); 1918 Register Src0Reg = MI.getOperand(1).getReg(); 1919 Register Src1Reg = MI.getOperand(2).getReg(); 1920 auto Flags = MI.getFlags(); 1921 LLT Ty = MRI.getType(DstReg); 1922 1923 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1924 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1925 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1926 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1927 MI.eraseFromParent(); 1928 return true; 1929 } 1930 1931 static MachineInstrBuilder extractF64Exponent(Register Hi, 1932 MachineIRBuilder &B) { 1933 const unsigned FractBits = 52; 1934 const unsigned ExpBits = 11; 1935 LLT S32 = LLT::scalar(32); 1936 1937 auto Const0 = B.buildConstant(S32, FractBits - 32); 1938 auto Const1 = B.buildConstant(S32, ExpBits); 1939 1940 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1941 .addUse(Hi) 1942 .addUse(Const0.getReg(0)) 1943 .addUse(Const1.getReg(0)); 1944 1945 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1946 } 1947 1948 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1949 MachineInstr &MI, MachineRegisterInfo &MRI, 1950 MachineIRBuilder &B) const { 1951 const LLT S1 = LLT::scalar(1); 1952 const LLT S32 = LLT::scalar(32); 1953 const LLT S64 = LLT::scalar(64); 1954 1955 Register Src = MI.getOperand(1).getReg(); 1956 assert(MRI.getType(Src) == S64); 1957 1958 // TODO: Should this use extract since the low half is unused? 1959 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1960 Register Hi = Unmerge.getReg(1); 1961 1962 // Extract the upper half, since this is where we will find the sign and 1963 // exponent. 1964 auto Exp = extractF64Exponent(Hi, B); 1965 1966 const unsigned FractBits = 52; 1967 1968 // Extract the sign bit. 1969 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1970 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1971 1972 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1973 1974 const auto Zero32 = B.buildConstant(S32, 0); 1975 1976 // Extend back to 64-bits. 
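  // SignBit64 is the input's sign bit moved to bit 63 of an s64 whose low
  // half is zero, i.e. +0.0 or -0.0 with the sign of Src. The mask math that
  // follows then truncates toward zero; roughly:
  //   Exp < 0  -> |Src| < 1.0, the result is just that signed zero
  //   0..51    -> clear the low (52 - Exp) fraction bits:
  //               Src & ~(FractMask >> Exp)
  //   Exp > 51 -> Src is already an integer, return it unchanged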
1977 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1978 1979 auto Shr = B.buildAShr(S64, FractMask, Exp); 1980 auto Not = B.buildNot(S64, Shr); 1981 auto Tmp0 = B.buildAnd(S64, Src, Not); 1982 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1983 1984 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1985 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1986 1987 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1988 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1989 MI.eraseFromParent(); 1990 return true; 1991 } 1992 1993 bool AMDGPULegalizerInfo::legalizeITOFP( 1994 MachineInstr &MI, MachineRegisterInfo &MRI, 1995 MachineIRBuilder &B, bool Signed) const { 1996 1997 Register Dst = MI.getOperand(0).getReg(); 1998 Register Src = MI.getOperand(1).getReg(); 1999 2000 const LLT S64 = LLT::scalar(64); 2001 const LLT S32 = LLT::scalar(32); 2002 2003 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2004 2005 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2006 2007 auto CvtHi = Signed ? 2008 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2009 B.buildUITOFP(S64, Unmerge.getReg(1)); 2010 2011 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2012 2013 auto ThirtyTwo = B.buildConstant(S32, 32); 2014 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2015 .addUse(CvtHi.getReg(0)) 2016 .addUse(ThirtyTwo.getReg(0)); 2017 2018 // TODO: Should this propagate fast-math-flags? 2019 B.buildFAdd(Dst, LdExp, CvtLo); 2020 MI.eraseFromParent(); 2021 return true; 2022 } 2023 2024 // TODO: Copied from DAG implementation. Verify logic and document how this 2025 // actually works. 2026 bool AMDGPULegalizerInfo::legalizeFPTOI( 2027 MachineInstr &MI, MachineRegisterInfo &MRI, 2028 MachineIRBuilder &B, bool Signed) const { 2029 2030 Register Dst = MI.getOperand(0).getReg(); 2031 Register Src = MI.getOperand(1).getReg(); 2032 2033 const LLT S64 = LLT::scalar(64); 2034 const LLT S32 = LLT::scalar(32); 2035 2036 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2037 2038 unsigned Flags = MI.getFlags(); 2039 2040 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2041 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2042 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2043 2044 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2045 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2046 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2047 2048 auto Hi = Signed ? 
2049 B.buildFPTOSI(S32, FloorMul) : 2050 B.buildFPTOUI(S32, FloorMul); 2051 auto Lo = B.buildFPTOUI(S32, Fma); 2052 2053 B.buildMerge(Dst, { Lo, Hi }); 2054 MI.eraseFromParent(); 2055 2056 return true; 2057 } 2058 2059 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2060 MachineInstr &MI) const { 2061 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2062 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2063 2064 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2065 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2066 2067 // With ieee_mode disabled, the instructions have the correct behavior 2068 // already for G_FMINNUM/G_FMAXNUM 2069 if (!MFI->getMode().IEEE) 2070 return !IsIEEEOp; 2071 2072 if (IsIEEEOp) 2073 return true; 2074 2075 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2076 } 2077 2078 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2079 MachineInstr &MI, MachineRegisterInfo &MRI, 2080 MachineIRBuilder &B) const { 2081 // TODO: Should move some of this into LegalizerHelper. 2082 2083 // TODO: Promote dynamic indexing of s16 to s32 2084 2085 // FIXME: Artifact combiner probably should have replaced the truncated 2086 // constant before this, so we shouldn't need 2087 // getConstantVRegValWithLookThrough. 2088 Optional<ValueAndVReg> MaybeIdxVal = 2089 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2090 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2091 return true; 2092 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2093 2094 Register Dst = MI.getOperand(0).getReg(); 2095 Register Vec = MI.getOperand(1).getReg(); 2096 2097 LLT VecTy = MRI.getType(Vec); 2098 LLT EltTy = VecTy.getElementType(); 2099 assert(EltTy == MRI.getType(Dst)); 2100 2101 if (IdxVal < VecTy.getNumElements()) 2102 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2103 else 2104 B.buildUndef(Dst); 2105 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2111 MachineInstr &MI, MachineRegisterInfo &MRI, 2112 MachineIRBuilder &B) const { 2113 // TODO: Should move some of this into LegalizerHelper. 2114 2115 // TODO: Promote dynamic indexing of s16 to s32 2116 2117 // FIXME: Artifact combiner probably should have replaced the truncated 2118 // constant before this, so we shouldn't need 2119 // getConstantVRegValWithLookThrough. 2120 Optional<ValueAndVReg> MaybeIdxVal = 2121 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2122 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2123 return true; 2124 2125 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2126 Register Dst = MI.getOperand(0).getReg(); 2127 Register Vec = MI.getOperand(1).getReg(); 2128 Register Ins = MI.getOperand(2).getReg(); 2129 2130 LLT VecTy = MRI.getType(Vec); 2131 LLT EltTy = VecTy.getElementType(); 2132 assert(EltTy == MRI.getType(Ins)); 2133 2134 if (IdxVal < VecTy.getNumElements()) 2135 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2136 else 2137 B.buildUndef(Dst); 2138 2139 MI.eraseFromParent(); 2140 return true; 2141 } 2142 2143 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2144 MachineInstr &MI, MachineRegisterInfo &MRI, 2145 MachineIRBuilder &B) const { 2146 const LLT V2S16 = LLT::vector(2, 16); 2147 2148 Register Dst = MI.getOperand(0).getReg(); 2149 Register Src0 = MI.getOperand(1).getReg(); 2150 LLT DstTy = MRI.getType(Dst); 2151 LLT SrcTy = MRI.getType(Src0); 2152 2153 if (SrcTy == V2S16 && DstTy == V2S16 && 2154 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2155 return true; 2156 2157 MachineIRBuilder HelperBuilder(MI); 2158 GISelObserverWrapper DummyObserver; 2159 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2160 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2161 } 2162 2163 bool AMDGPULegalizerInfo::legalizeSinCos( 2164 MachineInstr &MI, MachineRegisterInfo &MRI, 2165 MachineIRBuilder &B) const { 2166 2167 Register DstReg = MI.getOperand(0).getReg(); 2168 Register SrcReg = MI.getOperand(1).getReg(); 2169 LLT Ty = MRI.getType(DstReg); 2170 unsigned Flags = MI.getFlags(); 2171 2172 Register TrigVal; 2173 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2174 if (ST.hasTrigReducedRange()) { 2175 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2176 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2177 .addUse(MulVal.getReg(0)) 2178 .setMIFlags(Flags).getReg(0); 2179 } else 2180 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2181 2182 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2183 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2184 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2185 .addUse(TrigVal) 2186 .setMIFlags(Flags); 2187 MI.eraseFromParent(); 2188 return true; 2189 } 2190 2191 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2192 MachineIRBuilder &B, 2193 const GlobalValue *GV, 2194 int64_t Offset, 2195 unsigned GAFlags) const { 2196 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2197 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2198 // to the following code sequence: 2199 // 2200 // For constant address space: 2201 // s_getpc_b64 s[0:1] 2202 // s_add_u32 s0, s0, $symbol 2203 // s_addc_u32 s1, s1, 0 2204 // 2205 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2206 // a fixup or relocation is emitted to replace $symbol with a literal 2207 // constant, which is a pc-relative offset from the encoding of the $symbol 2208 // operand to the global variable. 
2209 // 2210 // For global address space: 2211 // s_getpc_b64 s[0:1] 2212 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2213 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2214 // 2215 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2216 // fixups or relocations are emitted to replace $symbol@*@lo and 2217 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2218 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2219 // operand to the global variable. 2220 // 2221 // What we want here is an offset from the value returned by s_getpc 2222 // (which is the address of the s_add_u32 instruction) to the global 2223 // variable, but since the encoding of $symbol starts 4 bytes after the start 2224 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2225 // small. This requires us to add 4 to the global variable offset in order to 2226 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2227 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2228 // instruction. 2229 2230 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2231 2232 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2233 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2234 2235 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2236 .addDef(PCReg); 2237 2238 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2239 if (GAFlags == SIInstrInfo::MO_NONE) 2240 MIB.addImm(0); 2241 else 2242 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2243 2244 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2245 2246 if (PtrTy.getSizeInBits() == 32) 2247 B.buildExtract(DstReg, PCReg, 0); 2248 return true; 2249 } 2250 2251 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2252 MachineInstr &MI, MachineRegisterInfo &MRI, 2253 MachineIRBuilder &B) const { 2254 Register DstReg = MI.getOperand(0).getReg(); 2255 LLT Ty = MRI.getType(DstReg); 2256 unsigned AS = Ty.getAddressSpace(); 2257 2258 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2259 MachineFunction &MF = B.getMF(); 2260 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2261 2262 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2263 if (!MFI->isModuleEntryFunction()) { 2264 const Function &Fn = MF.getFunction(); 2265 DiagnosticInfoUnsupported BadLDSDecl( 2266 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2267 DS_Warning); 2268 Fn.getContext().diagnose(BadLDSDecl); 2269 2270 // We currently don't have a way to correctly allocate LDS objects that 2271 // aren't directly associated with a kernel. We do force inlining of 2272 // functions that use local objects. However, if these dead functions are 2273 // not eliminated, we don't want a compile time error. Just emit a warning 2274 // and a trap, since there should be no callable path here. 2275 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2276 B.buildUndef(DstReg); 2277 MI.eraseFromParent(); 2278 return true; 2279 } 2280 2281 // TODO: We could emit code to handle the initialization somewhere. 
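    // Sketch of the common case handled below (offset value purely for
    // illustration): an ordinary uninitialized LDS global is lowered to the
    // group-segment offset chosen by allocateLDSGlobal(), i.e. roughly
    //   %addr:_(p3) = G_CONSTANT i32 1024
    // while dynamic "extern __shared__"-style arrays resolve to the end of
    // the static allocations via the amdgcn.groupstaticsize intrinsic.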
2282 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2283 const SITargetLowering *TLI = ST.getTargetLowering(); 2284 if (!TLI->shouldUseLDSConstAddress(GV)) { 2285 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2286 return true; // Leave in place; 2287 } 2288 2289 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2290 Type *Ty = GV->getValueType(); 2291 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2292 // zero-sized type in other languages to declare the dynamic shared 2293 // memory which size is not known at the compile time. They will be 2294 // allocated by the runtime and placed directly after the static 2295 // allocated ones. They all share the same offset. 2296 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2297 // Adjust alignment for that dynamic shared memory array. 2298 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2299 LLT S32 = LLT::scalar(32); 2300 auto Sz = 2301 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2302 B.buildIntToPtr(DstReg, Sz); 2303 MI.eraseFromParent(); 2304 return true; 2305 } 2306 } 2307 2308 B.buildConstant( 2309 DstReg, 2310 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2311 MI.eraseFromParent(); 2312 return true; 2313 } 2314 2315 const Function &Fn = MF.getFunction(); 2316 DiagnosticInfoUnsupported BadInit( 2317 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2318 Fn.getContext().diagnose(BadInit); 2319 return true; 2320 } 2321 2322 const SITargetLowering *TLI = ST.getTargetLowering(); 2323 2324 if (TLI->shouldEmitFixup(GV)) { 2325 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2326 MI.eraseFromParent(); 2327 return true; 2328 } 2329 2330 if (TLI->shouldEmitPCReloc(GV)) { 2331 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2332 MI.eraseFromParent(); 2333 return true; 2334 } 2335 2336 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2337 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2338 2339 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2340 MachinePointerInfo::getGOT(MF), 2341 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2342 MachineMemOperand::MOInvariant, 2343 8 /*Size*/, Align(8)); 2344 2345 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2346 2347 if (Ty.getSizeInBits() == 32) { 2348 // Truncate if this is a 32-bit constant adrdess. 
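    // The GOT slot is always an 8-byte constant-address pointer, so for a
    // 32-bit destination we still load the full 64-bit entry and then keep
    // only its low 32 bits with the G_EXTRACT below. (The destination's
    // address space depends on the original global.)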
2349 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2350 B.buildExtract(DstReg, Load, 0); 2351 } else 2352 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2353 2354 MI.eraseFromParent(); 2355 return true; 2356 } 2357 2358 static LLT widenToNextPowerOf2(LLT Ty) { 2359 if (Ty.isVector()) 2360 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2361 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2362 } 2363 2364 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2365 MachineInstr &MI) const { 2366 MachineIRBuilder &B = Helper.MIRBuilder; 2367 MachineRegisterInfo &MRI = *B.getMRI(); 2368 GISelChangeObserver &Observer = Helper.Observer; 2369 2370 Register PtrReg = MI.getOperand(1).getReg(); 2371 LLT PtrTy = MRI.getType(PtrReg); 2372 unsigned AddrSpace = PtrTy.getAddressSpace(); 2373 2374 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2375 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2376 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2377 Observer.changingInstr(MI); 2378 MI.getOperand(1).setReg(Cast.getReg(0)); 2379 Observer.changedInstr(MI); 2380 return true; 2381 } 2382 2383 Register ValReg = MI.getOperand(0).getReg(); 2384 LLT ValTy = MRI.getType(ValReg); 2385 2386 MachineMemOperand *MMO = *MI.memoperands_begin(); 2387 const unsigned ValSize = ValTy.getSizeInBits(); 2388 const unsigned MemSize = 8 * MMO->getSize(); 2389 const Align MemAlign = MMO->getAlign(); 2390 const unsigned AlignInBits = 8 * MemAlign.value(); 2391 2392 // Widen non-power-of-2 loads to the alignment if needed 2393 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2394 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2395 2396 // This was already the correct extending load result type, so just adjust 2397 // the memory type. 2398 if (WideMemSize == ValSize) { 2399 MachineFunction &MF = B.getMF(); 2400 2401 MachineMemOperand *WideMMO = 2402 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2403 Observer.changingInstr(MI); 2404 MI.setMemRefs(MF, {WideMMO}); 2405 Observer.changedInstr(MI); 2406 return true; 2407 } 2408 2409 // Don't bother handling edge case that should probably never be produced. 2410 if (ValSize > WideMemSize) 2411 return false; 2412 2413 LLT WideTy = widenToNextPowerOf2(ValTy); 2414 2415 Register WideLoad; 2416 if (!WideTy.isVector()) { 2417 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2418 B.buildTrunc(ValReg, WideLoad).getReg(0); 2419 } else { 2420 // Extract the subvector. 2421 2422 if (isRegisterType(ValTy)) { 2423 // If this a case where G_EXTRACT is legal, use it. 2424 // (e.g. <3 x s32> -> <4 x s32>) 2425 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2426 B.buildExtract(ValReg, WideLoad, 0); 2427 } else { 2428 // For cases where the widened type isn't a nice register value, unmerge 2429 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2430 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2431 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2432 B.setInsertPt(B.getMBB(), MI.getIterator()); 2433 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2434 } 2435 } 2436 2437 MI.eraseFromParent(); 2438 return true; 2439 } 2440 2441 return false; 2442 } 2443 2444 bool AMDGPULegalizerInfo::legalizeFMad( 2445 MachineInstr &MI, MachineRegisterInfo &MRI, 2446 MachineIRBuilder &B) const { 2447 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2448 assert(Ty.isScalar()); 2449 2450 MachineFunction &MF = B.getMF(); 2451 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2452 2453 // TODO: Always legal with future ftz flag. 2454 // FIXME: Do we need just output? 2455 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2456 return true; 2457 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2458 return true; 2459 2460 MachineIRBuilder HelperBuilder(MI); 2461 GISelObserverWrapper DummyObserver; 2462 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2463 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2464 } 2465 2466 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2467 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2468 Register DstReg = MI.getOperand(0).getReg(); 2469 Register PtrReg = MI.getOperand(1).getReg(); 2470 Register CmpVal = MI.getOperand(2).getReg(); 2471 Register NewVal = MI.getOperand(3).getReg(); 2472 2473 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2474 "this should not have been custom lowered"); 2475 2476 LLT ValTy = MRI.getType(CmpVal); 2477 LLT VecTy = LLT::vector(2, ValTy); 2478 2479 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2480 2481 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2482 .addDef(DstReg) 2483 .addUse(PtrReg) 2484 .addUse(PackedVal) 2485 .setMemRefs(MI.memoperands()); 2486 2487 MI.eraseFromParent(); 2488 return true; 2489 } 2490 2491 bool AMDGPULegalizerInfo::legalizeFlog( 2492 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2493 Register Dst = MI.getOperand(0).getReg(); 2494 Register Src = MI.getOperand(1).getReg(); 2495 LLT Ty = B.getMRI()->getType(Dst); 2496 unsigned Flags = MI.getFlags(); 2497 2498 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2499 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2500 2501 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2507 MachineIRBuilder &B) const { 2508 Register Dst = MI.getOperand(0).getReg(); 2509 Register Src = MI.getOperand(1).getReg(); 2510 unsigned Flags = MI.getFlags(); 2511 LLT Ty = B.getMRI()->getType(Dst); 2512 2513 auto K = B.buildFConstant(Ty, numbers::log2e); 2514 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2515 B.buildFExp2(Dst, Mul, Flags); 2516 MI.eraseFromParent(); 2517 return true; 2518 } 2519 2520 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2521 MachineIRBuilder &B) const { 2522 Register Dst = MI.getOperand(0).getReg(); 2523 Register Src0 = MI.getOperand(1).getReg(); 2524 Register Src1 = MI.getOperand(2).getReg(); 2525 unsigned Flags = MI.getFlags(); 2526 LLT Ty = B.getMRI()->getType(Dst); 2527 const LLT S16 = LLT::scalar(16); 2528 const LLT S32 = LLT::scalar(32); 2529 2530 if (Ty == S32) { 2531 auto Log = B.buildFLog2(S32, Src0, Flags); 2532 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2533 .addUse(Log.getReg(0)) 2534 .addUse(Src1) 2535 .setMIFlags(Flags); 2536 B.buildFExp2(Dst, Mul, Flags); 2537 } else if (Ty == S16) { 2538 // There's no f16 fmul_legacy, so we need to convert for it. 2539 auto Log = B.buildFLog2(S16, Src0, Flags); 2540 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2541 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2542 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2543 .addUse(Ext0.getReg(0)) 2544 .addUse(Ext1.getReg(0)) 2545 .setMIFlags(Flags); 2546 2547 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2548 } else 2549 return false; 2550 2551 MI.eraseFromParent(); 2552 return true; 2553 } 2554 2555 // Find a source register, ignoring any possible source modifiers. 2556 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2557 Register ModSrc = OrigSrc; 2558 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2559 ModSrc = SrcFNeg->getOperand(1).getReg(); 2560 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2561 ModSrc = SrcFAbs->getOperand(1).getReg(); 2562 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2563 ModSrc = SrcFAbs->getOperand(1).getReg(); 2564 return ModSrc; 2565 } 2566 2567 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2568 MachineRegisterInfo &MRI, 2569 MachineIRBuilder &B) const { 2570 2571 const LLT S1 = LLT::scalar(1); 2572 const LLT S64 = LLT::scalar(64); 2573 Register Dst = MI.getOperand(0).getReg(); 2574 Register OrigSrc = MI.getOperand(1).getReg(); 2575 unsigned Flags = MI.getFlags(); 2576 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2577 "this should not have been custom lowered"); 2578 2579 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2580 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2581 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2582 // V_FRACT bug is: 2583 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2584 // 2585 // Convert floor(x) to (x - fract(x)) 2586 2587 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2588 .addUse(OrigSrc) 2589 .setMIFlags(Flags); 2590 2591 // Give source modifier matching some assistance before obscuring a foldable 2592 // pattern. 2593 2594 // TODO: We can avoid the neg on the fract? The input sign to fract 2595 // shouldn't matter? 2596 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2597 2598 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2599 2600 Register Min = MRI.createGenericVirtualRegister(S64); 2601 2602 // We don't need to concern ourselves with the snan handling difference, so 2603 // use the one which will directly select. 
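  // Note: the clamp constant built above is the largest f64 strictly below
  // 1.0 (1.0 - 2^-53, the "0.99999999999999999" of the workaround formula),
  // so after the min built just below, the fract value fed into
  // x - fract(x) can never reach 1.0.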
2604 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2605 if (MFI->getMode().IEEE) 2606 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2607 else 2608 B.buildFMinNum(Min, Fract, Const, Flags); 2609 2610 Register CorrectedFract = Min; 2611 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2612 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2613 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2614 } 2615 2616 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2617 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2618 2619 MI.eraseFromParent(); 2620 return true; 2621 } 2622 2623 // Turn an illegal packed v2s16 build vector into bit operations. 2624 // TODO: This should probably be a bitcast action in LegalizerHelper. 2625 bool AMDGPULegalizerInfo::legalizeBuildVector( 2626 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2627 Register Dst = MI.getOperand(0).getReg(); 2628 const LLT S32 = LLT::scalar(32); 2629 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2630 2631 Register Src0 = MI.getOperand(1).getReg(); 2632 Register Src1 = MI.getOperand(2).getReg(); 2633 assert(MRI.getType(Src0) == LLT::scalar(16)); 2634 2635 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2636 B.buildBitcast(Dst, Merge); 2637 2638 MI.eraseFromParent(); 2639 return true; 2640 } 2641 2642 // Check that this is a G_XOR x, -1 2643 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2644 if (MI.getOpcode() != TargetOpcode::G_XOR) 2645 return false; 2646 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2647 return ConstVal && *ConstVal == -1; 2648 } 2649 2650 // Return the use branch instruction, otherwise null if the usage is invalid. 2651 static MachineInstr * 2652 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2653 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2654 Register CondDef = MI.getOperand(0).getReg(); 2655 if (!MRI.hasOneNonDBGUse(CondDef)) 2656 return nullptr; 2657 2658 MachineBasicBlock *Parent = MI.getParent(); 2659 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2660 2661 if (isNot(MRI, *UseMI)) { 2662 Register NegatedCond = UseMI->getOperand(0).getReg(); 2663 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2664 return nullptr; 2665 2666 // We're deleting the def of this value, so we need to remove it. 2667 UseMI->eraseFromParent(); 2668 2669 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2670 Negated = true; 2671 } 2672 2673 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2674 return nullptr; 2675 2676 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2677 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2678 if (Next == Parent->end()) { 2679 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2680 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2681 return nullptr; 2682 UncondBrTarget = &*NextMBB; 2683 } else { 2684 if (Next->getOpcode() != AMDGPU::G_BR) 2685 return nullptr; 2686 Br = &*Next; 2687 UncondBrTarget = Br->getOperand(0).getMBB(); 2688 } 2689 2690 return UseMI; 2691 } 2692 2693 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2694 const ArgDescriptor *Arg, 2695 const TargetRegisterClass *ArgRC, 2696 LLT ArgTy) const { 2697 MCRegister SrcReg = Arg->getRegister(); 2698 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2699 assert(DstReg.isVirtual() && "Virtual register expected"); 2700 2701 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2702 ArgTy); 2703 if (Arg->isMasked()) { 2704 // TODO: Should we try to emit this once in the entry block? 2705 const LLT S32 = LLT::scalar(32); 2706 const unsigned Mask = Arg->getMask(); 2707 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2708 2709 Register AndMaskSrc = LiveIn; 2710 2711 if (Shift != 0) { 2712 auto ShiftAmt = B.buildConstant(S32, Shift); 2713 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2714 } 2715 2716 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2717 } else { 2718 B.buildCopy(DstReg, LiveIn); 2719 } 2720 2721 return true; 2722 } 2723 2724 bool AMDGPULegalizerInfo::loadInputValue( 2725 Register DstReg, MachineIRBuilder &B, 2726 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2727 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2728 const ArgDescriptor *Arg; 2729 const TargetRegisterClass *ArgRC; 2730 LLT ArgTy; 2731 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2732 2733 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2734 return false; // TODO: Handle these 2735 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2736 } 2737 2738 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2739 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2740 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2741 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2742 return false; 2743 2744 MI.eraseFromParent(); 2745 return true; 2746 } 2747 2748 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2749 MachineRegisterInfo &MRI, 2750 MachineIRBuilder &B) const { 2751 Register Dst = MI.getOperand(0).getReg(); 2752 LLT DstTy = MRI.getType(Dst); 2753 LLT S16 = LLT::scalar(16); 2754 LLT S32 = LLT::scalar(32); 2755 LLT S64 = LLT::scalar(64); 2756 2757 if (DstTy == S16) 2758 return legalizeFDIV16(MI, MRI, B); 2759 if (DstTy == S32) 2760 return legalizeFDIV32(MI, MRI, B); 2761 if (DstTy == S64) 2762 return legalizeFDIV64(MI, MRI, B); 2763 2764 return false; 2765 } 2766 2767 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2768 Register DstReg, 2769 Register X, 2770 Register Y, 2771 bool IsDiv) const { 2772 const LLT S1 = LLT::scalar(1); 2773 const LLT S32 = LLT::scalar(32); 2774 2775 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2776 // algorithm used here. 2777 2778 // Initial estimate of inv(y). 2779 auto FloatY = B.buildUITOFP(S32, Y); 2780 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2781 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2782 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2783 auto Z = B.buildFPTOUI(S32, ScaledY); 2784 2785 // One round of UNR. 
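  // "UNR" = one unsigned Newton-Raphson step refining z ~= 2^32 / y,
  // conceptually z' = z + z * (1 - y*z / 2^32), expressed below in fixed
  // point with a negate, a low 32-bit multiply and G_UMULH. The refined z'
  // feeds the quotient estimate q = umulh(x, z'), and the two conditional
  // rounds after it nudge q and r = x - q*y to their final values.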
2786 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2787 auto NegYZ = B.buildMul(S32, NegY, Z); 2788 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2789 2790 // Quotient/remainder estimate. 2791 auto Q = B.buildUMulH(S32, X, Z); 2792 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2793 2794 // First quotient/remainder refinement. 2795 auto One = B.buildConstant(S32, 1); 2796 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2797 if (IsDiv) 2798 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2799 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2800 2801 // Second quotient/remainder refinement. 2802 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2803 if (IsDiv) 2804 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2805 else 2806 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2807 } 2808 2809 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2810 MachineRegisterInfo &MRI, 2811 MachineIRBuilder &B) const { 2812 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2813 Register DstReg = MI.getOperand(0).getReg(); 2814 Register Num = MI.getOperand(1).getReg(); 2815 Register Den = MI.getOperand(2).getReg(); 2816 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2817 MI.eraseFromParent(); 2818 return true; 2819 } 2820 2821 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2822 // 2823 // Return lo, hi of result 2824 // 2825 // %cvt.lo = G_UITOFP Val.lo 2826 // %cvt.hi = G_UITOFP Val.hi 2827 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2828 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2829 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2830 // %mul2 = G_FMUL %mul1, 2**(-32) 2831 // %trunc = G_INTRINSIC_TRUNC %mul2 2832 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2833 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2834 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2835 Register Val) { 2836 const LLT S32 = LLT::scalar(32); 2837 auto Unmerge = B.buildUnmerge(S32, Val); 2838 2839 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2840 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2841 2842 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2843 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2844 2845 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2846 auto Mul1 = 2847 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2848 2849 // 2**(-32) 2850 auto Mul2 = 2851 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2852 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2853 2854 // -(2**32) 2855 auto Mad2 = B.buildFMAD(S32, Trunc, 2856 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2857 2858 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2859 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2860 2861 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2862 } 2863 2864 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2865 Register DstReg, 2866 Register Numer, 2867 Register Denom, 2868 bool IsDiv) const { 2869 const LLT S32 = LLT::scalar(32); 2870 const LLT S64 = LLT::scalar(64); 2871 const LLT S1 = LLT::scalar(1); 2872 Register RcpLo, RcpHi; 2873 2874 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2875 2876 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2877 2878 auto Zero64 = B.buildConstant(S64, 0); 2879 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2880 2881 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2882 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2883 2884 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2885 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2886 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2887 2888 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2889 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2890 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2891 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2892 2893 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2894 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2895 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2896 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2897 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2898 2899 auto Zero32 = B.buildConstant(S32, 0); 2900 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2901 auto Add2_HiC = 2902 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2903 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2904 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2905 2906 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2907 Register NumerLo = UnmergeNumer.getReg(0); 2908 Register NumerHi = UnmergeNumer.getReg(1); 2909 2910 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2911 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2912 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2913 Register Mul3_Lo = UnmergeMul3.getReg(0); 2914 Register Mul3_Hi = UnmergeMul3.getReg(1); 2915 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2916 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2917 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2918 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2919 2920 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2921 Register DenomLo = UnmergeDenom.getReg(0); 2922 Register DenomHi = UnmergeDenom.getReg(1); 2923 2924 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2925 auto C1 = B.buildSExt(S32, CmpHi); 2926 2927 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2928 auto C2 = B.buildSExt(S32, CmpLo); 2929 2930 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2931 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2932 2933 // TODO: Here and below portions of the code can be enclosed into if/endif. 2934 // Currently control flow is unconditional and we have 4 selects after 2935 // potential endif to substitute PHIs. 2936 2937 // if C3 != 0 ... 
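  // C3 (and C6 below) are -1/0 masks: the 64-bit "remainder >= denominator"
  // test is done 32 bits at a time (compare the high words, with the
  // low-word compare breaking a tie), then sign-extended so it can feed
  // plain selects. Conceptually the straight-line code below implements
  //   if (r >= d) { q += 1; r -= d;
  //     if (r >= d) { q += 1; r -= d; } }
  // using the G_SELECTs at the end instead of real control flow.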
2938 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2939 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2940 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2941 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2942 2943 auto One64 = B.buildConstant(S64, 1); 2944 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2945 2946 auto C4 = 2947 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2948 auto C5 = 2949 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2950 auto C6 = B.buildSelect( 2951 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2952 2953 // if (C6 != 0) 2954 auto Add4 = B.buildAdd(S64, Add3, One64); 2955 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2956 2957 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2958 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2959 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2960 2961 // endif C6 2962 // endif C3 2963 2964 if (IsDiv) { 2965 auto Sel1 = B.buildSelect( 2966 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2967 B.buildSelect(DstReg, 2968 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2969 } else { 2970 auto Sel2 = B.buildSelect( 2971 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2972 B.buildSelect(DstReg, 2973 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2974 } 2975 } 2976 2977 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2978 MachineRegisterInfo &MRI, 2979 MachineIRBuilder &B) const { 2980 const LLT S64 = LLT::scalar(64); 2981 const LLT S32 = LLT::scalar(32); 2982 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2983 Register DstReg = MI.getOperand(0).getReg(); 2984 Register Num = MI.getOperand(1).getReg(); 2985 Register Den = MI.getOperand(2).getReg(); 2986 LLT Ty = MRI.getType(DstReg); 2987 2988 if (Ty == S32) 2989 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2990 else if (Ty == S64) 2991 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2992 else 2993 return false; 2994 2995 MI.eraseFromParent(); 2996 return true; 2997 2998 } 2999 3000 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3001 MachineRegisterInfo &MRI, 3002 MachineIRBuilder &B) const { 3003 const LLT S64 = LLT::scalar(64); 3004 const LLT S32 = LLT::scalar(32); 3005 3006 Register DstReg = MI.getOperand(0).getReg(); 3007 const LLT Ty = MRI.getType(DstReg); 3008 if (Ty != S32 && Ty != S64) 3009 return false; 3010 3011 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3012 3013 Register LHS = MI.getOperand(1).getReg(); 3014 Register RHS = MI.getOperand(2).getReg(); 3015 3016 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3017 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3018 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3019 3020 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3021 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3022 3023 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3024 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3025 3026 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3027 if (Ty == S32) 3028 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3029 else 3030 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3031 3032 Register Sign; 3033 if (IsDiv) 3034 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3035 else 3036 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3037 3038 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3039 B.buildSub(DstReg, UDivRem, Sign); 3040 3041 MI.eraseFromParent(); 3042 return true; 3043 } 3044 3045 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3046 MachineRegisterInfo &MRI, 3047 MachineIRBuilder &B) const { 3048 Register Res = MI.getOperand(0).getReg(); 3049 Register LHS = MI.getOperand(1).getReg(); 3050 Register RHS = MI.getOperand(2).getReg(); 3051 uint16_t Flags = MI.getFlags(); 3052 LLT ResTy = MRI.getType(Res); 3053 3054 const MachineFunction &MF = B.getMF(); 3055 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3056 MI.getFlag(MachineInstr::FmAfn); 3057 3058 if (!AllowInaccurateRcp) 3059 return false; 3060 3061 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3062 // 1 / x -> RCP(x) 3063 if (CLHS->isExactlyValue(1.0)) { 3064 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3065 .addUse(RHS) 3066 .setMIFlags(Flags); 3067 3068 MI.eraseFromParent(); 3069 return true; 3070 } 3071 3072 // -1 / x -> RCP( FNEG(x) ) 3073 if (CLHS->isExactlyValue(-1.0)) { 3074 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3075 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3076 .addUse(FNeg.getReg(0)) 3077 .setMIFlags(Flags); 3078 3079 MI.eraseFromParent(); 3080 return true; 3081 } 3082 } 3083 3084 // x / y -> x * (1.0 / y) 3085 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3086 .addUse(RHS) 3087 .setMIFlags(Flags); 3088 B.buildFMul(Res, LHS, RCP, Flags); 3089 3090 MI.eraseFromParent(); 3091 return true; 3092 } 3093 3094 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3095 MachineRegisterInfo &MRI, 3096 MachineIRBuilder &B) const { 3097 Register Res = MI.getOperand(0).getReg(); 3098 Register X = MI.getOperand(1).getReg(); 3099 Register Y = MI.getOperand(2).getReg(); 3100 uint16_t Flags = MI.getFlags(); 3101 LLT ResTy = MRI.getType(Res); 3102 3103 const MachineFunction &MF = B.getMF(); 3104 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3105 MI.getFlag(MachineInstr::FmAfn); 3106 3107 if (!AllowInaccurateRcp) 3108 return false; 3109 3110 auto NegY = B.buildFNeg(ResTy, Y); 3111 auto One = B.buildFConstant(ResTy, 1.0); 3112 3113 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3114 .addUse(Y) 3115 .setMIFlags(Flags); 3116 3117 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3118 R = B.buildFMA(ResTy, Tmp0, R, R); 3119 3120 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3121 R = B.buildFMA(ResTy, Tmp1, R, R); 3122 3123 auto Ret = B.buildFMul(ResTy, X, R); 3124 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3125 3126 B.buildFMA(Res, Tmp2, R, Ret); 3127 MI.eraseFromParent(); 3128 return true; 3129 } 3130 3131 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3132 MachineRegisterInfo &MRI, 3133 MachineIRBuilder &B) const { 3134 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3135 return true; 3136 3137 Register Res = MI.getOperand(0).getReg(); 3138 Register LHS = MI.getOperand(1).getReg(); 3139 Register RHS = MI.getOperand(2).getReg(); 3140 3141 uint16_t Flags = MI.getFlags(); 3142 3143 LLT S16 = LLT::scalar(16); 3144 LLT S32 = LLT::scalar(32); 3145 3146 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3147 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3148 3149 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3150 .addUse(RHSExt.getReg(0)) 3151 .setMIFlags(Flags); 3152 3153 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3154 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3155 3156 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3157 
.addUse(RDst.getReg(0)) 3158 .addUse(RHS) 3159 .addUse(LHS) 3160 .setMIFlags(Flags); 3161 3162 MI.eraseFromParent(); 3163 return true; 3164 } 3165 3166 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3167 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3168 static void toggleSPDenormMode(bool Enable, 3169 MachineIRBuilder &B, 3170 const GCNSubtarget &ST, 3171 AMDGPU::SIModeRegisterDefaults Mode) { 3172 // Set SP denorm mode to this value. 3173 unsigned SPDenormMode = 3174 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3175 3176 if (ST.hasDenormModeInst()) { 3177 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3178 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3179 3180 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3181 B.buildInstr(AMDGPU::S_DENORM_MODE) 3182 .addImm(NewDenormModeValue); 3183 3184 } else { 3185 // Select FP32 bit field in mode register. 3186 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3187 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3188 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3189 3190 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3191 .addImm(SPDenormMode) 3192 .addImm(SPDenormModeBitField); 3193 } 3194 } 3195 3196 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3197 MachineRegisterInfo &MRI, 3198 MachineIRBuilder &B) const { 3199 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3200 return true; 3201 3202 Register Res = MI.getOperand(0).getReg(); 3203 Register LHS = MI.getOperand(1).getReg(); 3204 Register RHS = MI.getOperand(2).getReg(); 3205 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3206 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3207 3208 uint16_t Flags = MI.getFlags(); 3209 3210 LLT S32 = LLT::scalar(32); 3211 LLT S1 = LLT::scalar(1); 3212 3213 auto One = B.buildFConstant(S32, 1.0f); 3214 3215 auto DenominatorScaled = 3216 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3217 .addUse(LHS) 3218 .addUse(RHS) 3219 .addImm(0) 3220 .setMIFlags(Flags); 3221 auto NumeratorScaled = 3222 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3223 .addUse(LHS) 3224 .addUse(RHS) 3225 .addImm(1) 3226 .setMIFlags(Flags); 3227 3228 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3229 .addUse(DenominatorScaled.getReg(0)) 3230 .setMIFlags(Flags); 3231 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3232 3233 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3234 // aren't modeled as reading it. 
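  // Rough shape of the expansion below, with scaled inputs a', b' and
  // e = rcp(b'):
  //   Fma0 = 1 - b'*e          Fma1 = e + e*Fma0    (one Newton step on 1/b')
  //   Mul  = a'*Fma1           Fma2 = a' - b'*Mul   (residual)
  //   Fma3 = Mul + Fma2*Fma1   Fma4 = a' - b'*Fma3
  // div_fmas then combines Fma4/Fma1/Fma3 using the scale flag from
  // div_scale, and div_fixup produces the final quotient, handling the
  // special cases (zero, infinity, NaN). The FMAs run with FP32 denormals
  // enabled (see the mode toggles around them) when the function's default
  // mode would otherwise flush them.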
3235 if (!Mode.allFP32Denormals()) 3236 toggleSPDenormMode(true, B, ST, Mode); 3237 3238 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3239 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3240 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3241 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3242 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3243 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3244 3245 if (!Mode.allFP32Denormals()) 3246 toggleSPDenormMode(false, B, ST, Mode); 3247 3248 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3249 .addUse(Fma4.getReg(0)) 3250 .addUse(Fma1.getReg(0)) 3251 .addUse(Fma3.getReg(0)) 3252 .addUse(NumeratorScaled.getReg(1)) 3253 .setMIFlags(Flags); 3254 3255 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3256 .addUse(Fmas.getReg(0)) 3257 .addUse(RHS) 3258 .addUse(LHS) 3259 .setMIFlags(Flags); 3260 3261 MI.eraseFromParent(); 3262 return true; 3263 } 3264 3265 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3266 MachineRegisterInfo &MRI, 3267 MachineIRBuilder &B) const { 3268 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3269 return true; 3270 3271 Register Res = MI.getOperand(0).getReg(); 3272 Register LHS = MI.getOperand(1).getReg(); 3273 Register RHS = MI.getOperand(2).getReg(); 3274 3275 uint16_t Flags = MI.getFlags(); 3276 3277 LLT S64 = LLT::scalar(64); 3278 LLT S1 = LLT::scalar(1); 3279 3280 auto One = B.buildFConstant(S64, 1.0); 3281 3282 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3283 .addUse(LHS) 3284 .addUse(RHS) 3285 .addImm(0) 3286 .setMIFlags(Flags); 3287 3288 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3289 3290 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3291 .addUse(DivScale0.getReg(0)) 3292 .setMIFlags(Flags); 3293 3294 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3295 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3296 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3297 3298 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3299 .addUse(LHS) 3300 .addUse(RHS) 3301 .addImm(1) 3302 .setMIFlags(Flags); 3303 3304 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3305 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3306 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3307 3308 Register Scale; 3309 if (!ST.hasUsableDivScaleConditionOutput()) { 3310 // Workaround a hardware bug on SI where the condition output from div_scale 3311 // is not usable. 
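    // In effect: rebuild the flag div_fmas needs from scratch. The high 32
    // bits of the two div_scale results are compared against the high bits
    // of the original numerator and denominator, and the two equality tests
    // are XORed; that value stands in for the unusable condition output.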
3312 3313 LLT S32 = LLT::scalar(32); 3314 3315 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3316 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3317 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3318 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3319 3320 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3321 Scale1Unmerge.getReg(1)); 3322 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3323 Scale0Unmerge.getReg(1)); 3324 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3325 } else { 3326 Scale = DivScale1.getReg(1); 3327 } 3328 3329 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3330 .addUse(Fma4.getReg(0)) 3331 .addUse(Fma3.getReg(0)) 3332 .addUse(Mul.getReg(0)) 3333 .addUse(Scale) 3334 .setMIFlags(Flags); 3335 3336 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3337 .addUse(Fmas.getReg(0)) 3338 .addUse(RHS) 3339 .addUse(LHS) 3340 .setMIFlags(Flags); 3341 3342 MI.eraseFromParent(); 3343 return true; 3344 } 3345 3346 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3347 MachineRegisterInfo &MRI, 3348 MachineIRBuilder &B) const { 3349 Register Res = MI.getOperand(0).getReg(); 3350 Register LHS = MI.getOperand(2).getReg(); 3351 Register RHS = MI.getOperand(3).getReg(); 3352 uint16_t Flags = MI.getFlags(); 3353 3354 LLT S32 = LLT::scalar(32); 3355 LLT S1 = LLT::scalar(1); 3356 3357 auto Abs = B.buildFAbs(S32, RHS, Flags); 3358 const APFloat C0Val(1.0f); 3359 3360 auto C0 = B.buildConstant(S32, 0x6f800000); 3361 auto C1 = B.buildConstant(S32, 0x2f800000); 3362 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3363 3364 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3365 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3366 3367 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3368 3369 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3370 .addUse(Mul0.getReg(0)) 3371 .setMIFlags(Flags); 3372 3373 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3374 3375 B.buildFMul(Res, Sel, Mul1, Flags); 3376 3377 MI.eraseFromParent(); 3378 return true; 3379 } 3380 3381 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3382 // FIXME: Why do we handle this one but not other removed instructions? 3383 // 3384 // Reciprocal square root. The clamp prevents infinite results, clamping 3385 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3386 // +-max_float. 3387 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3388 MachineRegisterInfo &MRI, 3389 MachineIRBuilder &B) const { 3390 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3391 return true; 3392 3393 Register Dst = MI.getOperand(0).getReg(); 3394 Register Src = MI.getOperand(2).getReg(); 3395 auto Flags = MI.getFlags(); 3396 3397 LLT Ty = MRI.getType(Dst); 3398 3399 const fltSemantics *FltSemantics; 3400 if (Ty == LLT::scalar(32)) 3401 FltSemantics = &APFloat::IEEEsingle(); 3402 else if (Ty == LLT::scalar(64)) 3403 FltSemantics = &APFloat::IEEEdouble(); 3404 else 3405 return false; 3406 3407 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3408 .addUse(Src) 3409 .setMIFlags(Flags); 3410 3411 // We don't need to concern ourselves with the snan handling difference, since 3412 // the rsq quieted (or not) so use the one which will directly select. 
3413 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3414 const bool UseIEEE = MFI->getMode().IEEE; 3415 3416 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3417 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3418 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3419 3420 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3421 3422 if (UseIEEE) 3423 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3424 else 3425 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3426 MI.eraseFromParent(); 3427 return true; 3428 } 3429 3430 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3431 switch (IID) { 3432 case Intrinsic::amdgcn_ds_fadd: 3433 return AMDGPU::G_ATOMICRMW_FADD; 3434 case Intrinsic::amdgcn_ds_fmin: 3435 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3436 case Intrinsic::amdgcn_ds_fmax: 3437 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3438 default: 3439 llvm_unreachable("not a DS FP intrinsic"); 3440 } 3441 } 3442 3443 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3444 MachineInstr &MI, 3445 Intrinsic::ID IID) const { 3446 GISelChangeObserver &Observer = Helper.Observer; 3447 Observer.changingInstr(MI); 3448 3449 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3450 3451 // The remaining operands were used to set fields in the MemOperand on 3452 // construction. 3453 for (int I = 6; I > 3; --I) 3454 MI.RemoveOperand(I); 3455 3456 MI.RemoveOperand(1); // Remove the intrinsic ID. 3457 Observer.changedInstr(MI); 3458 return true; 3459 } 3460 3461 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3462 MachineRegisterInfo &MRI, 3463 MachineIRBuilder &B) const { 3464 uint64_t Offset = 3465 ST.getTargetLowering()->getImplicitParameterOffset( 3466 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3467 LLT DstTy = MRI.getType(DstReg); 3468 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3469 3470 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3471 if (!loadInputValue(KernargPtrReg, B, 3472 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3473 return false; 3474 3475 // FIXME: This should be nuw 3476 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3477 return true; 3478 } 3479 3480 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3481 MachineRegisterInfo &MRI, 3482 MachineIRBuilder &B) const { 3483 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3484 if (!MFI->isEntryFunction()) { 3485 return legalizePreloadedArgIntrin(MI, MRI, B, 3486 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3487 } 3488 3489 Register DstReg = MI.getOperand(0).getReg(); 3490 if (!getImplicitArgPtr(DstReg, MRI, B)) 3491 return false; 3492 3493 MI.eraseFromParent(); 3494 return true; 3495 } 3496 3497 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3498 MachineRegisterInfo &MRI, 3499 MachineIRBuilder &B, 3500 unsigned AddrSpace) const { 3501 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3502 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3503 Register Hi32 = Unmerge.getReg(1); 3504 3505 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3506 MI.eraseFromParent(); 3507 return true; 3508 } 3509 3510 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3511 // offset (the offset that is included in bounds checking and swizzling, to be 3512 // split between the instruction's voffset and immoffset fields) 
and soffset 3513 // (the offset that is excluded from bounds checking and swizzling, to go in 3514 // the instruction's soffset field). This function takes the first kind of 3515 // offset and figures out how to split it between voffset and immoffset. 3516 std::tuple<Register, unsigned, unsigned> 3517 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3518 Register OrigOffset) const { 3519 const unsigned MaxImm = 4095; 3520 Register BaseReg; 3521 unsigned TotalConstOffset; 3522 const LLT S32 = LLT::scalar(32); 3523 MachineRegisterInfo &MRI = *B.getMRI(); 3524 3525 std::tie(BaseReg, TotalConstOffset) = 3526 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3527 3528 unsigned ImmOffset = TotalConstOffset; 3529 3530 // If BaseReg is a pointer, convert it to int. 3531 if (MRI.getType(BaseReg).isPointer()) 3532 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3533 3534 // If the immediate value is too big for the immoffset field, put the value 3535 // and -4096 into the immoffset field so that the value that is copied/added 3536 // for the voffset field is a multiple of 4096, and it stands more chance 3537 // of being CSEd with the copy/add for another similar load/store. 3538 // However, do not do that rounding down to a multiple of 4096 if that is a 3539 // negative number, as it appears to be illegal to have a negative offset 3540 // in the vgpr, even if adding the immediate offset makes it positive. 3541 unsigned Overflow = ImmOffset & ~MaxImm; 3542 ImmOffset -= Overflow; 3543 if ((int32_t)Overflow < 0) { 3544 Overflow += ImmOffset; 3545 ImmOffset = 0; 3546 } 3547 3548 if (Overflow != 0) { 3549 if (!BaseReg) { 3550 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3551 } else { 3552 auto OverflowVal = B.buildConstant(S32, Overflow); 3553 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3554 } 3555 } 3556 3557 if (!BaseReg) 3558 BaseReg = B.buildConstant(S32, 0).getReg(0); 3559 3560 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3561 } 3562 3563 /// Handle register layout difference for f16 images for some subtargets. 
3564 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3565 MachineRegisterInfo &MRI, 3566 Register Reg, 3567 bool ImageStore) const { 3568 const LLT S16 = LLT::scalar(16); 3569 const LLT S32 = LLT::scalar(32); 3570 LLT StoreVT = MRI.getType(Reg); 3571 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3572 3573 if (ST.hasUnpackedD16VMem()) { 3574 auto Unmerge = B.buildUnmerge(S16, Reg); 3575 3576 SmallVector<Register, 4> WideRegs; 3577 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3578 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3579 3580 int NumElts = StoreVT.getNumElements(); 3581 3582 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3583 } 3584 3585 if (ImageStore && ST.hasImageStoreD16Bug()) { 3586 if (StoreVT.getNumElements() == 2) { 3587 SmallVector<Register, 4> PackedRegs; 3588 Reg = B.buildBitcast(S32, Reg).getReg(0); 3589 PackedRegs.push_back(Reg); 3590 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3591 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3592 } 3593 3594 if (StoreVT.getNumElements() == 3) { 3595 SmallVector<Register, 4> PackedRegs; 3596 auto Unmerge = B.buildUnmerge(S16, Reg); 3597 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3598 PackedRegs.push_back(Unmerge.getReg(I)); 3599 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3600 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3601 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3602 } 3603 3604 if (StoreVT.getNumElements() == 4) { 3605 SmallVector<Register, 4> PackedRegs; 3606 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3607 auto Unmerge = B.buildUnmerge(S32, Reg); 3608 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3609 PackedRegs.push_back(Unmerge.getReg(I)); 3610 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3611 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3612 } 3613 3614 llvm_unreachable("invalid data type"); 3615 } 3616 3617 return Reg; 3618 } 3619 3620 Register AMDGPULegalizerInfo::fixStoreSourceType( 3621 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3622 MachineRegisterInfo *MRI = B.getMRI(); 3623 LLT Ty = MRI->getType(VData); 3624 3625 const LLT S16 = LLT::scalar(16); 3626 3627 // Fixup illegal register types for i8 stores. 3628 if (Ty == LLT::scalar(8) || Ty == S16) { 3629 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3630 return AnyExt; 3631 } 3632 3633 if (Ty.isVector()) { 3634 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3635 if (IsFormat) 3636 return handleD16VData(B, *MRI, VData); 3637 } 3638 } 3639 3640 return VData; 3641 } 3642 3643 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3644 MachineRegisterInfo &MRI, 3645 MachineIRBuilder &B, 3646 bool IsTyped, 3647 bool IsFormat) const { 3648 Register VData = MI.getOperand(1).getReg(); 3649 LLT Ty = MRI.getType(VData); 3650 LLT EltTy = Ty.getScalarType(); 3651 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3652 const LLT S32 = LLT::scalar(32); 3653 3654 VData = fixStoreSourceType(B, VData, IsFormat); 3655 Register RSrc = MI.getOperand(2).getReg(); 3656 3657 MachineMemOperand *MMO = *MI.memoperands_begin(); 3658 const int MemSize = MMO->getSize(); 3659 3660 unsigned ImmOffset; 3661 unsigned TotalOffset; 3662 3663 // The typed intrinsics add an immediate after the registers. 3664 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3665 3666 // The struct intrinsic variants add one additional operand over raw. 3667 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3668 Register VIndex; 3669 int OpOffset = 0; 3670 if (HasVIndex) { 3671 VIndex = MI.getOperand(3).getReg(); 3672 OpOffset = 1; 3673 } 3674 3675 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3676 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3677 3678 unsigned Format = 0; 3679 if (IsTyped) { 3680 Format = MI.getOperand(5 + OpOffset).getImm(); 3681 ++OpOffset; 3682 } 3683 3684 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3685 3686 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3687 if (TotalOffset != 0) 3688 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3689 3690 unsigned Opc; 3691 if (IsTyped) { 3692 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3693 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3694 } else if (IsFormat) { 3695 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3696 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3697 } else { 3698 switch (MemSize) { 3699 case 1: 3700 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3701 break; 3702 case 2: 3703 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3704 break; 3705 default: 3706 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3707 break; 3708 } 3709 } 3710 3711 if (!VIndex) 3712 VIndex = B.buildConstant(S32, 0).getReg(0); 3713 3714 auto MIB = B.buildInstr(Opc) 3715 .addUse(VData) // vdata 3716 .addUse(RSrc) // rsrc 3717 .addUse(VIndex) // vindex 3718 .addUse(VOffset) // voffset 3719 .addUse(SOffset) // soffset 3720 .addImm(ImmOffset); // offset(imm) 3721 3722 if (IsTyped) 3723 MIB.addImm(Format); 3724 3725 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3726 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3727 .addMemOperand(MMO); 3728 3729 MI.eraseFromParent(); 3730 return true; 3731 } 3732 3733 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3734 MachineRegisterInfo &MRI, 3735 MachineIRBuilder &B, 3736 bool IsFormat, 3737 bool IsTyped) const { 3738 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3739 MachineMemOperand *MMO = *MI.memoperands_begin(); 3740 const int MemSize = MMO->getSize(); 3741 const LLT S32 = LLT::scalar(32); 3742 3743 Register Dst = MI.getOperand(0).getReg(); 3744 Register RSrc = MI.getOperand(2).getReg(); 3745 3746 // The typed intrinsics add an immediate after the registers. 3747 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3748 3749 // The struct intrinsic variants add one additional operand over raw. 
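  // With a vindex operand present, the operand layout is:
  //   dst, intrinsic ID, rsrc, vindex, voffset, soffset, [format,] aux
  // i.e. 8 operands for the typed variants and 7 otherwise.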
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending-load case; truncate it back
    // down to the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to the original 16-bit vector result.
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3857 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3858 B.buildInstr(Opc) 3859 .addDef(MI.getOperand(0).getReg()) 3860 .addUse(MI.getOperand(2).getReg()) 3861 .addUse(MI.getOperand(3).getReg()) 3862 .cloneMemRefs(MI); 3863 MI.eraseFromParent(); 3864 return true; 3865 } 3866 3867 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3868 switch (IntrID) { 3869 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3870 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3871 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3872 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3873 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3874 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3875 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3876 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3877 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3878 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3879 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3881 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3882 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3883 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3884 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3885 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3886 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3887 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3888 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3889 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3890 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3891 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3893 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3894 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3896 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3897 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3899 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3900 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3902 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3903 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3905 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3906 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3908 case Intrinsic::amdgcn_buffer_atomic_fadd: 3909 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3910 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3911 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3912 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3913 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3915 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3916 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3918 default: 3919 llvm_unreachable("unhandled atomic opcode"); 3920 } 3921 } 3922 3923 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3924 MachineIRBuilder &B, 3925 Intrinsic::ID IID) const { 3926 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3927 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3928 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3929 3930 Register Dst; 3931 3932 int OpOffset = 0; 3933 if (HasReturn) { 3934 // A few FP atomics do not support return values. 
3935 Dst = MI.getOperand(0).getReg(); 3936 } else { 3937 OpOffset = -1; 3938 } 3939 3940 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3941 Register CmpVal; 3942 3943 if (IsCmpSwap) { 3944 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3945 ++OpOffset; 3946 } 3947 3948 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3949 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3950 3951 // The struct intrinsic variants add one additional operand over raw. 3952 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3953 Register VIndex; 3954 if (HasVIndex) { 3955 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3956 ++OpOffset; 3957 } 3958 3959 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3960 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3961 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3962 3963 MachineMemOperand *MMO = *MI.memoperands_begin(); 3964 3965 unsigned ImmOffset; 3966 unsigned TotalOffset; 3967 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3968 if (TotalOffset != 0) 3969 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3970 3971 if (!VIndex) 3972 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3973 3974 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3975 3976 if (HasReturn) 3977 MIB.addDef(Dst); 3978 3979 MIB.addUse(VData); // vdata 3980 3981 if (IsCmpSwap) 3982 MIB.addReg(CmpVal); 3983 3984 MIB.addUse(RSrc) // rsrc 3985 .addUse(VIndex) // vindex 3986 .addUse(VOffset) // voffset 3987 .addUse(SOffset) // soffset 3988 .addImm(ImmOffset) // offset(imm) 3989 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3990 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3991 .addMemOperand(MMO); 3992 3993 MI.eraseFromParent(); 3994 return true; 3995 } 3996 3997 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3998 /// vector with s16 typed elements. 3999 static void packImageA16AddressToDwords( 4000 MachineIRBuilder &B, MachineInstr &MI, 4001 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4002 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4003 const LLT S16 = LLT::scalar(16); 4004 const LLT V2S16 = LLT::vector(2, 16); 4005 4006 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4007 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4008 if (!SrcOp.isReg()) 4009 continue; // _L to _LZ may have eliminated this. 4010 4011 Register AddrReg = SrcOp.getReg(); 4012 4013 if (I < Intr->GradientStart) { 4014 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4015 PackedAddrs.push_back(AddrReg); 4016 } else { 4017 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4018 // derivatives dx/dh and dx/dv are packed with undef. 
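      // For example, with the 3D gradients {dx/dh, dy/dh, dz/dh, dx/dv, dy/dv,
      // dz/dv} there are 3 gradients per direction (odd), so dz/dh and dz/dv
      // are each paired with an undef high half.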
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  unsigned DMask = 0;

  // Check for 16-bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // An error flag is expected since TFC is on while dmask is 0. Force dmask to
  // be at least 1; otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Optimize _L to _LZ when _L is zero.
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
    const ConstantFP *ConstantLod;

    if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
                 m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                      Intr->Dim);

        // The starting indexes should remain in the same place.
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away when 'lod' is zero.
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // The target must support the feature and the gradients need to be
      // 16-bit too.
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16.
      const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
      packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
                                  PackEndIdx);

      if (!IsA16) {
        // Add the uncompressed address.
        for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
          int AddrReg = MI.getOperand(ArgOffset + I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch.
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers
    // contiguously without introducing moves, then using the non-sequential
    // address encoding is always preferable, since it saves VALU instructions
    // and is usually a wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after
    // register allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change the s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
4372 MI.RemoveOperand(1); 4373 4374 // Handle the easy case that requires no repack instructions. 4375 if (Ty == S32) { 4376 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4377 return true; 4378 } 4379 } 4380 4381 // Now figure out how to copy the new result register back into the old 4382 // result. 4383 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4384 4385 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4386 4387 if (ResultNumRegs == 1) { 4388 assert(!IsTFE); 4389 ResultRegs[0] = NewResultReg; 4390 } else { 4391 // We have to repack into a new vector of some kind. 4392 for (int I = 0; I != NumDataRegs; ++I) 4393 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4394 B.buildUnmerge(ResultRegs, NewResultReg); 4395 4396 // Drop the final TFE element to get the data part. The TFE result is 4397 // directly written to the right place already. 4398 if (IsTFE) 4399 ResultRegs.resize(NumDataRegs); 4400 } 4401 4402 // For an s16 scalar result, we form an s32 result with a truncate regardless 4403 // of packed vs. unpacked. 4404 if (IsD16 && !Ty.isVector()) { 4405 B.buildTrunc(DstReg, ResultRegs[0]); 4406 return true; 4407 } 4408 4409 // Avoid a build/concat_vector of 1 entry. 4410 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4411 B.buildBitcast(DstReg, ResultRegs[0]); 4412 return true; 4413 } 4414 4415 assert(Ty.isVector()); 4416 4417 if (IsD16) { 4418 // For packed D16 results with TFE enabled, all the data components are 4419 // S32. Cast back to the expected type. 4420 // 4421 // TODO: We don't really need to use load s32 elements. We would only need one 4422 // cast for the TFE result if a multiple of v2s16 was used. 4423 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4424 for (Register &Reg : ResultRegs) 4425 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4426 } else if (ST.hasUnpackedD16VMem()) { 4427 for (Register &Reg : ResultRegs) 4428 Reg = B.buildTrunc(S16, Reg).getReg(0); 4429 } 4430 } 4431 4432 auto padWithUndef = [&](LLT Ty, int NumElts) { 4433 if (NumElts == 0) 4434 return; 4435 Register Undef = B.buildUndef(Ty).getReg(0); 4436 for (int I = 0; I != NumElts; ++I) 4437 ResultRegs.push_back(Undef); 4438 }; 4439 4440 // Pad out any elements eliminated due to the dmask. 4441 LLT ResTy = MRI->getType(ResultRegs[0]); 4442 if (!ResTy.isVector()) { 4443 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4444 B.buildBuildVector(DstReg, ResultRegs); 4445 return true; 4446 } 4447 4448 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4449 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4450 4451 // Deal with the one annoying legal case. 
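  // A <3 x s16> result covers only 48 bits, so it cannot be formed directly
  // from v2s16 pieces; pad with one extra undef v2s16, concatenate to
  // <6 x s16>, and unmerge into the real result plus an unused <3 x s16> half.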
4452 const LLT V3S16 = LLT::vector(3, 16); 4453 if (Ty == V3S16) { 4454 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4455 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4456 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4457 return true; 4458 } 4459 4460 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4461 B.buildConcatVectors(DstReg, ResultRegs); 4462 return true; 4463 } 4464 4465 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4466 LegalizerHelper &Helper, MachineInstr &MI) const { 4467 MachineIRBuilder &B = Helper.MIRBuilder; 4468 GISelChangeObserver &Observer = Helper.Observer; 4469 4470 Register Dst = MI.getOperand(0).getReg(); 4471 LLT Ty = B.getMRI()->getType(Dst); 4472 unsigned Size = Ty.getSizeInBits(); 4473 MachineFunction &MF = B.getMF(); 4474 4475 Observer.changingInstr(MI); 4476 4477 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4478 Ty = getBitcastRegisterType(Ty); 4479 Helper.bitcastDst(MI, Ty, 0); 4480 Dst = MI.getOperand(0).getReg(); 4481 B.setInsertPt(B.getMBB(), MI); 4482 } 4483 4484 // FIXME: We don't really need this intermediate instruction. The intrinsic 4485 // should be fixed to have a memory operand. Since it's readnone, we're not 4486 // allowed to add one. 4487 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4488 MI.RemoveOperand(1); // Remove intrinsic ID 4489 4490 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4491 // TODO: Should this use datalayout alignment? 4492 const unsigned MemSize = (Size + 7) / 8; 4493 const Align MemAlign(4); 4494 MachineMemOperand *MMO = MF.getMachineMemOperand( 4495 MachinePointerInfo(), 4496 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4497 MachineMemOperand::MOInvariant, 4498 MemSize, MemAlign); 4499 MI.addMemOperand(MF, MMO); 4500 4501 // There are no 96-bit result scalar loads, but widening to 128-bit should 4502 // always be legal. We may need to restore this to a 96-bit result if it turns 4503 // out this needs to be converted to a vector load during RegBankSelect. 4504 if (!isPowerOf2_32(Size)) { 4505 if (Ty.isVector()) 4506 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4507 else 4508 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4509 } 4510 4511 Observer.changedInstr(MI); 4512 return true; 4513 } 4514 4515 // TODO: Move to selection 4516 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4517 MachineRegisterInfo &MRI, 4518 MachineIRBuilder &B) const { 4519 // Is non-HSA path or trap-handler disabled? 
  // If so, insert an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert the
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    MachineRegisterInfo &MRI = *B.getMRI();

    Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;

    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Is the non-HSA path taken or the trap handler disabled? If so, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert a debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
                          : Is64 ?
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4583 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4584 4585 SmallVector<Register, 12> Ops; 4586 if (Is64) { 4587 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4588 Ops.push_back(Unmerge.getReg(0)); 4589 Ops.push_back(Unmerge.getReg(1)); 4590 } else { 4591 Ops.push_back(NodePtr); 4592 } 4593 Ops.push_back(RayExtent); 4594 4595 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4596 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4597 Ops.push_back(Unmerge.getReg(0)); 4598 Ops.push_back(Unmerge.getReg(1)); 4599 Ops.push_back(Unmerge.getReg(2)); 4600 }; 4601 4602 packLanes(RayOrigin); 4603 if (IsA16) { 4604 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4605 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4606 Register R1 = MRI.createGenericVirtualRegister(S32); 4607 Register R2 = MRI.createGenericVirtualRegister(S32); 4608 Register R3 = MRI.createGenericVirtualRegister(S32); 4609 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4610 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4611 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4612 Ops.push_back(R1); 4613 Ops.push_back(R2); 4614 Ops.push_back(R3); 4615 } else { 4616 packLanes(RayDir); 4617 packLanes(RayInvDir); 4618 } 4619 4620 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4621 .addDef(DstReg) 4622 .addImm(Opcode); 4623 4624 for (Register R : Ops) { 4625 MIB.addUse(R); 4626 } 4627 4628 MIB.addUse(TDescr) 4629 .addImm(IsA16 ? 1 : 0) 4630 .cloneMemRefs(MI); 4631 4632 MI.eraseFromParent(); 4633 return true; 4634 } 4635 4636 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4637 MachineInstr &MI) const { 4638 MachineIRBuilder &B = Helper.MIRBuilder; 4639 MachineRegisterInfo &MRI = *B.getMRI(); 4640 4641 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4642 auto IntrID = MI.getIntrinsicID(); 4643 switch (IntrID) { 4644 case Intrinsic::amdgcn_if: 4645 case Intrinsic::amdgcn_else: { 4646 MachineInstr *Br = nullptr; 4647 MachineBasicBlock *UncondBrTarget = nullptr; 4648 bool Negated = false; 4649 if (MachineInstr *BrCond = 4650 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4651 const SIRegisterInfo *TRI 4652 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4653 4654 Register Def = MI.getOperand(1).getReg(); 4655 Register Use = MI.getOperand(3).getReg(); 4656 4657 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4658 4659 if (Negated) 4660 std::swap(CondBrTarget, UncondBrTarget); 4661 4662 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4663 if (IntrID == Intrinsic::amdgcn_if) { 4664 B.buildInstr(AMDGPU::SI_IF) 4665 .addDef(Def) 4666 .addUse(Use) 4667 .addMBB(UncondBrTarget); 4668 } else { 4669 B.buildInstr(AMDGPU::SI_ELSE) 4670 .addDef(Def) 4671 .addUse(Use) 4672 .addMBB(UncondBrTarget); 4673 } 4674 4675 if (Br) { 4676 Br->getOperand(0).setMBB(CondBrTarget); 4677 } else { 4678 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4679 // since we're swapping branch targets it needs to be reinserted. 
4680 // FIXME: IRTranslator should probably not do this 4681 B.buildBr(*CondBrTarget); 4682 } 4683 4684 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4685 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4686 MI.eraseFromParent(); 4687 BrCond->eraseFromParent(); 4688 return true; 4689 } 4690 4691 return false; 4692 } 4693 case Intrinsic::amdgcn_loop: { 4694 MachineInstr *Br = nullptr; 4695 MachineBasicBlock *UncondBrTarget = nullptr; 4696 bool Negated = false; 4697 if (MachineInstr *BrCond = 4698 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4699 const SIRegisterInfo *TRI 4700 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4701 4702 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4703 Register Reg = MI.getOperand(2).getReg(); 4704 4705 if (Negated) 4706 std::swap(CondBrTarget, UncondBrTarget); 4707 4708 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4709 B.buildInstr(AMDGPU::SI_LOOP) 4710 .addUse(Reg) 4711 .addMBB(UncondBrTarget); 4712 4713 if (Br) 4714 Br->getOperand(0).setMBB(CondBrTarget); 4715 else 4716 B.buildBr(*CondBrTarget); 4717 4718 MI.eraseFromParent(); 4719 BrCond->eraseFromParent(); 4720 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4721 return true; 4722 } 4723 4724 return false; 4725 } 4726 case Intrinsic::amdgcn_kernarg_segment_ptr: 4727 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4728 // This only makes sense to call in a kernel, so just lower to null. 4729 B.buildConstant(MI.getOperand(0).getReg(), 0); 4730 MI.eraseFromParent(); 4731 return true; 4732 } 4733 4734 return legalizePreloadedArgIntrin( 4735 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4736 case Intrinsic::amdgcn_implicitarg_ptr: 4737 return legalizeImplicitArgPtr(MI, MRI, B); 4738 case Intrinsic::amdgcn_workitem_id_x: 4739 return legalizePreloadedArgIntrin(MI, MRI, B, 4740 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4741 case Intrinsic::amdgcn_workitem_id_y: 4742 return legalizePreloadedArgIntrin(MI, MRI, B, 4743 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4744 case Intrinsic::amdgcn_workitem_id_z: 4745 return legalizePreloadedArgIntrin(MI, MRI, B, 4746 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4747 case Intrinsic::amdgcn_workgroup_id_x: 4748 return legalizePreloadedArgIntrin(MI, MRI, B, 4749 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4750 case Intrinsic::amdgcn_workgroup_id_y: 4751 return legalizePreloadedArgIntrin(MI, MRI, B, 4752 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4753 case Intrinsic::amdgcn_workgroup_id_z: 4754 return legalizePreloadedArgIntrin(MI, MRI, B, 4755 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4756 case Intrinsic::amdgcn_dispatch_ptr: 4757 return legalizePreloadedArgIntrin(MI, MRI, B, 4758 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4759 case Intrinsic::amdgcn_queue_ptr: 4760 return legalizePreloadedArgIntrin(MI, MRI, B, 4761 AMDGPUFunctionArgInfo::QUEUE_PTR); 4762 case Intrinsic::amdgcn_implicit_buffer_ptr: 4763 return legalizePreloadedArgIntrin( 4764 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4765 case Intrinsic::amdgcn_dispatch_id: 4766 return legalizePreloadedArgIntrin(MI, MRI, B, 4767 AMDGPUFunctionArgInfo::DISPATCH_ID); 4768 case Intrinsic::amdgcn_fdiv_fast: 4769 return legalizeFDIVFastIntrin(MI, MRI, B); 4770 case Intrinsic::amdgcn_is_shared: 4771 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4772 case Intrinsic::amdgcn_is_private: 4773 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4774 case Intrinsic::amdgcn_wavefrontsize: { 4775 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4776 MI.eraseFromParent(); 4777 return true; 4778 } 4779 case Intrinsic::amdgcn_s_buffer_load: 4780 return legalizeSBufferLoad(Helper, MI); 4781 case Intrinsic::amdgcn_raw_buffer_store: 4782 case Intrinsic::amdgcn_struct_buffer_store: 4783 return legalizeBufferStore(MI, MRI, B, false, false); 4784 case Intrinsic::amdgcn_raw_buffer_store_format: 4785 case Intrinsic::amdgcn_struct_buffer_store_format: 4786 return legalizeBufferStore(MI, MRI, B, false, true); 4787 case Intrinsic::amdgcn_raw_tbuffer_store: 4788 case Intrinsic::amdgcn_struct_tbuffer_store: 4789 return legalizeBufferStore(MI, MRI, B, true, true); 4790 case Intrinsic::amdgcn_raw_buffer_load: 4791 case Intrinsic::amdgcn_struct_buffer_load: 4792 return legalizeBufferLoad(MI, MRI, B, false, false); 4793 case Intrinsic::amdgcn_raw_buffer_load_format: 4794 case Intrinsic::amdgcn_struct_buffer_load_format: 4795 return legalizeBufferLoad(MI, MRI, B, true, false); 4796 case Intrinsic::amdgcn_raw_tbuffer_load: 4797 case Intrinsic::amdgcn_struct_tbuffer_load: 4798 return legalizeBufferLoad(MI, MRI, B, true, true); 4799 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4800 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4801 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4802 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4803 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4804 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4805 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4806 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4807 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4808 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4809 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4810 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4811 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4812 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4813 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4814 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4815 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4816 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4817 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4818 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4819 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4820 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4821 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4822 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4823 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4824 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4825 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4826 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4827 case Intrinsic::amdgcn_buffer_atomic_fadd: 4828 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4829 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4830 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4831 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4832 return legalizeBufferAtomic(MI, B, IntrID); 4833 case Intrinsic::amdgcn_atomic_inc: 4834 return legalizeAtomicIncDec(MI, B, true); 4835 case Intrinsic::amdgcn_atomic_dec: 4836 return legalizeAtomicIncDec(MI, B, false); 4837 case Intrinsic::trap: 4838 return legalizeTrapIntrinsic(MI, MRI, B); 4839 case Intrinsic::debugtrap: 4840 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4841 case Intrinsic::amdgcn_rsq_clamp: 4842 return legalizeRsqClampIntrinsic(MI, MRI, B); 4843 case Intrinsic::amdgcn_ds_fadd: 4844 case Intrinsic::amdgcn_ds_fmin: 4845 case Intrinsic::amdgcn_ds_fmax: 4846 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4847 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4848 return legalizeBVHIntrinsic(MI, B); 4849 default: { 4850 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4851 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4852 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4853 return true; 4854 } 4855 } 4856 4857 return true; 4858 } 4859