//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
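/// For example, a 96-bit load that is 128-bit aligned may be widened to a
/// 128-bit load on subtargets without 96-bit memory operations.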
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
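  // The custom lowering (legalizeFMad) keeps G_FMAD only when denormals are
  // flushed for the type, since the mad/mac instructions do not handle
  // denormals; otherwise it is expanded.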
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
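      // The actual widening is performed by the custom lowering for G_LOAD
      // (see legalizeLoad).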
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
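  // (Hardware reads the low 4 bits of the amount for 16-bit shifts, 5 bits for
  // 32-bit shifts, and 6 bits for 64-bit shifts.)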
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
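    // For G_MERGE_VALUES the wide type is the destination (type index 0); for
    // G_UNMERGE_VALUES it is the source (type index 1).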

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1699 case TargetOpcode::G_BUILD_VECTOR: 1700 return legalizeBuildVector(MI, MRI, B); 1701 default: 1702 return false; 1703 } 1704 1705 llvm_unreachable("expected switch to return"); 1706 } 1707 1708 Register AMDGPULegalizerInfo::getSegmentAperture( 1709 unsigned AS, 1710 MachineRegisterInfo &MRI, 1711 MachineIRBuilder &B) const { 1712 MachineFunction &MF = B.getMF(); 1713 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1714 const LLT S32 = LLT::scalar(32); 1715 1716 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1717 1718 if (ST.hasApertureRegs()) { 1719 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1720 // getreg. 1721 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1722 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1723 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1724 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1725 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1726 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1727 unsigned Encoding = 1728 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1729 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1730 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1731 1732 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1733 1734 B.buildInstr(AMDGPU::S_GETREG_B32) 1735 .addDef(GetReg) 1736 .addImm(Encoding); 1737 MRI.setType(GetReg, S32); 1738 1739 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1740 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1741 } 1742 1743 Register QueuePtr = MRI.createGenericVirtualRegister( 1744 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1745 1746 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1747 return Register(); 1748 1749 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1750 // private_segment_aperture_base_hi. 1751 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1752 1753 // TODO: can we be smarter about machine pointer info? 1754 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1755 MachineMemOperand *MMO = MF.getMachineMemOperand( 1756 PtrInfo, 1757 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1758 MachineMemOperand::MOInvariant, 1759 4, commonAlignment(Align(64), StructOffset)); 1760 1761 Register LoadAddr; 1762 1763 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1764 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1765 } 1766 1767 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1768 MachineInstr &MI, MachineRegisterInfo &MRI, 1769 MachineIRBuilder &B) const { 1770 MachineFunction &MF = B.getMF(); 1771 1772 const LLT S32 = LLT::scalar(32); 1773 Register Dst = MI.getOperand(0).getReg(); 1774 Register Src = MI.getOperand(1).getReg(); 1775 1776 LLT DstTy = MRI.getType(Dst); 1777 LLT SrcTy = MRI.getType(Src); 1778 unsigned DestAS = DstTy.getAddressSpace(); 1779 unsigned SrcAS = SrcTy.getAddressSpace(); 1780 1781 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1782 // vector element. 1783 assert(!DstTy.isVector()); 1784 1785 const AMDGPUTargetMachine &TM 1786 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1787 1788 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1789 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1790 return true; 1791 } 1792 1793 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1794 // Truncate. 
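  // A cast into the 32-bit constant address space just keeps the low 32 bits
  // of the source pointer. Illustrative MIR for the common p4 -> p6 case
  // (virtual register names invented for the example):
  //   %dst:_(p6) = G_EXTRACT %src:_(p4), 0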
1795 B.buildExtract(Dst, Src, 0); 1796 MI.eraseFromParent(); 1797 return true; 1798 } 1799 1800 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1801 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1802 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1803 1804 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1805 // another. Merge operands are required to be the same type, but creating an 1806 // extra ptrtoint would be kind of pointless. 1807 auto HighAddr = B.buildConstant( 1808 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1809 B.buildMerge(Dst, {Src, HighAddr}); 1810 MI.eraseFromParent(); 1811 return true; 1812 } 1813 1814 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1815 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1816 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1817 unsigned NullVal = TM.getNullPointerValue(DestAS); 1818 1819 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1820 auto FlatNull = B.buildConstant(SrcTy, 0); 1821 1822 // Extract low 32-bits of the pointer. 1823 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1824 1825 auto CmpRes = 1826 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1827 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1828 1829 MI.eraseFromParent(); 1830 return true; 1831 } 1832 1833 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1834 return false; 1835 1836 if (!ST.hasFlatAddressSpace()) 1837 return false; 1838 1839 auto SegmentNull = 1840 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1841 auto FlatNull = 1842 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1843 1844 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1845 if (!ApertureReg.isValid()) 1846 return false; 1847 1848 auto CmpRes = 1849 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1850 1851 // Coerce the type of the low half of the result so we can use merge_values. 1852 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1853 1854 // TODO: Should we allow mismatched types but matching sizes in merges to 1855 // avoid the ptrtoint? 1856 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1857 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1858 1859 MI.eraseFromParent(); 1860 return true; 1861 } 1862 1863 bool AMDGPULegalizerInfo::legalizeFrint( 1864 MachineInstr &MI, MachineRegisterInfo &MRI, 1865 MachineIRBuilder &B) const { 1866 Register Src = MI.getOperand(1).getReg(); 1867 LLT Ty = MRI.getType(Src); 1868 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1869 1870 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1871 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1872 1873 auto C1 = B.buildFConstant(Ty, C1Val); 1874 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1875 1876 // TODO: Should this propagate fast-math-flags? 
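  // The add/sub of copysign(2^52, x) below performs the rounding in the FP
  // hardware: at magnitude 2^52 the spacing of doubles is 1.0, so the add
  // rounds the fractional part away and the exact subtract restores the
  // magnitude. Worked example, assuming the default round-to-nearest-even
  // mode: x = 2.5 gives 2.5 + 2^52 -> 2^52 + 2 (tie broken to even), and
  // subtracting 2^52 yields 2.0. Inputs with |x| > C2Val (the largest double
  // below 2^52) are already integral, so the final select returns them
  // unchanged.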
1877 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1878 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1879 1880 auto C2 = B.buildFConstant(Ty, C2Val); 1881 auto Fabs = B.buildFAbs(Ty, Src); 1882 1883 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1884 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1885 MI.eraseFromParent(); 1886 return true; 1887 } 1888 1889 bool AMDGPULegalizerInfo::legalizeFceil( 1890 MachineInstr &MI, MachineRegisterInfo &MRI, 1891 MachineIRBuilder &B) const { 1892 1893 const LLT S1 = LLT::scalar(1); 1894 const LLT S64 = LLT::scalar(64); 1895 1896 Register Src = MI.getOperand(1).getReg(); 1897 assert(MRI.getType(Src) == S64); 1898 1899 // result = trunc(src) 1900 // if (src > 0.0 && src != result) 1901 // result += 1.0 1902 1903 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1904 1905 const auto Zero = B.buildFConstant(S64, 0.0); 1906 const auto One = B.buildFConstant(S64, 1.0); 1907 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1908 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1909 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1910 auto Add = B.buildSelect(S64, And, One, Zero); 1911 1912 // TODO: Should this propagate fast-math-flags? 1913 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1914 return true; 1915 } 1916 1917 bool AMDGPULegalizerInfo::legalizeFrem( 1918 MachineInstr &MI, MachineRegisterInfo &MRI, 1919 MachineIRBuilder &B) const { 1920 Register DstReg = MI.getOperand(0).getReg(); 1921 Register Src0Reg = MI.getOperand(1).getReg(); 1922 Register Src1Reg = MI.getOperand(2).getReg(); 1923 auto Flags = MI.getFlags(); 1924 LLT Ty = MRI.getType(DstReg); 1925 1926 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1927 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1928 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1929 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1930 MI.eraseFromParent(); 1931 return true; 1932 } 1933 1934 static MachineInstrBuilder extractF64Exponent(Register Hi, 1935 MachineIRBuilder &B) { 1936 const unsigned FractBits = 52; 1937 const unsigned ExpBits = 11; 1938 LLT S32 = LLT::scalar(32); 1939 1940 auto Const0 = B.buildConstant(S32, FractBits - 32); 1941 auto Const1 = B.buildConstant(S32, ExpBits); 1942 1943 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1944 .addUse(Hi) 1945 .addUse(Const0.getReg(0)) 1946 .addUse(Const1.getReg(0)); 1947 1948 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, 1953 MachineIRBuilder &B) const { 1954 const LLT S1 = LLT::scalar(1); 1955 const LLT S32 = LLT::scalar(32); 1956 const LLT S64 = LLT::scalar(64); 1957 1958 Register Src = MI.getOperand(1).getReg(); 1959 assert(MRI.getType(Src) == S64); 1960 1961 // TODO: Should this use extract since the low half is unused? 1962 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1963 Register Hi = Unmerge.getReg(1); 1964 1965 // Extract the upper half, since this is where we will find the sign and 1966 // exponent. 1967 auto Exp = extractF64Exponent(Hi, B); 1968 1969 const unsigned FractBits = 52; 1970 1971 // Extract the sign bit. 1972 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1973 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1974 1975 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1976 1977 const auto Zero32 = B.buildConstant(S32, 0); 1978 1979 // Extend back to 64-bits. 
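  // (The merge below widens the extracted sign bit to an s64 holding +0.0 or
  // -0.0; it becomes the result when the unbiased exponent is negative, i.e.
  // |x| < 1.0, so trunc(x) is a signed zero.)
  // For exponents E in [0, 51] the mask computed next clears the mantissa
  // bits that encode the fractional part: ashr((1 << 52) - 1, E) covers the
  // low 52 - E fraction bits, and its bitwise NOT keeps the sign, exponent
  // and top E fraction bits. Illustrative: trunc(5.75) has E = 2, and keeping
  // two fraction bits of 1.0111b * 2^2 gives 1.01b * 2^2 = 5.0. Exponents
  // above 51 mean the value is already integral, so the original source is
  // selected instead.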
1980 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1981 1982 auto Shr = B.buildAShr(S64, FractMask, Exp); 1983 auto Not = B.buildNot(S64, Shr); 1984 auto Tmp0 = B.buildAnd(S64, Src, Not); 1985 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1986 1987 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1988 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1989 1990 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1991 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1992 MI.eraseFromParent(); 1993 return true; 1994 } 1995 1996 bool AMDGPULegalizerInfo::legalizeITOFP( 1997 MachineInstr &MI, MachineRegisterInfo &MRI, 1998 MachineIRBuilder &B, bool Signed) const { 1999 2000 Register Dst = MI.getOperand(0).getReg(); 2001 Register Src = MI.getOperand(1).getReg(); 2002 2003 const LLT S64 = LLT::scalar(64); 2004 const LLT S32 = LLT::scalar(32); 2005 2006 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2007 2008 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2009 2010 auto CvtHi = Signed ? 2011 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2012 B.buildUITOFP(S64, Unmerge.getReg(1)); 2013 2014 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2015 2016 auto ThirtyTwo = B.buildConstant(S32, 32); 2017 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2018 .addUse(CvtHi.getReg(0)) 2019 .addUse(ThirtyTwo.getReg(0)); 2020 2021 // TODO: Should this propagate fast-math-flags? 2022 B.buildFAdd(Dst, LdExp, CvtLo); 2023 MI.eraseFromParent(); 2024 return true; 2025 } 2026 2027 // TODO: Copied from DAG implementation. Verify logic and document how this 2028 // actually works. 2029 bool AMDGPULegalizerInfo::legalizeFPTOI( 2030 MachineInstr &MI, MachineRegisterInfo &MRI, 2031 MachineIRBuilder &B, bool Signed) const { 2032 2033 Register Dst = MI.getOperand(0).getReg(); 2034 Register Src = MI.getOperand(1).getReg(); 2035 2036 const LLT S64 = LLT::scalar(64); 2037 const LLT S32 = LLT::scalar(32); 2038 2039 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2040 2041 unsigned Flags = MI.getFlags(); 2042 2043 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2044 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2045 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2046 2047 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2048 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2049 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2050 2051 auto Hi = Signed ? 
2052 B.buildFPTOSI(S32, FloorMul) : 2053 B.buildFPTOUI(S32, FloorMul); 2054 auto Lo = B.buildFPTOUI(S32, Fma); 2055 2056 B.buildMerge(Dst, { Lo, Hi }); 2057 MI.eraseFromParent(); 2058 2059 return true; 2060 } 2061 2062 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2063 MachineInstr &MI) const { 2064 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2065 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2066 2067 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2068 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2069 2070 // With ieee_mode disabled, the instructions have the correct behavior 2071 // already for G_FMINNUM/G_FMAXNUM 2072 if (!MFI->getMode().IEEE) 2073 return !IsIEEEOp; 2074 2075 if (IsIEEEOp) 2076 return true; 2077 2078 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2079 } 2080 2081 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2082 MachineInstr &MI, MachineRegisterInfo &MRI, 2083 MachineIRBuilder &B) const { 2084 // TODO: Should move some of this into LegalizerHelper. 2085 2086 // TODO: Promote dynamic indexing of s16 to s32 2087 2088 // FIXME: Artifact combiner probably should have replaced the truncated 2089 // constant before this, so we shouldn't need 2090 // getConstantVRegValWithLookThrough. 2091 Optional<ValueAndVReg> MaybeIdxVal = 2092 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2093 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2094 return true; 2095 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2096 2097 Register Dst = MI.getOperand(0).getReg(); 2098 Register Vec = MI.getOperand(1).getReg(); 2099 2100 LLT VecTy = MRI.getType(Vec); 2101 LLT EltTy = VecTy.getElementType(); 2102 assert(EltTy == MRI.getType(Dst)); 2103 2104 if (IdxVal < VecTy.getNumElements()) 2105 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2106 else 2107 B.buildUndef(Dst); 2108 2109 MI.eraseFromParent(); 2110 return true; 2111 } 2112 2113 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2114 MachineInstr &MI, MachineRegisterInfo &MRI, 2115 MachineIRBuilder &B) const { 2116 // TODO: Should move some of this into LegalizerHelper. 2117 2118 // TODO: Promote dynamic indexing of s16 to s32 2119 2120 // FIXME: Artifact combiner probably should have replaced the truncated 2121 // constant before this, so we shouldn't need 2122 // getConstantVRegValWithLookThrough. 2123 Optional<ValueAndVReg> MaybeIdxVal = 2124 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2125 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2126 return true; 2127 2128 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2129 Register Dst = MI.getOperand(0).getReg(); 2130 Register Vec = MI.getOperand(1).getReg(); 2131 Register Ins = MI.getOperand(2).getReg(); 2132 2133 LLT VecTy = MRI.getType(Vec); 2134 LLT EltTy = VecTy.getElementType(); 2135 assert(EltTy == MRI.getType(Ins)); 2136 2137 if (IdxVal < VecTy.getNumElements()) 2138 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2139 else 2140 B.buildUndef(Dst); 2141 2142 MI.eraseFromParent(); 2143 return true; 2144 } 2145 2146 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2147 MachineInstr &MI, MachineRegisterInfo &MRI, 2148 MachineIRBuilder &B) const { 2149 const LLT V2S16 = LLT::vector(2, 16); 2150 2151 Register Dst = MI.getOperand(0).getReg(); 2152 Register Src0 = MI.getOperand(1).getReg(); 2153 LLT DstTy = MRI.getType(Dst); 2154 LLT SrcTy = MRI.getType(Src0); 2155 2156 if (SrcTy == V2S16 && DstTy == V2S16 && 2157 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2158 return true; 2159 2160 MachineIRBuilder HelperBuilder(MI); 2161 GISelObserverWrapper DummyObserver; 2162 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2163 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2164 } 2165 2166 bool AMDGPULegalizerInfo::legalizeSinCos( 2167 MachineInstr &MI, MachineRegisterInfo &MRI, 2168 MachineIRBuilder &B) const { 2169 2170 Register DstReg = MI.getOperand(0).getReg(); 2171 Register SrcReg = MI.getOperand(1).getReg(); 2172 LLT Ty = MRI.getType(DstReg); 2173 unsigned Flags = MI.getFlags(); 2174 2175 Register TrigVal; 2176 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2177 if (ST.hasTrigReducedRange()) { 2178 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2179 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2180 .addUse(MulVal.getReg(0)) 2181 .setMIFlags(Flags).getReg(0); 2182 } else 2183 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2184 2185 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2186 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2187 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2188 .addUse(TrigVal) 2189 .setMIFlags(Flags); 2190 MI.eraseFromParent(); 2191 return true; 2192 } 2193 2194 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2195 MachineIRBuilder &B, 2196 const GlobalValue *GV, 2197 int64_t Offset, 2198 unsigned GAFlags) const { 2199 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2200 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2201 // to the following code sequence: 2202 // 2203 // For constant address space: 2204 // s_getpc_b64 s[0:1] 2205 // s_add_u32 s0, s0, $symbol 2206 // s_addc_u32 s1, s1, 0 2207 // 2208 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2209 // a fixup or relocation is emitted to replace $symbol with a literal 2210 // constant, which is a pc-relative offset from the encoding of the $symbol 2211 // operand to the global variable. 
2212 // 2213 // For global address space: 2214 // s_getpc_b64 s[0:1] 2215 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2216 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2217 // 2218 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2219 // fixups or relocations are emitted to replace $symbol@*@lo and 2220 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2221 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2222 // operand to the global variable. 2223 // 2224 // What we want here is an offset from the value returned by s_getpc 2225 // (which is the address of the s_add_u32 instruction) to the global 2226 // variable, but since the encoding of $symbol starts 4 bytes after the start 2227 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2228 // small. This requires us to add 4 to the global variable offset in order to 2229 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2230 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2231 // instruction. 2232 2233 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2234 2235 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2236 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2237 2238 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2239 .addDef(PCReg); 2240 2241 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2242 if (GAFlags == SIInstrInfo::MO_NONE) 2243 MIB.addImm(0); 2244 else 2245 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2246 2247 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2248 2249 if (PtrTy.getSizeInBits() == 32) 2250 B.buildExtract(DstReg, PCReg, 0); 2251 return true; 2252 } 2253 2254 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2255 MachineInstr &MI, MachineRegisterInfo &MRI, 2256 MachineIRBuilder &B) const { 2257 Register DstReg = MI.getOperand(0).getReg(); 2258 LLT Ty = MRI.getType(DstReg); 2259 unsigned AS = Ty.getAddressSpace(); 2260 2261 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2262 MachineFunction &MF = B.getMF(); 2263 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2264 2265 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2266 if (!MFI->isModuleEntryFunction()) { 2267 const Function &Fn = MF.getFunction(); 2268 DiagnosticInfoUnsupported BadLDSDecl( 2269 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2270 DS_Warning); 2271 Fn.getContext().diagnose(BadLDSDecl); 2272 2273 // We currently don't have a way to correctly allocate LDS objects that 2274 // aren't directly associated with a kernel. We do force inlining of 2275 // functions that use local objects. However, if these dead functions are 2276 // not eliminated, we don't want a compile time error. Just emit a warning 2277 // and a trap, since there should be no callable path here. 2278 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2279 B.buildUndef(DstReg); 2280 MI.eraseFromParent(); 2281 return true; 2282 } 2283 2284 // TODO: We could emit code to handle the initialization somewhere. 
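  // LDS globals have no address until the kernel's LDS block is laid out, so
  // the cases below rewrite the G_GLOBAL_VALUE into either a constant byte
  // offset within that block (from allocateLDSGlobal) or, for the zero-sized
  // dynamic-LDS pattern, the total static LDS size (groupstaticsize), since
  // dynamic LDS is placed after all static allocations. Illustrative outcome
  // only (real offsets depend on layout and alignment): two 4-byte LDS
  // globals would typically get offsets 0 and 4.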
2285     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2286       const SITargetLowering *TLI = ST.getTargetLowering();
2287       if (!TLI->shouldUseLDSConstAddress(GV)) {
2288         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2289         return true; // Leave in place.
2290       }
2291
2292       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2293         Type *Ty = GV->getValueType();
2294         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2295         // zero-sized type in other languages to declare dynamic shared
2296         // memory whose size is not known at compile time. It is
2297         // allocated by the runtime and placed directly after the statically
2298         // allocated ones. All such arrays share the same offset.
2299         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2300           // Adjust alignment for that dynamic shared memory array.
2301           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2302           LLT S32 = LLT::scalar(32);
2303           auto Sz =
2304               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2305           B.buildIntToPtr(DstReg, Sz);
2306           MI.eraseFromParent();
2307           return true;
2308         }
2309       }
2310
2311       B.buildConstant(
2312           DstReg,
2313           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2314       MI.eraseFromParent();
2315       return true;
2316     }
2317
2318     const Function &Fn = MF.getFunction();
2319     DiagnosticInfoUnsupported BadInit(
2320         Fn, "unsupported initializer for address space", MI.getDebugLoc());
2321     Fn.getContext().diagnose(BadInit);
2322     return true;
2323   }
2324
2325   const SITargetLowering *TLI = ST.getTargetLowering();
2326
2327   if (TLI->shouldEmitFixup(GV)) {
2328     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2329     MI.eraseFromParent();
2330     return true;
2331   }
2332
2333   if (TLI->shouldEmitPCReloc(GV)) {
2334     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2335     MI.eraseFromParent();
2336     return true;
2337   }
2338
2339   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2340   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2341
2342   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2343       MachinePointerInfo::getGOT(MF),
2344       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2345           MachineMemOperand::MOInvariant,
2346       8 /*Size*/, Align(8));
2347
2348   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2349
2350   if (Ty.getSizeInBits() == 32) {
2351     // Truncate if this is a 32-bit constant address.
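  // The GOT slot always holds a 64-bit constant-address pointer, so when the
  // global has a 32-bit pointer type the loaded value is narrowed by
  // extracting its low 32 bits.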
2352 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2353 B.buildExtract(DstReg, Load, 0); 2354 } else 2355 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2356 2357 MI.eraseFromParent(); 2358 return true; 2359 } 2360 2361 static LLT widenToNextPowerOf2(LLT Ty) { 2362 if (Ty.isVector()) 2363 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2364 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2365 } 2366 2367 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2368 MachineInstr &MI) const { 2369 MachineIRBuilder &B = Helper.MIRBuilder; 2370 MachineRegisterInfo &MRI = *B.getMRI(); 2371 GISelChangeObserver &Observer = Helper.Observer; 2372 2373 Register PtrReg = MI.getOperand(1).getReg(); 2374 LLT PtrTy = MRI.getType(PtrReg); 2375 unsigned AddrSpace = PtrTy.getAddressSpace(); 2376 2377 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2378 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2379 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2380 Observer.changingInstr(MI); 2381 MI.getOperand(1).setReg(Cast.getReg(0)); 2382 Observer.changedInstr(MI); 2383 return true; 2384 } 2385 2386 Register ValReg = MI.getOperand(0).getReg(); 2387 LLT ValTy = MRI.getType(ValReg); 2388 2389 MachineMemOperand *MMO = *MI.memoperands_begin(); 2390 const unsigned ValSize = ValTy.getSizeInBits(); 2391 const unsigned MemSize = 8 * MMO->getSize(); 2392 const Align MemAlign = MMO->getAlign(); 2393 const unsigned AlignInBits = 8 * MemAlign.value(); 2394 2395 // Widen non-power-of-2 loads to the alignment if needed 2396 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2397 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2398 2399 // This was already the correct extending load result type, so just adjust 2400 // the memory type. 2401 if (WideMemSize == ValSize) { 2402 MachineFunction &MF = B.getMF(); 2403 2404 MachineMemOperand *WideMMO = 2405 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2406 Observer.changingInstr(MI); 2407 MI.setMemRefs(MF, {WideMMO}); 2408 Observer.changedInstr(MI); 2409 return true; 2410 } 2411 2412 // Don't bother handling edge case that should probably never be produced. 2413 if (ValSize > WideMemSize) 2414 return false; 2415 2416 LLT WideTy = widenToNextPowerOf2(ValTy); 2417 2418 Register WideLoad; 2419 if (!WideTy.isVector()) { 2420 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2421 B.buildTrunc(ValReg, WideLoad).getReg(0); 2422 } else { 2423 // Extract the subvector. 2424 2425 if (isRegisterType(ValTy)) { 2426 // If this a case where G_EXTRACT is legal, use it. 2427 // (e.g. <3 x s32> -> <4 x s32>) 2428 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2429 B.buildExtract(ValReg, WideLoad, 0); 2430 } else { 2431 // For cases where the widened type isn't a nice register value, unmerge 2432 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2433 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2434 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2435 B.setInsertPt(B.getMBB(), MI.getIterator()); 2436 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2437 } 2438 } 2439 2440 MI.eraseFromParent(); 2441 return true; 2442 } 2443 2444 return false; 2445 } 2446 2447 bool AMDGPULegalizerInfo::legalizeFMad( 2448 MachineInstr &MI, MachineRegisterInfo &MRI, 2449 MachineIRBuilder &B) const { 2450 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2451 assert(Ty.isScalar()); 2452 2453 MachineFunction &MF = B.getMF(); 2454 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2455 2456 // TODO: Always legal with future ftz flag. 2457 // FIXME: Do we need just output? 2458 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2459 return true; 2460 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2461 return true; 2462 2463 MachineIRBuilder HelperBuilder(MI); 2464 GISelObserverWrapper DummyObserver; 2465 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2466 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2467 } 2468 2469 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2470 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2471 Register DstReg = MI.getOperand(0).getReg(); 2472 Register PtrReg = MI.getOperand(1).getReg(); 2473 Register CmpVal = MI.getOperand(2).getReg(); 2474 Register NewVal = MI.getOperand(3).getReg(); 2475 2476 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2477 "this should not have been custom lowered"); 2478 2479 LLT ValTy = MRI.getType(CmpVal); 2480 LLT VecTy = LLT::vector(2, ValTy); 2481 2482 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2483 2484 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2485 .addDef(DstReg) 2486 .addUse(PtrReg) 2487 .addUse(PackedVal) 2488 .setMemRefs(MI.memoperands()); 2489 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 2494 bool AMDGPULegalizerInfo::legalizeFlog( 2495 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2496 Register Dst = MI.getOperand(0).getReg(); 2497 Register Src = MI.getOperand(1).getReg(); 2498 LLT Ty = B.getMRI()->getType(Dst); 2499 unsigned Flags = MI.getFlags(); 2500 2501 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2502 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2503 2504 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2505 MI.eraseFromParent(); 2506 return true; 2507 } 2508 2509 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2510 MachineIRBuilder &B) const { 2511 Register Dst = MI.getOperand(0).getReg(); 2512 Register Src = MI.getOperand(1).getReg(); 2513 unsigned Flags = MI.getFlags(); 2514 LLT Ty = B.getMRI()->getType(Dst); 2515 2516 auto K = B.buildFConstant(Ty, numbers::log2e); 2517 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2518 B.buildFExp2(Dst, Mul, Flags); 2519 MI.eraseFromParent(); 2520 return true; 2521 } 2522 2523 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2524 MachineIRBuilder &B) const { 2525 Register Dst = MI.getOperand(0).getReg(); 2526 Register Src0 = MI.getOperand(1).getReg(); 2527 Register Src1 = MI.getOperand(2).getReg(); 2528 unsigned Flags = MI.getFlags(); 2529 LLT Ty = B.getMRI()->getType(Dst); 2530 const LLT S16 = LLT::scalar(16); 2531 const LLT S32 = LLT::scalar(32); 2532 2533 if (Ty == S32) { 2534 auto Log = B.buildFLog2(S32, Src0, Flags); 2535 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2536 .addUse(Log.getReg(0)) 2537 .addUse(Src1) 2538 .setMIFlags(Flags); 2539 B.buildFExp2(Dst, Mul, Flags); 2540 } else if (Ty == S16) { 2541 // There's no f16 fmul_legacy, so we need to convert for it. 2542 auto Log = B.buildFLog2(S16, Src0, Flags); 2543 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2544 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2545 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2546 .addUse(Ext0.getReg(0)) 2547 .addUse(Ext1.getReg(0)) 2548 .setMIFlags(Flags); 2549 2550 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2551 } else 2552 return false; 2553 2554 MI.eraseFromParent(); 2555 return true; 2556 } 2557 2558 // Find a source register, ignoring any possible source modifiers. 2559 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2560 Register ModSrc = OrigSrc; 2561 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2562 ModSrc = SrcFNeg->getOperand(1).getReg(); 2563 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2564 ModSrc = SrcFAbs->getOperand(1).getReg(); 2565 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2566 ModSrc = SrcFAbs->getOperand(1).getReg(); 2567 return ModSrc; 2568 } 2569 2570 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2571 MachineRegisterInfo &MRI, 2572 MachineIRBuilder &B) const { 2573 2574 const LLT S1 = LLT::scalar(1); 2575 const LLT S64 = LLT::scalar(64); 2576 Register Dst = MI.getOperand(0).getReg(); 2577 Register OrigSrc = MI.getOperand(1).getReg(); 2578 unsigned Flags = MI.getFlags(); 2579 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2580 "this should not have been custom lowered"); 2581 2582 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2583 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2584 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2585 // V_FRACT bug is: 2586 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2587 // 2588 // Convert floor(x) to (x - fract(x)) 2589 2590 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2591 .addUse(OrigSrc) 2592 .setMIFlags(Flags); 2593 2594 // Give source modifier matching some assistance before obscuring a foldable 2595 // pattern. 2596 2597 // TODO: We can avoid the neg on the fract? The input sign to fract 2598 // shouldn't matter? 2599 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2600 2601 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2602 2603 Register Min = MRI.createGenericVirtualRegister(S64); 2604 2605 // We don't need to concern ourselves with the snan handling difference, so 2606 // use the one which will directly select. 
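  // Worked example of the floor(x) = x - fract(x) rewrite (illustrative):
  // for x = -1.25, fract(x) = x - floor(x) = 0.75, and x - fract(x) =
  // -1.25 - 0.75 = -2.0 = floor(-1.25). The min against 0x3fefffffffffffff
  // (the largest double below 1.0) only matters for the V_FRACT bug
  // workaround described above; for inputs where V_FRACT already returns a
  // value below 1.0 it is a no-op.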
2607 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2608 if (MFI->getMode().IEEE) 2609 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2610 else 2611 B.buildFMinNum(Min, Fract, Const, Flags); 2612 2613 Register CorrectedFract = Min; 2614 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2615 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2616 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2617 } 2618 2619 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2620 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2621 2622 MI.eraseFromParent(); 2623 return true; 2624 } 2625 2626 // Turn an illegal packed v2s16 build vector into bit operations. 2627 // TODO: This should probably be a bitcast action in LegalizerHelper. 2628 bool AMDGPULegalizerInfo::legalizeBuildVector( 2629 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2630 Register Dst = MI.getOperand(0).getReg(); 2631 const LLT S32 = LLT::scalar(32); 2632 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2633 2634 Register Src0 = MI.getOperand(1).getReg(); 2635 Register Src1 = MI.getOperand(2).getReg(); 2636 assert(MRI.getType(Src0) == LLT::scalar(16)); 2637 2638 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2639 B.buildBitcast(Dst, Merge); 2640 2641 MI.eraseFromParent(); 2642 return true; 2643 } 2644 2645 // Check that this is a G_XOR x, -1 2646 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2647 if (MI.getOpcode() != TargetOpcode::G_XOR) 2648 return false; 2649 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2650 return ConstVal && *ConstVal == -1; 2651 } 2652 2653 // Return the use branch instruction, otherwise null if the usage is invalid. 2654 static MachineInstr * 2655 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2656 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2657 Register CondDef = MI.getOperand(0).getReg(); 2658 if (!MRI.hasOneNonDBGUse(CondDef)) 2659 return nullptr; 2660 2661 MachineBasicBlock *Parent = MI.getParent(); 2662 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2663 2664 if (isNot(MRI, *UseMI)) { 2665 Register NegatedCond = UseMI->getOperand(0).getReg(); 2666 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2667 return nullptr; 2668 2669 // We're deleting the def of this value, so we need to remove it. 2670 UseMI->eraseFromParent(); 2671 2672 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2673 Negated = true; 2674 } 2675 2676 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2677 return nullptr; 2678 2679 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2680 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2681 if (Next == Parent->end()) { 2682 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2683 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2684 return nullptr; 2685 UncondBrTarget = &*NextMBB; 2686 } else { 2687 if (Next->getOpcode() != AMDGPU::G_BR) 2688 return nullptr; 2689 Br = &*Next; 2690 UncondBrTarget = Br->getOperand(0).getMBB(); 2691 } 2692 2693 return UseMI; 2694 } 2695 2696 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2697 const ArgDescriptor *Arg, 2698 const TargetRegisterClass *ArgRC, 2699 LLT ArgTy) const { 2700 MCRegister SrcReg = Arg->getRegister(); 2701 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2702 assert(DstReg.isVirtual() && "Virtual register expected"); 2703 2704 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2705 ArgTy); 2706 if (Arg->isMasked()) { 2707 // TODO: Should we try to emit this once in the entry block? 2708 const LLT S32 = LLT::scalar(32); 2709 const unsigned Mask = Arg->getMask(); 2710 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2711 2712 Register AndMaskSrc = LiveIn; 2713 2714 if (Shift != 0) { 2715 auto ShiftAmt = B.buildConstant(S32, Shift); 2716 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2717 } 2718 2719 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2720 } else { 2721 B.buildCopy(DstReg, LiveIn); 2722 } 2723 2724 return true; 2725 } 2726 2727 bool AMDGPULegalizerInfo::loadInputValue( 2728 Register DstReg, MachineIRBuilder &B, 2729 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2730 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2731 const ArgDescriptor *Arg; 2732 const TargetRegisterClass *ArgRC; 2733 LLT ArgTy; 2734 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2735 2736 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2737 return false; // TODO: Handle these 2738 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2739 } 2740 2741 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2742 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2743 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2744 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2745 return false; 2746 2747 MI.eraseFromParent(); 2748 return true; 2749 } 2750 2751 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2752 MachineRegisterInfo &MRI, 2753 MachineIRBuilder &B) const { 2754 Register Dst = MI.getOperand(0).getReg(); 2755 LLT DstTy = MRI.getType(Dst); 2756 LLT S16 = LLT::scalar(16); 2757 LLT S32 = LLT::scalar(32); 2758 LLT S64 = LLT::scalar(64); 2759 2760 if (DstTy == S16) 2761 return legalizeFDIV16(MI, MRI, B); 2762 if (DstTy == S32) 2763 return legalizeFDIV32(MI, MRI, B); 2764 if (DstTy == S64) 2765 return legalizeFDIV64(MI, MRI, B); 2766 2767 return false; 2768 } 2769 2770 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2771 Register DstReg, 2772 Register X, 2773 Register Y, 2774 bool IsDiv) const { 2775 const LLT S1 = LLT::scalar(1); 2776 const LLT S32 = LLT::scalar(32); 2777 2778 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2779 // algorithm used here. 2780 2781 // Initial estimate of inv(y). 2782 auto FloatY = B.buildUITOFP(S32, Y); 2783 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2784 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2785 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2786 auto Z = B.buildFPTOUI(S32, ScaledY); 2787 2788 // One round of UNR. 
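  // "UNR" = unsigned Newton-Raphson. Z approximates 2^32 / Y in 32-bit fixed
  // point; since Y * Z = 2^32 - e for some small error e, the exact
  // reciprocal is Z + e / Y, and mulhi(Z, -Y * Z) = floor(Z * e / 2^32) ~=
  // e / Y supplies that correction. A single step suffices because the
  // V_RCP_IFLAG_F32-based estimate above is already accurate to roughly
  // single precision.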
2789 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2790 auto NegYZ = B.buildMul(S32, NegY, Z); 2791 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2792 2793 // Quotient/remainder estimate. 2794 auto Q = B.buildUMulH(S32, X, Z); 2795 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2796 2797 // First quotient/remainder refinement. 2798 auto One = B.buildConstant(S32, 1); 2799 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2800 if (IsDiv) 2801 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2802 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2803 2804 // Second quotient/remainder refinement. 2805 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2806 if (IsDiv) 2807 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2808 else 2809 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2810 } 2811 2812 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2813 MachineRegisterInfo &MRI, 2814 MachineIRBuilder &B) const { 2815 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2816 Register DstReg = MI.getOperand(0).getReg(); 2817 Register Num = MI.getOperand(1).getReg(); 2818 Register Den = MI.getOperand(2).getReg(); 2819 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2820 MI.eraseFromParent(); 2821 return true; 2822 } 2823 2824 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2825 // 2826 // Return lo, hi of result 2827 // 2828 // %cvt.lo = G_UITOFP Val.lo 2829 // %cvt.hi = G_UITOFP Val.hi 2830 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2831 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2832 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2833 // %mul2 = G_FMUL %mul1, 2**(-32) 2834 // %trunc = G_INTRINSIC_TRUNC %mul2 2835 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2836 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2837 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2838 Register Val) { 2839 const LLT S32 = LLT::scalar(32); 2840 auto Unmerge = B.buildUnmerge(S32, Val); 2841 2842 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2843 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2844 2845 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2846 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2847 2848 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2849 auto Mul1 = 2850 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2851 2852 // 2**(-32) 2853 auto Mul2 = 2854 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2855 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2856 2857 // -(2**32) 2858 auto Mad2 = B.buildFMAD(S32, Trunc, 2859 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2860 2861 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2862 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2863 2864 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2865 } 2866 2867 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2868 Register DstReg, 2869 Register Numer, 2870 Register Denom, 2871 bool IsDiv) const { 2872 const LLT S32 = LLT::scalar(32); 2873 const LLT S64 = LLT::scalar(64); 2874 const LLT S1 = LLT::scalar(1); 2875 Register RcpLo, RcpHi; 2876 2877 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2878 2879 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2880 2881 auto Zero64 = B.buildConstant(S64, 0); 2882 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2883 2884 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2885 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2886 2887 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2888 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2889 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2890 2891 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2892 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2893 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2894 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2895 2896 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2897 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2898 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2899 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2900 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2901 2902 auto Zero32 = B.buildConstant(S32, 0); 2903 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2904 auto Add2_HiC = 2905 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2906 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2907 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2908 2909 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2910 Register NumerLo = UnmergeNumer.getReg(0); 2911 Register NumerHi = UnmergeNumer.getReg(1); 2912 2913 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2914 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2915 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2916 Register Mul3_Lo = UnmergeMul3.getReg(0); 2917 Register Mul3_Hi = UnmergeMul3.getReg(1); 2918 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2919 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2920 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2921 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2922 2923 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2924 Register DenomLo = UnmergeDenom.getReg(0); 2925 Register DenomHi = UnmergeDenom.getReg(1); 2926 2927 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2928 auto C1 = B.buildSExt(S32, CmpHi); 2929 2930 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2931 auto C2 = B.buildSExt(S32, CmpLo); 2932 2933 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2934 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2935 2936 // TODO: Here and below portions of the code can be enclosed into if/endif. 2937 // Currently control flow is unconditional and we have 4 selects after 2938 // potential endif to substitute PHIs. 2939 2940 // if C3 != 0 ... 
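  // C3 is a sign-extended compare result: -1 when the 64-bit remainder
  // estimate Sub1 is still >= the denominator (the 64-bit comparison is
  // composed from the two 32-bit half comparisons above), 0 otherwise. The
  // "if"s noted here are therefore emitted branchlessly: both refinement arms
  // are computed, and the final selects on C3/C6 pick the live values.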
2941 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2942 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2943 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2944 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2945 2946 auto One64 = B.buildConstant(S64, 1); 2947 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2948 2949 auto C4 = 2950 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2951 auto C5 = 2952 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2953 auto C6 = B.buildSelect( 2954 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2955 2956 // if (C6 != 0) 2957 auto Add4 = B.buildAdd(S64, Add3, One64); 2958 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2959 2960 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2961 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2962 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2963 2964 // endif C6 2965 // endif C3 2966 2967 if (IsDiv) { 2968 auto Sel1 = B.buildSelect( 2969 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2970 B.buildSelect(DstReg, 2971 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2972 } else { 2973 auto Sel2 = B.buildSelect( 2974 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2975 B.buildSelect(DstReg, 2976 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2977 } 2978 } 2979 2980 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2981 MachineRegisterInfo &MRI, 2982 MachineIRBuilder &B) const { 2983 const LLT S64 = LLT::scalar(64); 2984 const LLT S32 = LLT::scalar(32); 2985 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2986 Register DstReg = MI.getOperand(0).getReg(); 2987 Register Num = MI.getOperand(1).getReg(); 2988 Register Den = MI.getOperand(2).getReg(); 2989 LLT Ty = MRI.getType(DstReg); 2990 2991 if (Ty == S32) 2992 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2993 else if (Ty == S64) 2994 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2995 else 2996 return false; 2997 2998 MI.eraseFromParent(); 2999 return true; 3000 3001 } 3002 3003 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3004 MachineRegisterInfo &MRI, 3005 MachineIRBuilder &B) const { 3006 const LLT S64 = LLT::scalar(64); 3007 const LLT S32 = LLT::scalar(32); 3008 3009 Register DstReg = MI.getOperand(0).getReg(); 3010 const LLT Ty = MRI.getType(DstReg); 3011 if (Ty != S32 && Ty != S64) 3012 return false; 3013 3014 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3015 3016 Register LHS = MI.getOperand(1).getReg(); 3017 Register RHS = MI.getOperand(2).getReg(); 3018 3019 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3020 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3021 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3022 3023 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3024 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3025 3026 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3027 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3028 3029 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3030 if (Ty == S32) 3031 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3032 else 3033 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3034 3035 Register Sign; 3036 if (IsDiv) 3037 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3038 else 3039 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3040 3041 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3042 B.buildSub(DstReg, UDivRem, Sign); 3043 3044 MI.eraseFromParent(); 3045 return true; 3046 } 3047 3048 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3049 MachineRegisterInfo &MRI, 3050 MachineIRBuilder &B) const { 3051 Register Res = MI.getOperand(0).getReg(); 3052 Register LHS = MI.getOperand(1).getReg(); 3053 Register RHS = MI.getOperand(2).getReg(); 3054 uint16_t Flags = MI.getFlags(); 3055 LLT ResTy = MRI.getType(Res); 3056 3057 const MachineFunction &MF = B.getMF(); 3058 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3059 MI.getFlag(MachineInstr::FmAfn); 3060 3061 if (!AllowInaccurateRcp) 3062 return false; 3063 3064 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3065 // 1 / x -> RCP(x) 3066 if (CLHS->isExactlyValue(1.0)) { 3067 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3068 .addUse(RHS) 3069 .setMIFlags(Flags); 3070 3071 MI.eraseFromParent(); 3072 return true; 3073 } 3074 3075 // -1 / x -> RCP( FNEG(x) ) 3076 if (CLHS->isExactlyValue(-1.0)) { 3077 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3078 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3079 .addUse(FNeg.getReg(0)) 3080 .setMIFlags(Flags); 3081 3082 MI.eraseFromParent(); 3083 return true; 3084 } 3085 } 3086 3087 // x / y -> x * (1.0 / y) 3088 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3089 .addUse(RHS) 3090 .setMIFlags(Flags); 3091 B.buildFMul(Res, LHS, RCP, Flags); 3092 3093 MI.eraseFromParent(); 3094 return true; 3095 } 3096 3097 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3098 MachineRegisterInfo &MRI, 3099 MachineIRBuilder &B) const { 3100 Register Res = MI.getOperand(0).getReg(); 3101 Register X = MI.getOperand(1).getReg(); 3102 Register Y = MI.getOperand(2).getReg(); 3103 uint16_t Flags = MI.getFlags(); 3104 LLT ResTy = MRI.getType(Res); 3105 3106 const MachineFunction &MF = B.getMF(); 3107 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3108 MI.getFlag(MachineInstr::FmAfn); 3109 3110 if (!AllowInaccurateRcp) 3111 return false; 3112 3113 auto NegY = B.buildFNeg(ResTy, Y); 3114 auto One = B.buildFConstant(ResTy, 1.0); 3115 3116 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3117 .addUse(Y) 3118 .setMIFlags(Flags); 3119 3120 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3121 R = B.buildFMA(ResTy, Tmp0, R, R); 3122 3123 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3124 R = B.buildFMA(ResTy, Tmp1, R, R); 3125 3126 auto Ret = B.buildFMul(ResTy, X, R); 3127 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3128 3129 B.buildFMA(Res, Tmp2, R, Ret); 3130 MI.eraseFromParent(); 3131 return true; 3132 } 3133 3134 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3135 MachineRegisterInfo &MRI, 3136 MachineIRBuilder &B) const { 3137 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3138 return true; 3139 3140 Register Res = MI.getOperand(0).getReg(); 3141 Register LHS = MI.getOperand(1).getReg(); 3142 Register RHS = MI.getOperand(2).getReg(); 3143 3144 uint16_t Flags = MI.getFlags(); 3145 3146 LLT S16 = LLT::scalar(16); 3147 LLT S32 = LLT::scalar(32); 3148 3149 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3150 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3151 3152 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3153 .addUse(RHSExt.getReg(0)) 3154 .setMIFlags(Flags); 3155 3156 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3157 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3158 3159 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3160 
.addUse(RDst.getReg(0)) 3161 .addUse(RHS) 3162 .addUse(LHS) 3163 .setMIFlags(Flags); 3164 3165 MI.eraseFromParent(); 3166 return true; 3167 } 3168 3169 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3170 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3171 static void toggleSPDenormMode(bool Enable, 3172 MachineIRBuilder &B, 3173 const GCNSubtarget &ST, 3174 AMDGPU::SIModeRegisterDefaults Mode) { 3175 // Set SP denorm mode to this value. 3176 unsigned SPDenormMode = 3177 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3178 3179 if (ST.hasDenormModeInst()) { 3180 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3181 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3182 3183 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3184 B.buildInstr(AMDGPU::S_DENORM_MODE) 3185 .addImm(NewDenormModeValue); 3186 3187 } else { 3188 // Select FP32 bit field in mode register. 3189 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3190 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3191 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3192 3193 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3194 .addImm(SPDenormMode) 3195 .addImm(SPDenormModeBitField); 3196 } 3197 } 3198 3199 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3200 MachineRegisterInfo &MRI, 3201 MachineIRBuilder &B) const { 3202 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3203 return true; 3204 3205 Register Res = MI.getOperand(0).getReg(); 3206 Register LHS = MI.getOperand(1).getReg(); 3207 Register RHS = MI.getOperand(2).getReg(); 3208 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3209 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3210 3211 uint16_t Flags = MI.getFlags(); 3212 3213 LLT S32 = LLT::scalar(32); 3214 LLT S1 = LLT::scalar(1); 3215 3216 auto One = B.buildFConstant(S32, 1.0f); 3217 3218 auto DenominatorScaled = 3219 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3220 .addUse(LHS) 3221 .addUse(RHS) 3222 .addImm(0) 3223 .setMIFlags(Flags); 3224 auto NumeratorScaled = 3225 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3226 .addUse(LHS) 3227 .addUse(RHS) 3228 .addImm(1) 3229 .setMIFlags(Flags); 3230 3231 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3232 .addUse(DenominatorScaled.getReg(0)) 3233 .setMIFlags(Flags); 3234 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3235 3236 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3237 // aren't modeled as reading it. 
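  // Informal rationale for the mode toggling below: the pre-scaled
  // Newton-Raphson iteration can produce intermediate values in the denormal
  // range even when the operands and result are normal, so if the function's
  // default FP32 mode flushes denormals they are temporarily enabled around
  // the FMA chain and the default mode is restored before div_fmas/div_fixup.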
3238 if (!Mode.allFP32Denormals()) 3239 toggleSPDenormMode(true, B, ST, Mode); 3240 3241 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3242 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3243 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3244 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3245 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3246 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3247 3248 if (!Mode.allFP32Denormals()) 3249 toggleSPDenormMode(false, B, ST, Mode); 3250 3251 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3252 .addUse(Fma4.getReg(0)) 3253 .addUse(Fma1.getReg(0)) 3254 .addUse(Fma3.getReg(0)) 3255 .addUse(NumeratorScaled.getReg(1)) 3256 .setMIFlags(Flags); 3257 3258 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3259 .addUse(Fmas.getReg(0)) 3260 .addUse(RHS) 3261 .addUse(LHS) 3262 .setMIFlags(Flags); 3263 3264 MI.eraseFromParent(); 3265 return true; 3266 } 3267 3268 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3269 MachineRegisterInfo &MRI, 3270 MachineIRBuilder &B) const { 3271 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3272 return true; 3273 3274 Register Res = MI.getOperand(0).getReg(); 3275 Register LHS = MI.getOperand(1).getReg(); 3276 Register RHS = MI.getOperand(2).getReg(); 3277 3278 uint16_t Flags = MI.getFlags(); 3279 3280 LLT S64 = LLT::scalar(64); 3281 LLT S1 = LLT::scalar(1); 3282 3283 auto One = B.buildFConstant(S64, 1.0); 3284 3285 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3286 .addUse(LHS) 3287 .addUse(RHS) 3288 .addImm(0) 3289 .setMIFlags(Flags); 3290 3291 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3292 3293 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3294 .addUse(DivScale0.getReg(0)) 3295 .setMIFlags(Flags); 3296 3297 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3298 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3299 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3300 3301 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3302 .addUse(LHS) 3303 .addUse(RHS) 3304 .addImm(1) 3305 .setMIFlags(Flags); 3306 3307 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3308 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3309 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3310 3311 Register Scale; 3312 if (!ST.hasUsableDivScaleConditionOutput()) { 3313 // Workaround a hardware bug on SI where the condition output from div_scale 3314 // is not usable. 
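  // Rather than use the unusable condition bit, it is recomputed from the
  // data: the high 32 bits of the numerator and denominator are compared
  // against the high 32 bits of the two div_scale results to detect whether
  // each operand was scaled, and the XOR of those comparisons stands in for
  // the missing flag fed to div_fmas.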
3315 3316 LLT S32 = LLT::scalar(32); 3317 3318 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3319 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3320 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3321 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3322 3323 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3324 Scale1Unmerge.getReg(1)); 3325 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3326 Scale0Unmerge.getReg(1)); 3327 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3328 } else { 3329 Scale = DivScale1.getReg(1); 3330 } 3331 3332 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3333 .addUse(Fma4.getReg(0)) 3334 .addUse(Fma3.getReg(0)) 3335 .addUse(Mul.getReg(0)) 3336 .addUse(Scale) 3337 .setMIFlags(Flags); 3338 3339 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3340 .addUse(Fmas.getReg(0)) 3341 .addUse(RHS) 3342 .addUse(LHS) 3343 .setMIFlags(Flags); 3344 3345 MI.eraseFromParent(); 3346 return true; 3347 } 3348 3349 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3350 MachineRegisterInfo &MRI, 3351 MachineIRBuilder &B) const { 3352 Register Res = MI.getOperand(0).getReg(); 3353 Register LHS = MI.getOperand(2).getReg(); 3354 Register RHS = MI.getOperand(3).getReg(); 3355 uint16_t Flags = MI.getFlags(); 3356 3357 LLT S32 = LLT::scalar(32); 3358 LLT S1 = LLT::scalar(1); 3359 3360 auto Abs = B.buildFAbs(S32, RHS, Flags); 3361 const APFloat C0Val(1.0f); 3362 3363 auto C0 = B.buildConstant(S32, 0x6f800000); 3364 auto C1 = B.buildConstant(S32, 0x2f800000); 3365 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3366 3367 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3368 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3369 3370 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3371 3372 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3373 .addUse(Mul0.getReg(0)) 3374 .setMIFlags(Flags); 3375 3376 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3377 3378 B.buildFMul(Res, Sel, Mul1, Flags); 3379 3380 MI.eraseFromParent(); 3381 return true; 3382 } 3383 3384 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3385 // FIXME: Why do we handle this one but not other removed instructions? 3386 // 3387 // Reciprocal square root. The clamp prevents infinite results, clamping 3388 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3389 // +-max_float. 3390 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3391 MachineRegisterInfo &MRI, 3392 MachineIRBuilder &B) const { 3393 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3394 return true; 3395 3396 Register Dst = MI.getOperand(0).getReg(); 3397 Register Src = MI.getOperand(2).getReg(); 3398 auto Flags = MI.getFlags(); 3399 3400 LLT Ty = MRI.getType(Dst); 3401 3402 const fltSemantics *FltSemantics; 3403 if (Ty == LLT::scalar(32)) 3404 FltSemantics = &APFloat::IEEEsingle(); 3405 else if (Ty == LLT::scalar(64)) 3406 FltSemantics = &APFloat::IEEEdouble(); 3407 else 3408 return false; 3409 3410 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3411 .addUse(Src) 3412 .setMIFlags(Flags); 3413 3414 // We don't need to concern ourselves with the snan handling difference, since 3415 // the rsq quieted (or not) so use the one which will directly select. 
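  // The code below clamps the rsq result to +/- max_float, using the IEEE
  // min/max variants when the function runs in IEEE mode so that the generic
  // nodes map directly onto selectable instructions (my reading of the
  // UseIEEE choice).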
3416 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3417 const bool UseIEEE = MFI->getMode().IEEE; 3418 3419 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3420 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3421 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3422 3423 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3424 3425 if (UseIEEE) 3426 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3427 else 3428 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3429 MI.eraseFromParent(); 3430 return true; 3431 } 3432 3433 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3434 switch (IID) { 3435 case Intrinsic::amdgcn_ds_fadd: 3436 return AMDGPU::G_ATOMICRMW_FADD; 3437 case Intrinsic::amdgcn_ds_fmin: 3438 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3439 case Intrinsic::amdgcn_ds_fmax: 3440 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3441 default: 3442 llvm_unreachable("not a DS FP intrinsic"); 3443 } 3444 } 3445 3446 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3447 MachineInstr &MI, 3448 Intrinsic::ID IID) const { 3449 GISelChangeObserver &Observer = Helper.Observer; 3450 Observer.changingInstr(MI); 3451 3452 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3453 3454 // The remaining operands were used to set fields in the MemOperand on 3455 // construction. 3456 for (int I = 6; I > 3; --I) 3457 MI.RemoveOperand(I); 3458 3459 MI.RemoveOperand(1); // Remove the intrinsic ID. 3460 Observer.changedInstr(MI); 3461 return true; 3462 } 3463 3464 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3465 MachineRegisterInfo &MRI, 3466 MachineIRBuilder &B) const { 3467 uint64_t Offset = 3468 ST.getTargetLowering()->getImplicitParameterOffset( 3469 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3470 LLT DstTy = MRI.getType(DstReg); 3471 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3472 3473 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3474 if (!loadInputValue(KernargPtrReg, B, 3475 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3476 return false; 3477 3478 // FIXME: This should be nuw 3479 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3480 return true; 3481 } 3482 3483 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3484 MachineRegisterInfo &MRI, 3485 MachineIRBuilder &B) const { 3486 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3487 if (!MFI->isEntryFunction()) { 3488 return legalizePreloadedArgIntrin(MI, MRI, B, 3489 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3490 } 3491 3492 Register DstReg = MI.getOperand(0).getReg(); 3493 if (!getImplicitArgPtr(DstReg, MRI, B)) 3494 return false; 3495 3496 MI.eraseFromParent(); 3497 return true; 3498 } 3499 3500 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3501 MachineRegisterInfo &MRI, 3502 MachineIRBuilder &B, 3503 unsigned AddrSpace) const { 3504 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3505 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3506 Register Hi32 = Unmerge.getReg(1); 3507 3508 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3514 // offset (the offset that is included in bounds checking and swizzling, to be 3515 // split between the instruction's voffset and immoffset fields) 
and soffset 3516 // (the offset that is excluded from bounds checking and swizzling, to go in 3517 // the instruction's soffset field). This function takes the first kind of 3518 // offset and figures out how to split it between voffset and immoffset. 3519 std::tuple<Register, unsigned, unsigned> 3520 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3521 Register OrigOffset) const { 3522 const unsigned MaxImm = 4095; 3523 Register BaseReg; 3524 unsigned TotalConstOffset; 3525 const LLT S32 = LLT::scalar(32); 3526 MachineRegisterInfo &MRI = *B.getMRI(); 3527 3528 std::tie(BaseReg, TotalConstOffset) = 3529 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3530 3531 unsigned ImmOffset = TotalConstOffset; 3532 3533 // If BaseReg is a pointer, convert it to int. 3534 if (MRI.getType(BaseReg).isPointer()) 3535 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3536 3537 // If the immediate value is too big for the immoffset field, put the value 3538 // and -4096 into the immoffset field so that the value that is copied/added 3539 // for the voffset field is a multiple of 4096, and it stands more chance 3540 // of being CSEd with the copy/add for another similar load/store. 3541 // However, do not do that rounding down to a multiple of 4096 if that is a 3542 // negative number, as it appears to be illegal to have a negative offset 3543 // in the vgpr, even if adding the immediate offset makes it positive. 3544 unsigned Overflow = ImmOffset & ~MaxImm; 3545 ImmOffset -= Overflow; 3546 if ((int32_t)Overflow < 0) { 3547 Overflow += ImmOffset; 3548 ImmOffset = 0; 3549 } 3550 3551 if (Overflow != 0) { 3552 if (!BaseReg) { 3553 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3554 } else { 3555 auto OverflowVal = B.buildConstant(S32, Overflow); 3556 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3557 } 3558 } 3559 3560 if (!BaseReg) 3561 BaseReg = B.buildConstant(S32, 0).getReg(0); 3562 3563 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3564 } 3565 3566 /// Handle register layout difference for f16 images for some subtargets. 
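/// A rough summary of the cases handled below (based on reading the code, not
/// a hardware spec): unpacked-D16 subtargets widen every s16 element into its
/// own s32 register; subtargets with the image-store D16 bug pad 2/3/4 element
/// stores out to a dword-aligned vector with undef lanes; otherwise the packed
/// value is returned unchanged. For example (sketch, not generated output), a
/// <2 x s16> source on an unpacked-D16 subtarget roughly becomes:
///   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %src(<2 x s16>)
///   %lo32:_(s32) = G_ANYEXT %lo(s16)
///   %hi32:_(s32) = G_ANYEXT %hi(s16)
///   %wide:_(<2 x s32>) = G_BUILD_VECTOR %lo32(s32), %hi32(s32)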
3567 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3568 MachineRegisterInfo &MRI, 3569 Register Reg, 3570 bool ImageStore) const { 3571 const LLT S16 = LLT::scalar(16); 3572 const LLT S32 = LLT::scalar(32); 3573 LLT StoreVT = MRI.getType(Reg); 3574 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3575 3576 if (ST.hasUnpackedD16VMem()) { 3577 auto Unmerge = B.buildUnmerge(S16, Reg); 3578 3579 SmallVector<Register, 4> WideRegs; 3580 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3581 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3582 3583 int NumElts = StoreVT.getNumElements(); 3584 3585 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3586 } 3587 3588 if (ImageStore && ST.hasImageStoreD16Bug()) { 3589 if (StoreVT.getNumElements() == 2) { 3590 SmallVector<Register, 4> PackedRegs; 3591 Reg = B.buildBitcast(S32, Reg).getReg(0); 3592 PackedRegs.push_back(Reg); 3593 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3594 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3595 } 3596 3597 if (StoreVT.getNumElements() == 3) { 3598 SmallVector<Register, 4> PackedRegs; 3599 auto Unmerge = B.buildUnmerge(S16, Reg); 3600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3601 PackedRegs.push_back(Unmerge.getReg(I)); 3602 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3603 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3604 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3605 } 3606 3607 if (StoreVT.getNumElements() == 4) { 3608 SmallVector<Register, 4> PackedRegs; 3609 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3610 auto Unmerge = B.buildUnmerge(S32, Reg); 3611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3612 PackedRegs.push_back(Unmerge.getReg(I)); 3613 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3614 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3615 } 3616 3617 llvm_unreachable("invalid data type"); 3618 } 3619 3620 return Reg; 3621 } 3622 3623 Register AMDGPULegalizerInfo::fixStoreSourceType( 3624 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3625 MachineRegisterInfo *MRI = B.getMRI(); 3626 LLT Ty = MRI->getType(VData); 3627 3628 const LLT S16 = LLT::scalar(16); 3629 3630 // Fixup illegal register types for i8 stores. 3631 if (Ty == LLT::scalar(8) || Ty == S16) { 3632 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3633 return AnyExt; 3634 } 3635 3636 if (Ty.isVector()) { 3637 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3638 if (IsFormat) 3639 return handleD16VData(B, *MRI, VData); 3640 } 3641 } 3642 3643 return VData; 3644 } 3645 3646 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3647 MachineRegisterInfo &MRI, 3648 MachineIRBuilder &B, 3649 bool IsTyped, 3650 bool IsFormat) const { 3651 Register VData = MI.getOperand(1).getReg(); 3652 LLT Ty = MRI.getType(VData); 3653 LLT EltTy = Ty.getScalarType(); 3654 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3655 const LLT S32 = LLT::scalar(32); 3656 3657 VData = fixStoreSourceType(B, VData, IsFormat); 3658 Register RSrc = MI.getOperand(2).getReg(); 3659 3660 MachineMemOperand *MMO = *MI.memoperands_begin(); 3661 const int MemSize = MMO->getSize(); 3662 3663 unsigned ImmOffset; 3664 unsigned TotalOffset; 3665 3666 // The typed intrinsics add an immediate after the registers. 3667 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3668 3669 // The struct intrinsic variants add one additional operand over raw. 3670 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3671 Register VIndex; 3672 int OpOffset = 0; 3673 if (HasVIndex) { 3674 VIndex = MI.getOperand(3).getReg(); 3675 OpOffset = 1; 3676 } 3677 3678 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3679 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3680 3681 unsigned Format = 0; 3682 if (IsTyped) { 3683 Format = MI.getOperand(5 + OpOffset).getImm(); 3684 ++OpOffset; 3685 } 3686 3687 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3688 3689 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3690 if (TotalOffset != 0) 3691 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3692 3693 unsigned Opc; 3694 if (IsTyped) { 3695 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3696 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3697 } else if (IsFormat) { 3698 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3699 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3700 } else { 3701 switch (MemSize) { 3702 case 1: 3703 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3704 break; 3705 case 2: 3706 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3707 break; 3708 default: 3709 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3710 break; 3711 } 3712 } 3713 3714 if (!VIndex) 3715 VIndex = B.buildConstant(S32, 0).getReg(0); 3716 3717 auto MIB = B.buildInstr(Opc) 3718 .addUse(VData) // vdata 3719 .addUse(RSrc) // rsrc 3720 .addUse(VIndex) // vindex 3721 .addUse(VOffset) // voffset 3722 .addUse(SOffset) // soffset 3723 .addImm(ImmOffset); // offset(imm) 3724 3725 if (IsTyped) 3726 MIB.addImm(Format); 3727 3728 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3729 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3730 .addMemOperand(MMO); 3731 3732 MI.eraseFromParent(); 3733 return true; 3734 } 3735 3736 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3737 MachineRegisterInfo &MRI, 3738 MachineIRBuilder &B, 3739 bool IsFormat, 3740 bool IsTyped) const { 3741 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3742 MachineMemOperand *MMO = *MI.memoperands_begin(); 3743 const int MemSize = MMO->getSize(); 3744 const LLT S32 = LLT::scalar(32); 3745 3746 Register Dst = MI.getOperand(0).getReg(); 3747 Register RSrc = MI.getOperand(2).getReg(); 3748 3749 // The typed intrinsics add an immediate after the registers. 3750 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3751 3752 // The struct intrinsic variants add one additional operand over raw. 
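  // Operand order assumed by the indices below (informal note):
  //   raw:    dst, intrinsic-id, rsrc, voffset, soffset, [format,] aux
  //   struct: dst, intrinsic-id, rsrc, vindex, voffset, soffset, [format,] aux
  // where the format immediate is only present for the typed variants.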
3753 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3754 Register VIndex; 3755 int OpOffset = 0; 3756 if (HasVIndex) { 3757 VIndex = MI.getOperand(3).getReg(); 3758 OpOffset = 1; 3759 } 3760 3761 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3762 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3763 3764 unsigned Format = 0; 3765 if (IsTyped) { 3766 Format = MI.getOperand(5 + OpOffset).getImm(); 3767 ++OpOffset; 3768 } 3769 3770 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3771 unsigned ImmOffset; 3772 unsigned TotalOffset; 3773 3774 LLT Ty = MRI.getType(Dst); 3775 LLT EltTy = Ty.getScalarType(); 3776 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3777 const bool Unpacked = ST.hasUnpackedD16VMem(); 3778 3779 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3780 if (TotalOffset != 0) 3781 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3782 3783 unsigned Opc; 3784 3785 if (IsTyped) { 3786 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3787 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3788 } else if (IsFormat) { 3789 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3790 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3791 } else { 3792 switch (MemSize) { 3793 case 1: 3794 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3795 break; 3796 case 2: 3797 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3798 break; 3799 default: 3800 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3801 break; 3802 } 3803 } 3804 3805 Register LoadDstReg; 3806 3807 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3808 LLT UnpackedTy = Ty.changeElementSize(32); 3809 3810 if (IsExtLoad) 3811 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3812 else if (Unpacked && IsD16 && Ty.isVector()) 3813 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3814 else 3815 LoadDstReg = Dst; 3816 3817 if (!VIndex) 3818 VIndex = B.buildConstant(S32, 0).getReg(0); 3819 3820 auto MIB = B.buildInstr(Opc) 3821 .addDef(LoadDstReg) // vdata 3822 .addUse(RSrc) // rsrc 3823 .addUse(VIndex) // vindex 3824 .addUse(VOffset) // voffset 3825 .addUse(SOffset) // soffset 3826 .addImm(ImmOffset); // offset(imm) 3827 3828 if (IsTyped) 3829 MIB.addImm(Format); 3830 3831 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3832 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3833 .addMemOperand(MMO); 3834 3835 if (LoadDstReg != Dst) { 3836 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3837 3838 // The result was widened for the extending load; truncate it back to the original type. 3839 if (IsExtLoad) 3840 B.buildTrunc(Dst, LoadDstReg); 3841 else { 3842 // Repack to original 16-bit vector result 3843 // FIXME: G_TRUNC should work, but legalization currently fails 3844 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3845 SmallVector<Register, 4> Repack; 3846 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3847 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3848 B.buildMerge(Dst, Repack); 3849 } 3850 } 3851 3852 MI.eraseFromParent(); 3853 return true; 3854 } 3855 3856 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3857 MachineIRBuilder &B, 3858 bool IsInc) const { 3859 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3860 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3861 B.buildInstr(Opc) 3862 .addDef(MI.getOperand(0).getReg()) 3863 .addUse(MI.getOperand(2).getReg()) 3864 .addUse(MI.getOperand(3).getReg()) 3865 .cloneMemRefs(MI); 3866 MI.eraseFromParent(); 3867 return true; 3868 } 3869 3870 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3871 switch (IntrID) { 3872 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3873 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3874 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3875 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3876 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3877 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3878 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3879 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3881 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3882 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3883 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3884 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3885 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3886 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3887 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3888 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3889 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3890 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3891 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3893 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3894 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3896 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3897 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3898 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3899 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3900 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3901 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3902 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3903 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3904 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3905 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3906 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3908 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3909 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3911 case Intrinsic::amdgcn_buffer_atomic_fadd: 3912 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3913 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3915 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3916 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3918 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3919 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3921 default: 3922 llvm_unreachable("unhandled atomic opcode"); 3923 } 3924 } 3925 3926 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3927 MachineIRBuilder &B, 3928 Intrinsic::ID IID) const { 3929 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3930 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3931 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3932 3933 Register Dst; 3934 3935 int OpOffset = 0; 3936 if (HasReturn) { 3937 // A few FP atomics do not support return values. 
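    // When the return value is absent there is no def, so every subsequent
    // operand index shifts down by one; the OpOffset = -1 in the else branch
    // accounts for that.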
3938 Dst = MI.getOperand(0).getReg(); 3939 } else { 3940 OpOffset = -1; 3941 } 3942 3943 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3944 Register CmpVal; 3945 3946 if (IsCmpSwap) { 3947 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3948 ++OpOffset; 3949 } 3950 3951 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3952 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3953 3954 // The struct intrinsic variants add one additional operand over raw. 3955 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3956 Register VIndex; 3957 if (HasVIndex) { 3958 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3959 ++OpOffset; 3960 } 3961 3962 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3963 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3964 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3965 3966 MachineMemOperand *MMO = *MI.memoperands_begin(); 3967 3968 unsigned ImmOffset; 3969 unsigned TotalOffset; 3970 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3971 if (TotalOffset != 0) 3972 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3973 3974 if (!VIndex) 3975 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3976 3977 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3978 3979 if (HasReturn) 3980 MIB.addDef(Dst); 3981 3982 MIB.addUse(VData); // vdata 3983 3984 if (IsCmpSwap) 3985 MIB.addReg(CmpVal); 3986 3987 MIB.addUse(RSrc) // rsrc 3988 .addUse(VIndex) // vindex 3989 .addUse(VOffset) // voffset 3990 .addUse(SOffset) // soffset 3991 .addImm(ImmOffset) // offset(imm) 3992 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3993 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3994 .addMemOperand(MMO); 3995 3996 MI.eraseFromParent(); 3997 return true; 3998 } 3999 4000 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4001 /// vector with s16 typed elements. 4002 static void packImageA16AddressToDwords( 4003 MachineIRBuilder &B, MachineInstr &MI, 4004 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4005 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4006 const LLT S16 = LLT::scalar(16); 4007 const LLT V2S16 = LLT::vector(2, 16); 4008 4009 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4010 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4011 if (!SrcOp.isReg()) 4012 continue; // _L to _LZ may have eliminated this. 4013 4014 Register AddrReg = SrcOp.getReg(); 4015 4016 if (I < Intr->GradientStart) { 4017 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4018 PackedAddrs.push_back(AddrReg); 4019 } else { 4020 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4021 // derivatives dx/dh and dx/dv are packed with undef. 
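        // Informally: use a half-filled pair when this is the last address
        // component, or the last gradient of the dH/dV group when each group
        // has an odd number of components, or when the next operand was
        // already dropped by the _L -> _LZ fold; otherwise pack this
        // component together with the following one.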
4022 if (((I + 1) >= EndIdx) || 4023 ((Intr->NumGradients / 2) % 2 == 1 && 4024 (I == static_cast<unsigned>(Intr->GradientStart + 4025 (Intr->NumGradients / 2) - 1) || 4026 I == static_cast<unsigned>(Intr->GradientStart + 4027 Intr->NumGradients - 1))) || 4028 // Check for _L to _LZ optimization 4029 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4030 PackedAddrs.push_back( 4031 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4032 .getReg(0)); 4033 } else { 4034 PackedAddrs.push_back( 4035 B.buildBuildVector( 4036 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4037 .getReg(0)); 4038 ++I; 4039 } 4040 } 4041 } 4042 } 4043 4044 /// Convert from separate vaddr components to a single vector address register, 4045 /// and replace the remaining operands with $noreg. 4046 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4047 int DimIdx, int NumVAddrs) { 4048 const LLT S32 = LLT::scalar(32); 4049 4050 SmallVector<Register, 8> AddrRegs; 4051 for (int I = 0; I != NumVAddrs; ++I) { 4052 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4053 if (SrcOp.isReg()) { 4054 AddrRegs.push_back(SrcOp.getReg()); 4055 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4056 } 4057 } 4058 4059 int NumAddrRegs = AddrRegs.size(); 4060 if (NumAddrRegs != 1) { 4061 // Round up to 8 elements for v5-v7 4062 // FIXME: Missing intermediate sized register classes and instructions. 4063 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4064 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4065 auto Undef = B.buildUndef(S32); 4066 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4067 NumAddrRegs = RoundedNumRegs; 4068 } 4069 4070 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4071 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4072 } 4073 4074 for (int I = 1; I != NumVAddrs; ++I) { 4075 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4076 if (SrcOp.isReg()) 4077 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4078 } 4079 } 4080 4081 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4082 /// 4083 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 4084 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4085 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4086 /// registers. 4087 /// 4088 /// We don't want to directly select image instructions just yet, but also want 4089 /// to expose all register repacking to the legalizer/combiners. We also don't 4090 /// want a selected instruction entering RegBankSelect. In order to avoid 4091 /// defining a multitude of intermediate image instructions, directly hack on 4092 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4093 /// the now-unnecessary arguments with $noreg. 4094 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4095 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4096 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4097 4098 const unsigned NumDefs = MI.getNumExplicitDefs(); 4099 const unsigned ArgOffset = NumDefs + 1; 4100 bool IsTFE = NumDefs == 2; 4101 // We are only processing the operands of d16 image operations on subtargets 4102 // that use the unpacked register layout, or need to repack the TFE result. 4103 4104 // TODO: Do we need to guard against already legalized intrinsics?
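  // Rough order of the rewrites that follow: look up the base opcode info,
  // fold away trivial cases (dmask == 0 loads, _L/_mip with a zero lod),
  // repack 16-bit address and gradient operands, and finally fix up the
  // D16/TFE result register layout.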
4105 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4106 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4107 4108 MachineRegisterInfo *MRI = B.getMRI(); 4109 const LLT S32 = LLT::scalar(32); 4110 const LLT S16 = LLT::scalar(16); 4111 const LLT V2S16 = LLT::vector(2, 16); 4112 4113 unsigned DMask = 0; 4114 4115 // Check for 16 bit addresses and pack if true. 4116 LLT GradTy = 4117 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4118 LLT AddrTy = 4119 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4120 const bool IsG16 = GradTy == S16; 4121 const bool IsA16 = AddrTy == S16; 4122 4123 int DMaskLanes = 0; 4124 if (!BaseOpcode->Atomic) { 4125 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4126 if (BaseOpcode->Gather4) { 4127 DMaskLanes = 4; 4128 } else if (DMask != 0) { 4129 DMaskLanes = countPopulation(DMask); 4130 } else if (!IsTFE && !BaseOpcode->Store) { 4131 // If dmask is 0, this is a no-op load. This can be eliminated. 4132 B.buildUndef(MI.getOperand(0)); 4133 MI.eraseFromParent(); 4134 return true; 4135 } 4136 } 4137 4138 Observer.changingInstr(MI); 4139 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4140 4141 unsigned NewOpcode = NumDefs == 0 ? 4142 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4143 4144 // Track that we legalized this 4145 MI.setDesc(B.getTII().get(NewOpcode)); 4146 4147 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4148 // dmask to be at least 1 otherwise the instruction will fail 4149 if (IsTFE && DMask == 0) { 4150 DMask = 0x1; 4151 DMaskLanes = 1; 4152 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4153 } 4154 4155 if (BaseOpcode->Atomic) { 4156 Register VData0 = MI.getOperand(2).getReg(); 4157 LLT Ty = MRI->getType(VData0); 4158 4159 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4160 if (Ty.isVector()) 4161 return false; 4162 4163 if (BaseOpcode->AtomicX2) { 4164 Register VData1 = MI.getOperand(3).getReg(); 4165 // The two values are packed in one register. 4166 LLT PackedTy = LLT::vector(2, Ty); 4167 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4168 MI.getOperand(2).setReg(Concat.getReg(0)); 4169 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4170 } 4171 } 4172 4173 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4174 4175 // Optimize _L to _LZ when _L is zero 4176 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4177 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4178 const ConstantFP *ConstantLod; 4179 4180 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4181 m_GFCst(ConstantLod))) { 4182 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4183 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4184 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4185 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4186 Intr->Dim); 4187 4188 // The starting indexes should remain in the same place. 
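        // (Informal note) The intrinsic ID is rewritten to the _lz form and
        // the constant lod operand is dropped, so all later operand indexing
        // must go through the updated ImageDimIntrinsicInfo assigned below.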
4189 --CorrectedNumVAddrs; 4190 4191 MI.getOperand(MI.getNumExplicitDefs()) 4192 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4193 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4194 Intr = NewImageDimIntr; 4195 } 4196 } 4197 } 4198 4199 // Optimize _mip away, when 'lod' is zero 4200 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4201 int64_t ConstantLod; 4202 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4203 m_ICst(ConstantLod))) { 4204 if (ConstantLod == 0) { 4205 // TODO: Change intrinsic opcode and remove operand instead or replacing 4206 // it with 0, as the _L to _LZ handling is done above. 4207 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4208 --CorrectedNumVAddrs; 4209 } 4210 } 4211 } 4212 4213 // Rewrite the addressing register layout before doing anything else. 4214 if (IsA16 || IsG16) { 4215 if (IsA16) { 4216 // Target must support the feature and gradients need to be 16 bit too 4217 if (!ST.hasA16() || !IsG16) 4218 return false; 4219 } else if (!ST.hasG16()) 4220 return false; 4221 4222 if (Intr->NumVAddrs > 1) { 4223 SmallVector<Register, 4> PackedRegs; 4224 // Don't compress addresses for G16 4225 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4226 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4227 PackEndIdx); 4228 4229 if (!IsA16) { 4230 // Add uncompressed address 4231 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4232 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4233 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4234 PackedRegs.push_back(AddrReg); 4235 } 4236 } 4237 4238 // See also below in the non-a16 branch 4239 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4240 4241 if (!UseNSA && PackedRegs.size() > 1) { 4242 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4243 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4244 PackedRegs[0] = Concat.getReg(0); 4245 PackedRegs.resize(1); 4246 } 4247 4248 const unsigned NumPacked = PackedRegs.size(); 4249 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4250 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4251 if (!SrcOp.isReg()) { 4252 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4253 continue; 4254 } 4255 4256 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4257 4258 if (I - Intr->VAddrStart < NumPacked) 4259 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4260 else 4261 SrcOp.setReg(AMDGPU::NoRegister); 4262 } 4263 } 4264 } else { 4265 // If the register allocator cannot place the address registers contiguously 4266 // without introducing moves, then using the non-sequential address encoding 4267 // is always preferable, since it saves VALU instructions and is usually a 4268 // wash in terms of code size or even better. 4269 // 4270 // However, we currently have no way of hinting to the register allocator 4271 // that MIMG addresses should be placed contiguously when it is possible to 4272 // do so, so force non-NSA for the common 2-address case as a heuristic. 4273 // 4274 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4275 // allocation when possible. 
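    // Hence the >= 3 threshold below: the one and two address cases stay in
    // the packed form, mirroring the PackedRegs.size() >= 3 check used in the
    // A16/G16 branch above (informal reading of the heuristic).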
4276 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4277 4278 if (!UseNSA && Intr->NumVAddrs > 1) 4279 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4280 Intr->NumVAddrs); 4281 } 4282 4283 int Flags = 0; 4284 if (IsA16) 4285 Flags |= 1; 4286 if (IsG16) 4287 Flags |= 2; 4288 MI.addOperand(MachineOperand::CreateImm(Flags)); 4289 4290 if (BaseOpcode->Store) { // No TFE for stores? 4291 // TODO: Handle dmask trim 4292 Register VData = MI.getOperand(1).getReg(); 4293 LLT Ty = MRI->getType(VData); 4294 if (!Ty.isVector() || Ty.getElementType() != S16) 4295 return true; 4296 4297 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4298 if (RepackedReg != VData) { 4299 MI.getOperand(1).setReg(RepackedReg); 4300 } 4301 4302 return true; 4303 } 4304 4305 Register DstReg = MI.getOperand(0).getReg(); 4306 LLT Ty = MRI->getType(DstReg); 4307 const LLT EltTy = Ty.getScalarType(); 4308 const bool IsD16 = Ty.getScalarType() == S16; 4309 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4310 4311 // Confirm that the return type is large enough for the dmask specified 4312 if (NumElts < DMaskLanes) 4313 return false; 4314 4315 if (NumElts > 4 || DMaskLanes > 4) 4316 return false; 4317 4318 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4319 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4320 4321 // The raw dword aligned data component of the load. The only legal cases 4322 // where this matters should be when using the packed D16 format, for 4323 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4324 LLT RoundedTy; 4325 4326 // S32 vector to cover all data, plus TFE result element. 4327 LLT TFETy; 4328 4329 // Register type to use for each loaded component. Will be S32 or V2S16. 4330 LLT RegTy; 4331 4332 if (IsD16 && ST.hasUnpackedD16VMem()) { 4333 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4334 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4335 RegTy = S32; 4336 } else { 4337 unsigned EltSize = EltTy.getSizeInBits(); 4338 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4339 unsigned RoundedSize = 32 * RoundedElts; 4340 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4341 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4342 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4343 } 4344 4345 // The return type does not need adjustment. 4346 // TODO: Should we change s16 case to s32 or <2 x s16>? 4347 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4348 return true; 4349 4350 Register Dst1Reg; 4351 4352 // Insert after the instruction. 4353 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4354 4355 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4356 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4357 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4358 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4359 4360 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4361 4362 MI.getOperand(0).setReg(NewResultReg); 4363 4364 // In the IR, TFE is supposed to be used with a 2 element struct return 4365 // type. The instruction really returns these two values in one contiguous 4366 // register, with one additional dword beyond the loaded data. Rewrite the 4367 // return type to use a single register result. 4368 4369 if (IsTFE) { 4370 Dst1Reg = MI.getOperand(1).getReg(); 4371 if (MRI->getType(Dst1Reg) != S32) 4372 return false; 4373 4374 // TODO: Make sure the TFE operand bit is set.
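    // Informal note on the TFE shape: the IR-level form has two defs
    // ({data, i32 err}); the second def is dropped here and both values are
    // recovered later by unmerging NewResultReg, which is one dword wider
    // than the loaded data.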
4375 MI.RemoveOperand(1); 4376 4377 // Handle the easy case that requires no repack instructions. 4378 if (Ty == S32) { 4379 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4380 return true; 4381 } 4382 } 4383 4384 // Now figure out how to copy the new result register back into the old 4385 // result. 4386 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4387 4388 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4389 4390 if (ResultNumRegs == 1) { 4391 assert(!IsTFE); 4392 ResultRegs[0] = NewResultReg; 4393 } else { 4394 // We have to repack into a new vector of some kind. 4395 for (int I = 0; I != NumDataRegs; ++I) 4396 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4397 B.buildUnmerge(ResultRegs, NewResultReg); 4398 4399 // Drop the final TFE element to get the data part. The TFE result is 4400 // directly written to the right place already. 4401 if (IsTFE) 4402 ResultRegs.resize(NumDataRegs); 4403 } 4404 4405 // For an s16 scalar result, we form an s32 result with a truncate regardless 4406 // of packed vs. unpacked. 4407 if (IsD16 && !Ty.isVector()) { 4408 B.buildTrunc(DstReg, ResultRegs[0]); 4409 return true; 4410 } 4411 4412 // Avoid a build/concat_vector of 1 entry. 4413 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4414 B.buildBitcast(DstReg, ResultRegs[0]); 4415 return true; 4416 } 4417 4418 assert(Ty.isVector()); 4419 4420 if (IsD16) { 4421 // For packed D16 results with TFE enabled, all the data components are 4422 // S32. Cast back to the expected type. 4423 // 4424 // TODO: We don't really need to use load s32 elements. We would only need one 4425 // cast for the TFE result if a multiple of v2s16 was used. 4426 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4427 for (Register &Reg : ResultRegs) 4428 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4429 } else if (ST.hasUnpackedD16VMem()) { 4430 for (Register &Reg : ResultRegs) 4431 Reg = B.buildTrunc(S16, Reg).getReg(0); 4432 } 4433 } 4434 4435 auto padWithUndef = [&](LLT Ty, int NumElts) { 4436 if (NumElts == 0) 4437 return; 4438 Register Undef = B.buildUndef(Ty).getReg(0); 4439 for (int I = 0; I != NumElts; ++I) 4440 ResultRegs.push_back(Undef); 4441 }; 4442 4443 // Pad out any elements eliminated due to the dmask. 4444 LLT ResTy = MRI->getType(ResultRegs[0]); 4445 if (!ResTy.isVector()) { 4446 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4447 B.buildBuildVector(DstReg, ResultRegs); 4448 return true; 4449 } 4450 4451 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4452 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4453 4454 // Deal with the one annoying legal case. 
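  // For <3 x s16> there is no 48-bit register layout, so (as the code below
  // does) the v2s16 pieces are padded out to cover <6 x s16>, concatenated,
  // and unmerged into the real destination plus an unused <3 x s16> half.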
4455 const LLT V3S16 = LLT::vector(3, 16); 4456 if (Ty == V3S16) { 4457 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4458 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4459 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4460 return true; 4461 } 4462 4463 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4464 B.buildConcatVectors(DstReg, ResultRegs); 4465 return true; 4466 } 4467 4468 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4469 LegalizerHelper &Helper, MachineInstr &MI) const { 4470 MachineIRBuilder &B = Helper.MIRBuilder; 4471 GISelChangeObserver &Observer = Helper.Observer; 4472 4473 Register Dst = MI.getOperand(0).getReg(); 4474 LLT Ty = B.getMRI()->getType(Dst); 4475 unsigned Size = Ty.getSizeInBits(); 4476 MachineFunction &MF = B.getMF(); 4477 4478 Observer.changingInstr(MI); 4479 4480 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4481 Ty = getBitcastRegisterType(Ty); 4482 Helper.bitcastDst(MI, Ty, 0); 4483 Dst = MI.getOperand(0).getReg(); 4484 B.setInsertPt(B.getMBB(), MI); 4485 } 4486 4487 // FIXME: We don't really need this intermediate instruction. The intrinsic 4488 // should be fixed to have a memory operand. Since it's readnone, we're not 4489 // allowed to add one. 4490 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4491 MI.RemoveOperand(1); // Remove intrinsic ID 4492 4493 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4494 // TODO: Should this use datalayout alignment? 4495 const unsigned MemSize = (Size + 7) / 8; 4496 const Align MemAlign(4); 4497 MachineMemOperand *MMO = MF.getMachineMemOperand( 4498 MachinePointerInfo(), 4499 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4500 MachineMemOperand::MOInvariant, 4501 MemSize, MemAlign); 4502 MI.addMemOperand(MF, MMO); 4503 4504 // There are no 96-bit result scalar loads, but widening to 128-bit should 4505 // always be legal. We may need to restore this to a 96-bit result if it turns 4506 // out this needs to be converted to a vector load during RegBankSelect. 4507 if (!isPowerOf2_32(Size)) { 4508 if (Ty.isVector()) 4509 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4510 else 4511 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4512 } 4513 4514 Observer.changedInstr(MI); 4515 return true; 4516 } 4517 4518 // TODO: Move to selection 4519 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4520 MachineRegisterInfo &MRI, 4521 MachineIRBuilder &B) const { 4522 // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction 4523 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4524 !ST.isTrapHandlerEnabled()) { 4525 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4526 } else { 4527 // Pass queue pointer to trap handler as input, and insert trap instruction 4528 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4529 MachineRegisterInfo &MRI = *B.getMRI(); 4530 4531 Register LiveIn = 4532 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4533 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4534 return false; 4535 4536 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4537 B.buildCopy(SGPR01, LiveIn); 4538 B.buildInstr(AMDGPU::S_TRAP) 4539 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4540 .addReg(SGPR01, RegState::Implicit); 4541 } 4542 4543 MI.eraseFromParent(); 4544 return true; 4545 } 4546 4547 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4548 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4549 // Is non-HSA path or trap-handler disabled? then, report a warning 4550 // accordingly 4551 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4552 !ST.isTrapHandlerEnabled()) { 4553 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4554 "debugtrap handler not supported", 4555 MI.getDebugLoc(), DS_Warning); 4556 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4557 Ctx.diagnose(NoTrap); 4558 } else { 4559 // Insert debug-trap instruction 4560 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4561 } 4562 4563 MI.eraseFromParent(); 4564 return true; 4565 } 4566 4567 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4568 MachineIRBuilder &B) const { 4569 MachineRegisterInfo &MRI = *B.getMRI(); 4570 const LLT S16 = LLT::scalar(16); 4571 const LLT S32 = LLT::scalar(32); 4572 4573 Register DstReg = MI.getOperand(0).getReg(); 4574 Register NodePtr = MI.getOperand(2).getReg(); 4575 Register RayExtent = MI.getOperand(3).getReg(); 4576 Register RayOrigin = MI.getOperand(4).getReg(); 4577 Register RayDir = MI.getOperand(5).getReg(); 4578 Register RayInvDir = MI.getOperand(6).getReg(); 4579 Register TDescr = MI.getOperand(7).getReg(); 4580 4581 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4582 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4583 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4584 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4585 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4586 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4587 4588 SmallVector<Register, 12> Ops; 4589 if (Is64) { 4590 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4591 Ops.push_back(Unmerge.getReg(0)); 4592 Ops.push_back(Unmerge.getReg(1)); 4593 } else { 4594 Ops.push_back(NodePtr); 4595 } 4596 Ops.push_back(RayExtent); 4597 4598 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4599 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4600 Ops.push_back(Unmerge.getReg(0)); 4601 Ops.push_back(Unmerge.getReg(1)); 4602 Ops.push_back(Unmerge.getReg(2)); 4603 }; 4604 4605 packLanes(RayOrigin); 4606 if (IsA16) { 4607 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4608 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4609 Register R1 = MRI.createGenericVirtualRegister(S32); 4610 Register R2 = MRI.createGenericVirtualRegister(S32); 4611 Register R3 = MRI.createGenericVirtualRegister(S32); 4612 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4613 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4614 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4615 Ops.push_back(R1); 4616 Ops.push_back(R2); 4617 Ops.push_back(R3); 4618 } else { 4619 packLanes(RayDir); 4620 packLanes(RayInvDir); 4621 } 4622 4623 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4624 .addDef(DstReg) 4625 .addImm(Opcode); 4626 4627 for (Register R : Ops) { 4628 MIB.addUse(R); 4629 } 4630 4631 MIB.addUse(TDescr) 4632 .addImm(IsA16 ? 1 : 0) 4633 .cloneMemRefs(MI); 4634 4635 MI.eraseFromParent(); 4636 return true; 4637 } 4638 4639 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4640 MachineInstr &MI) const { 4641 MachineIRBuilder &B = Helper.MIRBuilder; 4642 MachineRegisterInfo &MRI = *B.getMRI(); 4643 4644 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4645 auto IntrID = MI.getIntrinsicID(); 4646 switch (IntrID) { 4647 case Intrinsic::amdgcn_if: 4648 case Intrinsic::amdgcn_else: { 4649 MachineInstr *Br = nullptr; 4650 MachineBasicBlock *UncondBrTarget = nullptr; 4651 bool Negated = false; 4652 if (MachineInstr *BrCond = 4653 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4654 const SIRegisterInfo *TRI 4655 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4656 4657 Register Def = MI.getOperand(1).getReg(); 4658 Register Use = MI.getOperand(3).getReg(); 4659 4660 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4661 4662 if (Negated) 4663 std::swap(CondBrTarget, UncondBrTarget); 4664 4665 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4666 if (IntrID == Intrinsic::amdgcn_if) { 4667 B.buildInstr(AMDGPU::SI_IF) 4668 .addDef(Def) 4669 .addUse(Use) 4670 .addMBB(UncondBrTarget); 4671 } else { 4672 B.buildInstr(AMDGPU::SI_ELSE) 4673 .addDef(Def) 4674 .addUse(Use) 4675 .addMBB(UncondBrTarget); 4676 } 4677 4678 if (Br) { 4679 Br->getOperand(0).setMBB(CondBrTarget); 4680 } else { 4681 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4682 // since we're swapping branch targets it needs to be reinserted. 
4683 // FIXME: IRTranslator should probably not do this 4684 B.buildBr(*CondBrTarget); 4685 } 4686 4687 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4688 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4689 MI.eraseFromParent(); 4690 BrCond->eraseFromParent(); 4691 return true; 4692 } 4693 4694 return false; 4695 } 4696 case Intrinsic::amdgcn_loop: { 4697 MachineInstr *Br = nullptr; 4698 MachineBasicBlock *UncondBrTarget = nullptr; 4699 bool Negated = false; 4700 if (MachineInstr *BrCond = 4701 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4702 const SIRegisterInfo *TRI 4703 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4704 4705 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4706 Register Reg = MI.getOperand(2).getReg(); 4707 4708 if (Negated) 4709 std::swap(CondBrTarget, UncondBrTarget); 4710 4711 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4712 B.buildInstr(AMDGPU::SI_LOOP) 4713 .addUse(Reg) 4714 .addMBB(UncondBrTarget); 4715 4716 if (Br) 4717 Br->getOperand(0).setMBB(CondBrTarget); 4718 else 4719 B.buildBr(*CondBrTarget); 4720 4721 MI.eraseFromParent(); 4722 BrCond->eraseFromParent(); 4723 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4724 return true; 4725 } 4726 4727 return false; 4728 } 4729 case Intrinsic::amdgcn_kernarg_segment_ptr: 4730 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4731 // This only makes sense to call in a kernel, so just lower to null. 4732 B.buildConstant(MI.getOperand(0).getReg(), 0); 4733 MI.eraseFromParent(); 4734 return true; 4735 } 4736 4737 return legalizePreloadedArgIntrin( 4738 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4739 case Intrinsic::amdgcn_implicitarg_ptr: 4740 return legalizeImplicitArgPtr(MI, MRI, B); 4741 case Intrinsic::amdgcn_workitem_id_x: 4742 return legalizePreloadedArgIntrin(MI, MRI, B, 4743 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4744 case Intrinsic::amdgcn_workitem_id_y: 4745 return legalizePreloadedArgIntrin(MI, MRI, B, 4746 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4747 case Intrinsic::amdgcn_workitem_id_z: 4748 return legalizePreloadedArgIntrin(MI, MRI, B, 4749 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4750 case Intrinsic::amdgcn_workgroup_id_x: 4751 return legalizePreloadedArgIntrin(MI, MRI, B, 4752 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4753 case Intrinsic::amdgcn_workgroup_id_y: 4754 return legalizePreloadedArgIntrin(MI, MRI, B, 4755 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4756 case Intrinsic::amdgcn_workgroup_id_z: 4757 return legalizePreloadedArgIntrin(MI, MRI, B, 4758 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4759 case Intrinsic::amdgcn_dispatch_ptr: 4760 return legalizePreloadedArgIntrin(MI, MRI, B, 4761 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4762 case Intrinsic::amdgcn_queue_ptr: 4763 return legalizePreloadedArgIntrin(MI, MRI, B, 4764 AMDGPUFunctionArgInfo::QUEUE_PTR); 4765 case Intrinsic::amdgcn_implicit_buffer_ptr: 4766 return legalizePreloadedArgIntrin( 4767 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4768 case Intrinsic::amdgcn_dispatch_id: 4769 return legalizePreloadedArgIntrin(MI, MRI, B, 4770 AMDGPUFunctionArgInfo::DISPATCH_ID); 4771 case Intrinsic::amdgcn_fdiv_fast: 4772 return legalizeFDIVFastIntrin(MI, MRI, B); 4773 case Intrinsic::amdgcn_is_shared: 4774 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4775 case Intrinsic::amdgcn_is_private: 4776 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4777 case Intrinsic::amdgcn_wavefrontsize: { 4778 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4779 MI.eraseFromParent(); 4780 return true; 4781 } 4782 case Intrinsic::amdgcn_s_buffer_load: 4783 return legalizeSBufferLoad(Helper, MI); 4784 case Intrinsic::amdgcn_raw_buffer_store: 4785 case Intrinsic::amdgcn_struct_buffer_store: 4786 return legalizeBufferStore(MI, MRI, B, false, false); 4787 case Intrinsic::amdgcn_raw_buffer_store_format: 4788 case Intrinsic::amdgcn_struct_buffer_store_format: 4789 return legalizeBufferStore(MI, MRI, B, false, true); 4790 case Intrinsic::amdgcn_raw_tbuffer_store: 4791 case Intrinsic::amdgcn_struct_tbuffer_store: 4792 return legalizeBufferStore(MI, MRI, B, true, true); 4793 case Intrinsic::amdgcn_raw_buffer_load: 4794 case Intrinsic::amdgcn_struct_buffer_load: 4795 return legalizeBufferLoad(MI, MRI, B, false, false); 4796 case Intrinsic::amdgcn_raw_buffer_load_format: 4797 case Intrinsic::amdgcn_struct_buffer_load_format: 4798 return legalizeBufferLoad(MI, MRI, B, true, false); 4799 case Intrinsic::amdgcn_raw_tbuffer_load: 4800 case Intrinsic::amdgcn_struct_tbuffer_load: 4801 return legalizeBufferLoad(MI, MRI, B, true, true); 4802 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4803 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4804 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4805 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4806 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4807 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4808 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4809 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4810 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4811 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4812 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4813 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4814 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4815 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4816 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4817 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4818 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4819 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4820 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4821 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4822 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4823 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4824 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4825 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4826 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4827 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4828 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4829 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4830 case Intrinsic::amdgcn_buffer_atomic_fadd: 4831 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4832 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4833 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4834 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4835 return legalizeBufferAtomic(MI, B, IntrID); 4836 case Intrinsic::amdgcn_atomic_inc: 4837 return legalizeAtomicIncDec(MI, B, true); 4838 case Intrinsic::amdgcn_atomic_dec: 4839 return legalizeAtomicIncDec(MI, B, false); 4840 case Intrinsic::trap: 4841 return legalizeTrapIntrinsic(MI, MRI, B); 4842 case Intrinsic::debugtrap: 4843 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4844 case Intrinsic::amdgcn_rsq_clamp: 4845 return legalizeRsqClampIntrinsic(MI, MRI, B); 4846 case Intrinsic::amdgcn_ds_fadd: 4847 case Intrinsic::amdgcn_ds_fmin: 4848 case Intrinsic::amdgcn_ds_fmax: 4849 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4850 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4851 return legalizeBVHIntrinsic(MI, B); 4852 default: { 4853 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4854 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4855 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4856 return true; 4857 } 4858 } 4859 4860 return true; 4861 } 4862