//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
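/// For example, <5 x s16> (80 bits, an odd element count) is widened to
/// <6 x s16>, while <3 x s32> is left alone since its element size is
/// already 32 bits and its total size is a multiple of 32.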
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
172 } 173 174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 178 }; 179 } 180 181 static bool isRegisterSize(unsigned Size) { 182 return Size % 32 == 0 && Size <= MaxRegisterSize; 183 } 184 185 static bool isRegisterVectorElementType(LLT EltTy) { 186 const int EltSize = EltTy.getSizeInBits(); 187 return EltSize == 16 || EltSize % 32 == 0; 188 } 189 190 static bool isRegisterVectorType(LLT Ty) { 191 const int EltSize = Ty.getElementType().getSizeInBits(); 192 return EltSize == 32 || EltSize == 64 || 193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 194 EltSize == 128 || EltSize == 256; 195 } 196 197 static bool isRegisterType(LLT Ty) { 198 if (!isRegisterSize(Ty.getSizeInBits())) 199 return false; 200 201 if (Ty.isVector()) 202 return isRegisterVectorType(Ty); 203 204 return true; 205 } 206 207 // Any combination of 32 or 64-bit elements up the maximum register size, and 208 // multiples of v2s16. 209 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 210 return [=](const LegalityQuery &Query) { 211 return isRegisterType(Query.Types[TypeIdx]); 212 }; 213 } 214 215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 216 return [=](const LegalityQuery &Query) { 217 const LLT QueryTy = Query.Types[TypeIdx]; 218 if (!QueryTy.isVector()) 219 return false; 220 const LLT EltTy = QueryTy.getElementType(); 221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 222 }; 223 } 224 225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 226 return [=](const LegalityQuery &Query) { 227 const LLT Ty = Query.Types[TypeIdx]; 228 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 230 }; 231 } 232 233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 234 // handle some operations by just promoting the register during 235 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 237 bool IsLoad) { 238 switch (AS) { 239 case AMDGPUAS::PRIVATE_ADDRESS: 240 // FIXME: Private element size. 241 return ST.enableFlatScratch() ? 128 : 32; 242 case AMDGPUAS::LOCAL_ADDRESS: 243 return ST.useDS128() ? 128 : 64; 244 case AMDGPUAS::GLOBAL_ADDRESS: 245 case AMDGPUAS::CONSTANT_ADDRESS: 246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 247 // Treat constant and global as identical. SMRD loads are sometimes usable for 248 // global loads (ideally constant address space should be eliminated) 249 // depending on the context. Legality cannot be context dependent, but 250 // RegBankSelect can split the load as necessary depending on the pointer 251 // register bank/uniformity and if the memory is invariant or not written in a 252 // kernel. 253 return IsLoad ? 512 : 128; 254 default: 255 // Flat addresses may contextually need to be split to 32-bit parts if they 256 // may alias scratch depending on the subtarget. 
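    // In practice this caps a single access at 128 bits (one dwordx4).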
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case it is the memory access
/// itself that changes, not the size of the result register.
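/// For example, a 24-bit (<3 x s8>) load with 32-bit alignment may be done as
/// a single dword load, since the padding bytes are known dereferenceable up
/// to the alignment.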
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 371 unsigned AlignInBits, unsigned AddrSpace, 372 unsigned Opcode) { 373 // We don't want to widen cases that are naturally legal. 374 if (isPowerOf2_32(SizeInBits)) 375 return false; 376 377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 378 // end up widening these for a scalar load during RegBankSelect, since there 379 // aren't 96-bit scalar loads. 380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 381 return false; 382 383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 384 return false; 385 386 // A load is known dereferenceable up to the alignment, so it's legal to widen 387 // to it. 388 // 389 // TODO: Could check dereferenceable for less aligned cases. 390 unsigned RoundedSize = NextPowerOf2(SizeInBits); 391 if (AlignInBits < RoundedSize) 392 return false; 393 394 // Do not widen if it would introduce a slow unaligned load. 395 const SITargetLowering *TLI = ST.getTargetLowering(); 396 bool Fast = false; 397 return TLI->allowsMisalignedMemoryAccessesImpl( 398 RoundedSize, AddrSpace, Align(AlignInBits / 8), 399 MachineMemOperand::MOLoad, &Fast) && 400 Fast; 401 } 402 403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 404 unsigned Opcode) { 405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 406 return false; 407 408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 409 Query.MMODescrs[0].AlignInBits, 410 Query.Types[1].getAddressSpace(), Opcode); 411 } 412 413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 414 const GCNTargetMachine &TM) 415 : ST(ST_) { 416 using namespace TargetOpcode; 417 418 auto GetAddrSpacePtr = [&TM](unsigned AS) { 419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 420 }; 421 422 const LLT S1 = LLT::scalar(1); 423 const LLT S8 = LLT::scalar(8); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S8 = LLT::vector(2, 8); 433 const LLT V2S16 = LLT::vector(2, 16); 434 const LLT V4S16 = LLT::vector(4, 16); 435 436 const LLT V2S32 = LLT::vector(2, 32); 437 const LLT V3S32 = LLT::vector(3, 32); 438 const LLT V4S32 = LLT::vector(4, 32); 439 const LLT V5S32 = LLT::vector(5, 32); 440 const LLT V6S32 = LLT::vector(6, 32); 441 const LLT V7S32 = LLT::vector(7, 32); 442 const LLT V8S32 = LLT::vector(8, 32); 443 const LLT V9S32 = LLT::vector(9, 32); 444 const LLT V10S32 = LLT::vector(10, 32); 445 const LLT V11S32 = LLT::vector(11, 32); 446 const LLT V12S32 = LLT::vector(12, 32); 447 const LLT V13S32 = LLT::vector(13, 32); 448 const LLT V14S32 = LLT::vector(14, 32); 449 const LLT V15S32 = LLT::vector(15, 32); 450 const LLT V16S32 = LLT::vector(16, 32); 451 const LLT V32S32 = LLT::vector(32, 32); 452 453 const LLT V2S64 = LLT::vector(2, 64); 454 const LLT V3S64 = LLT::vector(3, 64); 455 const LLT V4S64 = LLT::vector(4, 64); 456 const LLT V5S64 = LLT::vector(5, 64); 457 const LLT V6S64 = LLT::vector(6, 64); 458 const LLT V7S64 = LLT::vector(7, 64); 459 const LLT V8S64 = LLT::vector(8, 64); 460 const LLT V16S64 = LLT::vector(16, 64); 461 462 std::initializer_list<LLT> AllS32Vectors = 463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 465 
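  // As with AllS32Vectors above, these whole-register vector types are used
  // below to mark operations such as G_PHI and G_BUILD_VECTOR legal.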
std::initializer_list<LLT> AllS64Vectors = 466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 467 468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 475 476 const LLT CodePtr = FlatPtr; 477 478 const std::initializer_list<LLT> AddrSpaces64 = { 479 GlobalPtr, ConstantPtr, FlatPtr 480 }; 481 482 const std::initializer_list<LLT> AddrSpaces32 = { 483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 484 }; 485 486 const std::initializer_list<LLT> FPTypesBase = { 487 S32, S64 488 }; 489 490 const std::initializer_list<LLT> FPTypes16 = { 491 S32, S64, S16 492 }; 493 494 const std::initializer_list<LLT> FPTypesPK16 = { 495 S32, S64, S16, V2S16 496 }; 497 498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 499 500 setAction({G_BRCOND, S1}, Legal); // VCC branches 501 setAction({G_BRCOND, S32}, Legal); // SCC branches 502 503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 504 // elements for v3s16 505 getActionDefinitionsBuilder(G_PHI) 506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 507 .legalFor(AllS32Vectors) 508 .legalFor(AllS64Vectors) 509 .legalFor(AddrSpaces64) 510 .legalFor(AddrSpaces32) 511 .legalIf(isPointer(0)) 512 .clampScalar(0, S16, S256) 513 .widenScalarToNextPow2(0, 32) 514 .clampMaxNumElements(0, S32, 16) 515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 516 .scalarize(0); 517 518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 519 // Full set of gfx9 features. 520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 521 .legalFor({S32, S16, V2S16}) 522 .clampScalar(0, S16, S32) 523 .clampMaxNumElements(0, S16, 2) 524 .scalarize(0) 525 .widenScalarToNextPow2(0, 32); 526 527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 528 .legalFor({S32, S16, V2S16}) // Clamp modifier 529 .minScalarOrElt(0, S16) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32) 533 .lower(); 534 } else if (ST.has16BitInsts()) { 535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 536 .legalFor({S32, S16}) 537 .clampScalar(0, S16, S32) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 540 541 // Technically the saturating operations require clamp bit support, but this 542 // was introduced at the same time as 16-bit operations. 543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 544 .legalFor({S32, S16}) // Clamp modifier 545 .minScalar(0, S16) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 16) 548 .lower(); 549 550 // We're just lowering this, but it helps get a better result to try to 551 // coerce to the desired type first. 552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 553 .minScalar(0, S16) 554 .scalarize(0) 555 .lower(); 556 } else { 557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.hasIntClamp()) { 563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 564 .legalFor({S32}) // Clamp modifier. 
565 .scalarize(0) 566 .minScalarOrElt(0, S32) 567 .lower(); 568 } else { 569 // Clamp bit support was added in VI, along with 16-bit operations. 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .minScalar(0, S32) 572 .scalarize(0) 573 .lower(); 574 } 575 576 // FIXME: DAG expansion gets better results. The widening uses the smaller 577 // range values and goes for the min/max lowering directly. 578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 585 .customFor({S32, S64}) 586 .clampScalar(0, S32, S64) 587 .widenScalarToNextPow2(0, 32) 588 .scalarize(0); 589 590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 591 .legalFor({S32}) 592 .maxScalarOrElt(0, S32); 593 594 if (ST.hasVOP3PInsts()) { 595 Mulh 596 .clampMaxNumElements(0, S8, 2) 597 .lowerFor({V2S8}); 598 } 599 600 Mulh 601 .scalarize(0) 602 .lower(); 603 604 // Report legal for any types we can handle anywhere. For the cases only legal 605 // on the SALU, RegBankSelect will be able to re-legalize. 606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 608 .clampScalar(0, S32, S64) 609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 611 .widenScalarToNextPow2(0) 612 .scalarize(0); 613 614 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 616 .legalFor({{S32, S1}, {S32, S32}}) 617 .minScalar(0, S32) 618 // TODO: .scalarize(0) 619 .lower(); 620 621 getActionDefinitionsBuilder(G_BITCAST) 622 // Don't worry about the size constraint. 623 .legalIf(all(isRegisterType(0), isRegisterType(1))) 624 .lower(); 625 626 627 getActionDefinitionsBuilder(G_CONSTANT) 628 .legalFor({S1, S32, S64, S16, GlobalPtr, 629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 630 .legalIf(isPointer(0)) 631 .clampScalar(0, S32, S64) 632 .widenScalarToNextPow2(0); 633 634 getActionDefinitionsBuilder(G_FCONSTANT) 635 .legalFor({S32, S64, S16}) 636 .clampScalar(0, S16, S64); 637 638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 639 .legalIf(isRegisterType(0)) 640 // s1 and s16 are special cases because they have legal operations on 641 // them, but don't really occupy registers in the normal way. 642 .legalFor({S1, S16}) 643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 644 .clampScalarOrElt(0, S32, MaxScalar) 645 .widenScalarToNextPow2(0, 32) 646 .clampMaxNumElements(0, S32, 16); 647 648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 649 650 // If the amount is divergent, we have to do a wave reduction to get the 651 // maximum value, so this is expanded during RegBankSelect. 
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 653 .legalFor({{PrivatePtr, S32}}); 654 655 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 656 .customIf(typeIsNot(0, PrivatePtr)); 657 658 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 659 660 auto &FPOpActions = getActionDefinitionsBuilder( 661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 662 .legalFor({S32, S64}); 663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 664 .customFor({S32, S64}); 665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 666 .customFor({S32, S64}); 667 668 if (ST.has16BitInsts()) { 669 if (ST.hasVOP3PInsts()) 670 FPOpActions.legalFor({S16, V2S16}); 671 else 672 FPOpActions.legalFor({S16}); 673 674 TrigActions.customFor({S16}); 675 FDIVActions.customFor({S16}); 676 } 677 678 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 680 681 if (ST.hasVOP3PInsts()) { 682 MinNumMaxNum.customFor(FPTypesPK16) 683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 684 .clampMaxNumElements(0, S16, 2) 685 .clampScalar(0, S16, S64) 686 .scalarize(0); 687 } else if (ST.has16BitInsts()) { 688 MinNumMaxNum.customFor(FPTypes16) 689 .clampScalar(0, S16, S64) 690 .scalarize(0); 691 } else { 692 MinNumMaxNum.customFor(FPTypesBase) 693 .clampScalar(0, S32, S64) 694 .scalarize(0); 695 } 696 697 if (ST.hasVOP3PInsts()) 698 FPOpActions.clampMaxNumElements(0, S16, 2); 699 700 FPOpActions 701 .scalarize(0) 702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 703 704 TrigActions 705 .scalarize(0) 706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 707 708 FDIVActions 709 .scalarize(0) 710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 711 712 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 713 .legalFor(FPTypesPK16) 714 .clampMaxNumElements(0, S16, 2) 715 .scalarize(0) 716 .clampScalar(0, S16, S64); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 720 .legalFor({S32, S64, S16}) 721 .scalarize(0) 722 .clampScalar(0, S16, S64); 723 } else { 724 getActionDefinitionsBuilder(G_FSQRT) 725 .legalFor({S32, S64}) 726 .scalarize(0) 727 .clampScalar(0, S32, S64); 728 729 if (ST.hasFractBug()) { 730 getActionDefinitionsBuilder(G_FFLOOR) 731 .customFor({S64}) 732 .legalFor({S32, S64}) 733 .scalarize(0) 734 .clampScalar(0, S32, S64); 735 } else { 736 getActionDefinitionsBuilder(G_FFLOOR) 737 .legalFor({S32, S64}) 738 .scalarize(0) 739 .clampScalar(0, S32, S64); 740 } 741 } 742 743 getActionDefinitionsBuilder(G_FPTRUNC) 744 .legalFor({{S32, S64}, {S16, S32}}) 745 .scalarize(0) 746 .lower(); 747 748 getActionDefinitionsBuilder(G_FPEXT) 749 .legalFor({{S64, S32}, {S32, S16}}) 750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 751 .scalarize(0); 752 753 getActionDefinitionsBuilder(G_FSUB) 754 // Use actual fsub instruction 755 .legalFor({S32}) 756 // Must use fadd + fneg 757 .lowerFor({S64, S16, V2S16}) 758 .scalarize(0) 759 .clampScalar(0, S32, S64); 760 761 // Whether this is legal depends on the floating point mode for the function. 
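  // The custom action is expected to keep G_FMAD only when denormals are
  // flushed for the type, and to expand it into separate multiply and add
  // operations otherwise.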
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 763 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 764 FMad.customFor({S32, S16}); 765 else if (ST.hasMadMacF32Insts()) 766 FMad.customFor({S32}); 767 else if (ST.hasMadF16()) 768 FMad.customFor({S16}); 769 FMad.scalarize(0) 770 .lower(); 771 772 auto &FRem = getActionDefinitionsBuilder(G_FREM); 773 if (ST.has16BitInsts()) { 774 FRem.customFor({S16, S32, S64}); 775 } else { 776 FRem.minScalar(0, S32) 777 .customFor({S32, S64}); 778 } 779 FRem.scalarize(0); 780 781 // TODO: Do we need to clamp maximum bitwidth? 782 getActionDefinitionsBuilder(G_TRUNC) 783 .legalIf(isScalar(0)) 784 .legalFor({{V2S16, V2S32}}) 785 .clampMaxNumElements(0, S16, 2) 786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 787 // situations (like an invalid implicit use), we don't want to infinite loop 788 // in the legalizer. 789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 790 .alwaysLegal(); 791 792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 794 {S32, S1}, {S64, S1}, {S16, S1}}) 795 .scalarize(0) 796 .clampScalar(0, S32, S64) 797 .widenScalarToNextPow2(1, 32); 798 799 // TODO: Split s1->s64 during regbankselect for VALU. 800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 802 .lowerFor({{S32, S64}}) 803 .lowerIf(typeIs(1, S1)) 804 .customFor({{S64, S64}}); 805 if (ST.has16BitInsts()) 806 IToFP.legalFor({{S16, S16}}); 807 IToFP.clampScalar(1, S32, S64) 808 .minScalar(0, S32) 809 .scalarize(0) 810 .widenScalarToNextPow2(1); 811 812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 814 .customFor({{S64, S64}}) 815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 816 if (ST.has16BitInsts()) 817 FPToI.legalFor({{S16, S16}}); 818 else 819 FPToI.minScalar(1, S32); 820 821 FPToI.minScalar(0, S32) 822 .scalarize(0) 823 .lower(); 824 825 // Lower roundeven into G_FRINT 826 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 827 .scalarize(0) 828 .lower(); 829 830 if (ST.has16BitInsts()) { 831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 832 .legalFor({S16, S32, S64}) 833 .clampScalar(0, S16, S64) 834 .scalarize(0); 835 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 836 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 837 .legalFor({S32, S64}) 838 .clampScalar(0, S32, S64) 839 .scalarize(0); 840 } else { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S32}) 843 .customFor({S64}) 844 .clampScalar(0, S32, S64) 845 .scalarize(0); 846 } 847 848 getActionDefinitionsBuilder(G_PTR_ADD) 849 .legalIf(all(isPointer(0), sameSize(0, 1))) 850 .scalarize(0) 851 .scalarSameSizeAs(1, 0); 852 853 getActionDefinitionsBuilder(G_PTRMASK) 854 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 855 .scalarSameSizeAs(1, 0) 856 .scalarize(0); 857 858 auto &CmpBuilder = 859 getActionDefinitionsBuilder(G_ICMP) 860 // The compare output type differs based on the register bank of the output, 861 // so make both s1 and s32 legal. 862 // 863 // Scalar compares producing output in scc will be promoted to s32, as that 864 // is the allocatable register type that will be needed for the copy from 865 // scc. This will be promoted during RegBankSelect, and we assume something 866 // before that won't try to use s32 result types. 
867 // 868 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 869 // bank. 870 .legalForCartesianProduct( 871 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 872 .legalForCartesianProduct( 873 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 874 if (ST.has16BitInsts()) { 875 CmpBuilder.legalFor({{S1, S16}}); 876 } 877 878 CmpBuilder 879 .widenScalarToNextPow2(1) 880 .clampScalar(1, S32, S64) 881 .scalarize(0) 882 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 883 884 getActionDefinitionsBuilder(G_FCMP) 885 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 886 .widenScalarToNextPow2(1) 887 .clampScalar(1, S32, S64) 888 .scalarize(0); 889 890 // FIXME: fpow has a selection pattern that should move to custom lowering. 891 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 892 if (ST.has16BitInsts()) 893 Exp2Ops.legalFor({S32, S16}); 894 else 895 Exp2Ops.legalFor({S32}); 896 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 897 Exp2Ops.scalarize(0); 898 899 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 900 if (ST.has16BitInsts()) 901 ExpOps.customFor({{S32}, {S16}}); 902 else 903 ExpOps.customFor({S32}); 904 ExpOps.clampScalar(0, MinScalarFPTy, S32) 905 .scalarize(0); 906 907 getActionDefinitionsBuilder(G_FPOWI) 908 .clampScalar(0, MinScalarFPTy, S32) 909 .lower(); 910 911 // The 64-bit versions produce 32-bit results, but only on the SALU. 912 getActionDefinitionsBuilder(G_CTPOP) 913 .legalFor({{S32, S32}, {S32, S64}}) 914 .clampScalar(0, S32, S32) 915 .clampScalar(1, S32, S64) 916 .scalarize(0) 917 .widenScalarToNextPow2(0, 32) 918 .widenScalarToNextPow2(1, 32); 919 920 // The hardware instructions return a different result on 0 than the generic 921 // instructions expect. The hardware produces -1, but these produce the 922 // bitwidth. 923 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 924 .scalarize(0) 925 .clampScalar(0, S32, S32) 926 .clampScalar(1, S32, S64) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32) 929 .lower(); 930 931 // The 64-bit versions produce 32-bit results, but only on the SALU. 932 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 933 .legalFor({{S32, S32}, {S32, S64}}) 934 .clampScalar(0, S32, S32) 935 .clampScalar(1, S32, S64) 936 .scalarize(0) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32); 939 940 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 941 // RegBankSelect. 942 getActionDefinitionsBuilder(G_BITREVERSE) 943 .legalFor({S32, S64}) 944 .clampScalar(0, S32, S64) 945 .scalarize(0) 946 .widenScalarToNextPow2(0); 947 948 if (ST.has16BitInsts()) { 949 getActionDefinitionsBuilder(G_BSWAP) 950 .legalFor({S16, S32, V2S16}) 951 .clampMaxNumElements(0, S16, 2) 952 // FIXME: Fixing non-power-of-2 before clamp is workaround for 953 // narrowScalar limitation. 
954 .widenScalarToNextPow2(0) 955 .clampScalar(0, S16, S32) 956 .scalarize(0); 957 958 if (ST.hasVOP3PInsts()) { 959 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 960 .legalFor({S32, S16, V2S16}) 961 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 962 .clampMaxNumElements(0, S16, 2) 963 .minScalar(0, S16) 964 .widenScalarToNextPow2(0) 965 .scalarize(0) 966 .lower(); 967 } else { 968 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 969 .legalFor({S32, S16}) 970 .widenScalarToNextPow2(0) 971 .minScalar(0, S16) 972 .scalarize(0) 973 .lower(); 974 } 975 } else { 976 // TODO: Should have same legality without v_perm_b32 977 getActionDefinitionsBuilder(G_BSWAP) 978 .legalFor({S32}) 979 .lowerIf(scalarNarrowerThan(0, 32)) 980 // FIXME: Fixing non-power-of-2 before clamp is workaround for 981 // narrowScalar limitation. 982 .widenScalarToNextPow2(0) 983 .maxScalar(0, S32) 984 .scalarize(0) 985 .lower(); 986 987 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 988 .legalFor({S32}) 989 .minScalar(0, S32) 990 .widenScalarToNextPow2(0) 991 .scalarize(0) 992 .lower(); 993 } 994 995 getActionDefinitionsBuilder(G_INTTOPTR) 996 // List the common cases 997 .legalForCartesianProduct(AddrSpaces64, {S64}) 998 .legalForCartesianProduct(AddrSpaces32, {S32}) 999 .scalarize(0) 1000 // Accept any address space as long as the size matches 1001 .legalIf(sameSize(0, 1)) 1002 .widenScalarIf(smallerThan(1, 0), 1003 [](const LegalityQuery &Query) { 1004 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1005 }) 1006 .narrowScalarIf(largerThan(1, 0), 1007 [](const LegalityQuery &Query) { 1008 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1009 }); 1010 1011 getActionDefinitionsBuilder(G_PTRTOINT) 1012 // List the common cases 1013 .legalForCartesianProduct(AddrSpaces64, {S64}) 1014 .legalForCartesianProduct(AddrSpaces32, {S32}) 1015 .scalarize(0) 1016 // Accept any address space as long as the size matches 1017 .legalIf(sameSize(0, 1)) 1018 .widenScalarIf(smallerThan(0, 1), 1019 [](const LegalityQuery &Query) { 1020 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1021 }) 1022 .narrowScalarIf( 1023 largerThan(0, 1), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1029 .scalarize(0) 1030 .custom(); 1031 1032 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1033 bool IsLoad) -> bool { 1034 const LLT DstTy = Query.Types[0]; 1035 1036 // Split vector extloads. 1037 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1038 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1039 1040 if (MemSize < DstTy.getSizeInBits()) 1041 MemSize = std::max(MemSize, AlignBits); 1042 1043 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1044 return true; 1045 1046 const LLT PtrTy = Query.Types[1]; 1047 unsigned AS = PtrTy.getAddressSpace(); 1048 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1049 return true; 1050 1051 // Catch weird sized loads that don't evenly divide into the access sizes 1052 // TODO: May be able to widen depending on alignment etc. 1053 unsigned NumRegs = (MemSize + 31) / 32; 1054 if (NumRegs == 3) { 1055 if (!ST.hasDwordx3LoadStores()) 1056 return true; 1057 } else { 1058 // If the alignment allows, these should have been widened. 
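      // Any remaining non-power-of-2 dword count (e.g. 5, 6 or 7 dwords)
      // still has to be split.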
1059 if (!isPowerOf2_32(NumRegs)) 1060 return true; 1061 } 1062 1063 if (AlignBits < MemSize) { 1064 const SITargetLowering *TLI = ST.getTargetLowering(); 1065 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1066 Align(AlignBits / 8)); 1067 } 1068 1069 return false; 1070 }; 1071 1072 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1073 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1074 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1075 1076 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1077 // LDS 1078 // TODO: Unsupported flat for SI. 1079 1080 for (unsigned Op : {G_LOAD, G_STORE}) { 1081 const bool IsStore = Op == G_STORE; 1082 1083 auto &Actions = getActionDefinitionsBuilder(Op); 1084 // Explicitly list some common cases. 1085 // TODO: Does this help compile time at all? 1086 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1087 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1088 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1089 {S64, GlobalPtr, 64, GlobalAlign32}, 1090 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1091 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1092 {S32, GlobalPtr, 8, GlobalAlign8}, 1093 {S32, GlobalPtr, 16, GlobalAlign16}, 1094 1095 {S32, LocalPtr, 32, 32}, 1096 {S64, LocalPtr, 64, 32}, 1097 {V2S32, LocalPtr, 64, 32}, 1098 {S32, LocalPtr, 8, 8}, 1099 {S32, LocalPtr, 16, 16}, 1100 {V2S16, LocalPtr, 32, 32}, 1101 1102 {S32, PrivatePtr, 32, 32}, 1103 {S32, PrivatePtr, 8, 8}, 1104 {S32, PrivatePtr, 16, 16}, 1105 {V2S16, PrivatePtr, 32, 32}, 1106 1107 {S32, ConstantPtr, 32, GlobalAlign32}, 1108 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1109 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1110 {S64, ConstantPtr, 64, GlobalAlign32}, 1111 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1112 Actions.legalIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return isLoadStoreLegal(ST, Query, Op); 1115 }); 1116 1117 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1118 // 64-bits. 1119 // 1120 // TODO: Should generalize bitcast action into coerce, which will also cover 1121 // inserting addrspacecasts. 1122 Actions.customIf(typeIs(1, Constant32Ptr)); 1123 1124 // Turn any illegal element vectors into something easier to deal 1125 // with. These will ultimately produce 32-bit scalar shifts to extract the 1126 // parts anyway. 1127 // 1128 // For odd 16-bit element vectors, prefer to split those into pieces with 1129 // 16-bit vector parts. 1130 Actions.bitcastIf( 1131 [=](const LegalityQuery &Query) -> bool { 1132 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1133 Query.MMODescrs[0].SizeInBits); 1134 }, bitcastToRegisterType(0)); 1135 1136 if (!IsStore) { 1137 // Widen suitably aligned loads by loading extra bytes. The standard 1138 // legalization actions can't properly express widening memory operands. 
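      // For example, an s96 global load with 128-bit alignment can become a
      // single s128 load on subtargets without 96-bit (dwordx3) memory
      // operations; with lower alignment it is split instead.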
1139 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1140 return shouldWidenLoad(ST, Query, G_LOAD); 1141 }); 1142 } 1143 1144 // FIXME: load/store narrowing should be moved to lower action 1145 Actions 1146 .narrowScalarIf( 1147 [=](const LegalityQuery &Query) -> bool { 1148 return !Query.Types[0].isVector() && 1149 needToSplitMemOp(Query, Op == G_LOAD); 1150 }, 1151 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1152 const LLT DstTy = Query.Types[0]; 1153 const LLT PtrTy = Query.Types[1]; 1154 1155 const unsigned DstSize = DstTy.getSizeInBits(); 1156 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1157 1158 // Split extloads. 1159 if (DstSize > MemSize) 1160 return std::make_pair(0, LLT::scalar(MemSize)); 1161 1162 if (!isPowerOf2_32(DstSize)) { 1163 // We're probably decomposing an odd sized store. Try to split 1164 // to the widest type. TODO: Account for alignment. As-is it 1165 // should be OK, since the new parts will be further legalized. 1166 unsigned FloorSize = PowerOf2Floor(DstSize); 1167 return std::make_pair(0, LLT::scalar(FloorSize)); 1168 } 1169 1170 if (DstSize > 32 && (DstSize % 32 != 0)) { 1171 // FIXME: Need a way to specify non-extload of larger size if 1172 // suitably aligned. 1173 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1174 } 1175 1176 unsigned MaxSize = maxSizeForAddrSpace(ST, 1177 PtrTy.getAddressSpace(), 1178 Op == G_LOAD); 1179 if (MemSize > MaxSize) 1180 return std::make_pair(0, LLT::scalar(MaxSize)); 1181 1182 unsigned Align = Query.MMODescrs[0].AlignInBits; 1183 return std::make_pair(0, LLT::scalar(Align)); 1184 }) 1185 .fewerElementsIf( 1186 [=](const LegalityQuery &Query) -> bool { 1187 return Query.Types[0].isVector() && 1188 needToSplitMemOp(Query, Op == G_LOAD); 1189 }, 1190 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1191 const LLT DstTy = Query.Types[0]; 1192 const LLT PtrTy = Query.Types[1]; 1193 1194 LLT EltTy = DstTy.getElementType(); 1195 unsigned MaxSize = maxSizeForAddrSpace(ST, 1196 PtrTy.getAddressSpace(), 1197 Op == G_LOAD); 1198 1199 // FIXME: Handle widened to power of 2 results better. This ends 1200 // up scalarizing. 1201 // FIXME: 3 element stores scalarized on SI 1202 1203 // Split if it's too large for the address space. 1204 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1205 unsigned NumElts = DstTy.getNumElements(); 1206 unsigned EltSize = EltTy.getSizeInBits(); 1207 1208 if (MaxSize % EltSize == 0) { 1209 return std::make_pair( 1210 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1211 } 1212 1213 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1214 1215 // FIXME: Refine when odd breakdowns handled 1216 // The scalars will need to be re-legalized. 1217 if (NumPieces == 1 || NumPieces >= NumElts || 1218 NumElts % NumPieces != 0) 1219 return std::make_pair(0, EltTy); 1220 1221 return std::make_pair(0, 1222 LLT::vector(NumElts / NumPieces, EltTy)); 1223 } 1224 1225 // FIXME: We could probably handle weird extending loads better. 1226 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1227 if (DstTy.getSizeInBits() > MemSize) 1228 return std::make_pair(0, EltTy); 1229 1230 unsigned EltSize = EltTy.getSizeInBits(); 1231 unsigned DstSize = DstTy.getSizeInBits(); 1232 if (!isPowerOf2_32(DstSize)) { 1233 // We're probably decomposing an odd sized store. Try to split 1234 // to the widest type. TODO: Account for alignment. As-is it 1235 // should be OK, since the new parts will be further legalized. 
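            // e.g. a <5 x s32> value first has a <4 x s32> piece split off;
            // the leftover element is then legalized on its own.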
1236 unsigned FloorSize = PowerOf2Floor(DstSize); 1237 return std::make_pair( 1238 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1239 } 1240 1241 // Need to split because of alignment. 1242 unsigned Align = Query.MMODescrs[0].AlignInBits; 1243 if (EltSize > Align && 1244 (EltSize / Align < DstTy.getNumElements())) { 1245 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1246 } 1247 1248 // May need relegalization for the scalars. 1249 return std::make_pair(0, EltTy); 1250 }) 1251 .lowerIfMemSizeNotPow2() 1252 .minScalar(0, S32); 1253 1254 if (IsStore) 1255 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1256 1257 Actions 1258 .widenScalarToNextPow2(0) 1259 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1260 .lower(); 1261 } 1262 1263 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1264 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1265 {S32, GlobalPtr, 16, 2 * 8}, 1266 {S32, LocalPtr, 8, 8}, 1267 {S32, LocalPtr, 16, 16}, 1268 {S32, PrivatePtr, 8, 8}, 1269 {S32, PrivatePtr, 16, 16}, 1270 {S32, ConstantPtr, 8, 8}, 1271 {S32, ConstantPtr, 16, 2 * 8}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 ExtLoads.legalForTypesWithMemDesc( 1274 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1275 } 1276 1277 ExtLoads.clampScalar(0, S32, S32) 1278 .widenScalarToNextPow2(0) 1279 .unsupportedIfMemSizeNotPow2() 1280 .lower(); 1281 1282 auto &Atomics = getActionDefinitionsBuilder( 1283 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1284 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1285 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1286 G_ATOMICRMW_UMIN}) 1287 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1288 {S64, GlobalPtr}, {S64, LocalPtr}, 1289 {S32, RegionPtr}, {S64, RegionPtr}}); 1290 if (ST.hasFlatAddressSpace()) { 1291 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1292 } 1293 1294 if (ST.hasLDSFPAtomics()) { 1295 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1296 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1297 if (ST.hasGFX90AInsts()) 1298 Atomic.legalFor({{S64, LocalPtr}}); 1299 } 1300 1301 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1302 // demarshalling 1303 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1304 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1305 {S32, FlatPtr}, {S64, FlatPtr}}) 1306 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1307 {S32, RegionPtr}, {S64, RegionPtr}}); 1308 // TODO: Pointer types, any 32-bit or 64-bit vector 1309 1310 // Condition should be s32 for scalar, s1 for vector. 1311 getActionDefinitionsBuilder(G_SELECT) 1312 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1313 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1314 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1315 .clampScalar(0, S16, S64) 1316 .scalarize(1) 1317 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1318 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1319 .clampMaxNumElements(0, S32, 2) 1320 .clampMaxNumElements(0, LocalPtr, 2) 1321 .clampMaxNumElements(0, PrivatePtr, 2) 1322 .scalarize(0) 1323 .widenScalarToNextPow2(0) 1324 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1325 1326 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1327 // be more flexible with the shift amount type. 
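  // (4 bits for 16-bit shifts, 5 for 32-bit, 6 for 64-bit shifts.)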
1328 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1329 .legalFor({{S32, S32}, {S64, S32}}); 1330 if (ST.has16BitInsts()) { 1331 if (ST.hasVOP3PInsts()) { 1332 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1333 .clampMaxNumElements(0, S16, 2); 1334 } else 1335 Shifts.legalFor({{S16, S16}}); 1336 1337 // TODO: Support 16-bit shift amounts for all types 1338 Shifts.widenScalarIf( 1339 [=](const LegalityQuery &Query) { 1340 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1341 // 32-bit amount. 1342 const LLT ValTy = Query.Types[0]; 1343 const LLT AmountTy = Query.Types[1]; 1344 return ValTy.getSizeInBits() <= 16 && 1345 AmountTy.getSizeInBits() < 16; 1346 }, changeTo(1, S16)); 1347 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1348 Shifts.clampScalar(1, S32, S32); 1349 Shifts.clampScalar(0, S16, S64); 1350 Shifts.widenScalarToNextPow2(0, 16); 1351 1352 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1353 .minScalar(0, S16) 1354 .scalarize(0) 1355 .lower(); 1356 } else { 1357 // Make sure we legalize the shift amount type first, as the general 1358 // expansion for the shifted type will produce much worse code if it hasn't 1359 // been truncated already. 1360 Shifts.clampScalar(1, S32, S32); 1361 Shifts.clampScalar(0, S32, S64); 1362 Shifts.widenScalarToNextPow2(0, 32); 1363 1364 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1365 .minScalar(0, S32) 1366 .scalarize(0) 1367 .lower(); 1368 } 1369 Shifts.scalarize(0); 1370 1371 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1372 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1373 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1374 unsigned IdxTypeIdx = 2; 1375 1376 getActionDefinitionsBuilder(Op) 1377 .customIf([=](const LegalityQuery &Query) { 1378 const LLT EltTy = Query.Types[EltTypeIdx]; 1379 const LLT VecTy = Query.Types[VecTypeIdx]; 1380 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1381 const unsigned EltSize = EltTy.getSizeInBits(); 1382 return (EltSize == 32 || EltSize == 64) && 1383 VecTy.getSizeInBits() % 32 == 0 && 1384 VecTy.getSizeInBits() <= MaxRegisterSize && 1385 IdxTy.getSizeInBits() == 32; 1386 }) 1387 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1388 bitcastToVectorElement32(VecTypeIdx)) 1389 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1390 .bitcastIf( 1391 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1392 [=](const LegalityQuery &Query) { 1393 // For > 64-bit element types, try to turn this into a 64-bit 1394 // element vector since we may be able to do better indexing 1395 // if this is scalar. If not, fall back to 32. 1396 const LLT EltTy = Query.Types[EltTypeIdx]; 1397 const LLT VecTy = Query.Types[VecTypeIdx]; 1398 const unsigned DstEltSize = EltTy.getSizeInBits(); 1399 const unsigned VecSize = VecTy.getSizeInBits(); 1400 1401 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1402 return std::make_pair( 1403 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1404 }) 1405 .clampScalar(EltTypeIdx, S32, S64) 1406 .clampScalar(VecTypeIdx, S32, S64) 1407 .clampScalar(IdxTypeIdx, S32, S32) 1408 .clampMaxNumElements(VecTypeIdx, S32, 32) 1409 // TODO: Clamp elements for 64-bit vectors? 1410 // It should only be necessary with variable indexes. 
1411 // As a last resort, lower to the stack 1412 .lower(); 1413 } 1414 1415 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1416 .unsupportedIf([=](const LegalityQuery &Query) { 1417 const LLT &EltTy = Query.Types[1].getElementType(); 1418 return Query.Types[0] != EltTy; 1419 }); 1420 1421 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1422 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1423 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1424 1425 // FIXME: Doesn't handle extract of illegal sizes. 1426 getActionDefinitionsBuilder(Op) 1427 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1428 // FIXME: Multiples of 16 should not be legal. 1429 .legalIf([=](const LegalityQuery &Query) { 1430 const LLT BigTy = Query.Types[BigTyIdx]; 1431 const LLT LitTy = Query.Types[LitTyIdx]; 1432 return (BigTy.getSizeInBits() % 32 == 0) && 1433 (LitTy.getSizeInBits() % 16 == 0); 1434 }) 1435 .widenScalarIf( 1436 [=](const LegalityQuery &Query) { 1437 const LLT BigTy = Query.Types[BigTyIdx]; 1438 return (BigTy.getScalarSizeInBits() < 16); 1439 }, 1440 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1441 .widenScalarIf( 1442 [=](const LegalityQuery &Query) { 1443 const LLT LitTy = Query.Types[LitTyIdx]; 1444 return (LitTy.getScalarSizeInBits() < 16); 1445 }, 1446 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1447 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1448 .widenScalarToNextPow2(BigTyIdx, 32); 1449 1450 } 1451 1452 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1453 .legalForCartesianProduct(AllS32Vectors, {S32}) 1454 .legalForCartesianProduct(AllS64Vectors, {S64}) 1455 .clampNumElements(0, V16S32, V32S32) 1456 .clampNumElements(0, V2S64, V16S64) 1457 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1458 1459 if (ST.hasScalarPackInsts()) { 1460 BuildVector 1461 // FIXME: Should probably widen s1 vectors straight to s32 1462 .minScalarOrElt(0, S16) 1463 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1464 .minScalar(1, S32); 1465 1466 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1467 .legalFor({V2S16, S32}) 1468 .lower(); 1469 BuildVector.minScalarOrElt(0, S32); 1470 } else { 1471 BuildVector.customFor({V2S16, S16}); 1472 BuildVector.minScalarOrElt(0, S32); 1473 1474 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1475 .customFor({V2S16, S32}) 1476 .lower(); 1477 } 1478 1479 BuildVector.legalIf(isRegisterType(0)); 1480 1481 // FIXME: Clamp maximum size 1482 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1483 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1484 .clampMaxNumElements(0, S32, 32) 1485 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1486 .clampMaxNumElements(0, S16, 64); 1487 1488 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1489 // pre-legalize. 1490 if (ST.hasVOP3PInsts()) { 1491 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1492 .customFor({V2S16, V2S16}) 1493 .lower(); 1494 } else 1495 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1496 1497 // Merge/Unmerge 1498 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1499 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1500 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; 1501 1502 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1503 const LLT Ty = Query.Types[TypeIdx]; 1504 if (Ty.isVector()) { 1505 const LLT &EltTy = Ty.getElementType(); 1506 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1507 return true; 1508 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1509 return true; 1510 } 1511 return false; 1512 }; 1513 1514 auto &Builder = getActionDefinitionsBuilder(Op) 1515 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1516 .lowerFor({{S16, V2S16}}) 1517 .lowerIf([=](const LegalityQuery &Query) { 1518 const LLT BigTy = Query.Types[BigTyIdx]; 1519 return BigTy.getSizeInBits() == 32; 1520 }) 1521 // Try to widen to s16 first for small types. 1522 // TODO: Only do this on targets with legal s16 shifts 1523 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1524 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1525 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1526 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1527 elementTypeIs(1, S16)), 1528 changeTo(1, V2S16)) 1529 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1530 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1531 // valid. 1532 .clampScalar(LitTyIdx, S32, S512) 1533 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1534 // Break up vectors with weird elements into scalars 1535 .fewerElementsIf( 1536 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1537 scalarize(0)) 1538 .fewerElementsIf( 1539 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1540 scalarize(1)) 1541 .clampScalar(BigTyIdx, S32, MaxScalar); 1542 1543 if (Op == G_MERGE_VALUES) { 1544 Builder.widenScalarIf( 1545 // TODO: Use 16-bit shifts if legal for 8-bit values? 1546 [=](const LegalityQuery &Query) { 1547 const LLT Ty = Query.Types[LitTyIdx]; 1548 return Ty.getSizeInBits() < 32; 1549 }, 1550 changeTo(LitTyIdx, S32)); 1551 } 1552 1553 Builder.widenScalarIf( 1554 [=](const LegalityQuery &Query) { 1555 const LLT Ty = Query.Types[BigTyIdx]; 1556 return !isPowerOf2_32(Ty.getSizeInBits()) && 1557 Ty.getSizeInBits() % 16 != 0; 1558 }, 1559 [=](const LegalityQuery &Query) { 1560 // Pick the next power of 2, or a multiple of 64 over 128. 1561 // Whichever is smaller. 1562 const LLT &Ty = Query.Types[BigTyIdx]; 1563 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1564 if (NewSizeInBits >= 256) { 1565 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1566 if (RoundedTo < NewSizeInBits) 1567 NewSizeInBits = RoundedTo; 1568 } 1569 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1570 }) 1571 // Any vectors left are the wrong size. Scalarize them. 1572 .scalarize(0) 1573 .scalarize(1); 1574 } 1575 1576 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1577 // RegBankSelect. 1578 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1579 .legalFor({{S32}, {S64}}); 1580 1581 if (ST.hasVOP3PInsts()) { 1582 SextInReg.lowerFor({{V2S16}}) 1583 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1584 // get more vector shift opportunities, since we'll get those when 1585 // expanded. 1586 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1587 } else if (ST.has16BitInsts()) { 1588 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1589 } else { 1590 // Prefer to promote to s32 before lowering if we don't have 16-bit 1591 // shifts. 
This avoid a lot of intermediate truncate and extend operations. 1592 SextInReg.lowerFor({{S32}, {S64}}); 1593 } 1594 1595 SextInReg 1596 .scalarize(0) 1597 .clampScalar(0, S32, S64) 1598 .lower(); 1599 1600 // TODO: Only Try to form v2s16 with legal packed instructions. 1601 getActionDefinitionsBuilder(G_FSHR) 1602 .legalFor({{S32, S32}}) 1603 .lowerFor({{V2S16, V2S16}}) 1604 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1605 .scalarize(0) 1606 .lower(); 1607 1608 if (ST.hasVOP3PInsts()) { 1609 getActionDefinitionsBuilder(G_FSHL) 1610 .lowerFor({{V2S16, V2S16}}) 1611 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1612 .scalarize(0) 1613 .lower(); 1614 } else { 1615 getActionDefinitionsBuilder(G_FSHL) 1616 .scalarize(0) 1617 .lower(); 1618 } 1619 1620 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1621 .legalFor({S64}); 1622 1623 getActionDefinitionsBuilder(G_FENCE) 1624 .alwaysLegal(); 1625 1626 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1627 .scalarize(0) 1628 .minScalar(0, S32) 1629 .lower(); 1630 1631 getActionDefinitionsBuilder({ 1632 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1633 G_FCOPYSIGN, 1634 1635 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1636 G_ATOMICRMW_NAND, 1637 G_ATOMICRMW_FSUB, 1638 G_READ_REGISTER, 1639 G_WRITE_REGISTER, 1640 1641 G_SADDO, G_SSUBO, 1642 1643 // TODO: Implement 1644 G_FMINIMUM, G_FMAXIMUM}).lower(); 1645 1646 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1647 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1648 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1649 .unsupported(); 1650 1651 computeTables(); 1652 verify(*ST.getInstrInfo()); 1653 } 1654 1655 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1656 MachineInstr &MI) const { 1657 MachineIRBuilder &B = Helper.MIRBuilder; 1658 MachineRegisterInfo &MRI = *B.getMRI(); 1659 1660 switch (MI.getOpcode()) { 1661 case TargetOpcode::G_ADDRSPACE_CAST: 1662 return legalizeAddrSpaceCast(MI, MRI, B); 1663 case TargetOpcode::G_FRINT: 1664 return legalizeFrint(MI, MRI, B); 1665 case TargetOpcode::G_FCEIL: 1666 return legalizeFceil(MI, MRI, B); 1667 case TargetOpcode::G_FREM: 1668 return legalizeFrem(MI, MRI, B); 1669 case TargetOpcode::G_INTRINSIC_TRUNC: 1670 return legalizeIntrinsicTrunc(MI, MRI, B); 1671 case TargetOpcode::G_SITOFP: 1672 return legalizeITOFP(MI, MRI, B, true); 1673 case TargetOpcode::G_UITOFP: 1674 return legalizeITOFP(MI, MRI, B, false); 1675 case TargetOpcode::G_FPTOSI: 1676 return legalizeFPTOI(MI, MRI, B, true); 1677 case TargetOpcode::G_FPTOUI: 1678 return legalizeFPTOI(MI, MRI, B, false); 1679 case TargetOpcode::G_FMINNUM: 1680 case TargetOpcode::G_FMAXNUM: 1681 case TargetOpcode::G_FMINNUM_IEEE: 1682 case TargetOpcode::G_FMAXNUM_IEEE: 1683 return legalizeMinNumMaxNum(Helper, MI); 1684 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1685 return legalizeExtractVectorElt(MI, MRI, B); 1686 case TargetOpcode::G_INSERT_VECTOR_ELT: 1687 return legalizeInsertVectorElt(MI, MRI, B); 1688 case TargetOpcode::G_SHUFFLE_VECTOR: 1689 return legalizeShuffleVector(MI, MRI, B); 1690 case TargetOpcode::G_FSIN: 1691 case TargetOpcode::G_FCOS: 1692 return legalizeSinCos(MI, MRI, B); 1693 case TargetOpcode::G_GLOBAL_VALUE: 1694 return legalizeGlobalValue(MI, MRI, B); 1695 case TargetOpcode::G_LOAD: 1696 return legalizeLoad(Helper, MI); 1697 case TargetOpcode::G_FMAD: 1698 return legalizeFMad(MI, MRI, B); 1699 case TargetOpcode::G_FDIV: 1700 return legalizeFDIV(MI, MRI, B); 1701 case TargetOpcode::G_UDIV: 1702 case TargetOpcode::G_UREM: 1703 return 
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
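  // When aperture registers are not available, the high 32 bits of the
  // segment aperture are loaded from the amd_queue_t that the queue pointer
  // argument points at (offsets 0x40 / 0x44 above). A rough sketch of the MIR
  // this builds (illustrative only; the actual registers depend on how the
  // queue pointer was preloaded):
  //   %queue:_(p4) = <copy of the QUEUE_PTR input>
  //   %addr:_(p4)  = G_PTR_ADD %queue, StructOffset
  //   %aperture:_(s32) = G_LOAD %addr   ; invariant, dereferenceable load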
1774 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1775 MachineMemOperand *MMO = MF.getMachineMemOperand( 1776 PtrInfo, 1777 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1778 MachineMemOperand::MOInvariant, 1779 4, commonAlignment(Align(64), StructOffset)); 1780 1781 Register LoadAddr; 1782 1783 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1784 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1785 } 1786 1787 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1788 MachineInstr &MI, MachineRegisterInfo &MRI, 1789 MachineIRBuilder &B) const { 1790 MachineFunction &MF = B.getMF(); 1791 1792 const LLT S32 = LLT::scalar(32); 1793 Register Dst = MI.getOperand(0).getReg(); 1794 Register Src = MI.getOperand(1).getReg(); 1795 1796 LLT DstTy = MRI.getType(Dst); 1797 LLT SrcTy = MRI.getType(Src); 1798 unsigned DestAS = DstTy.getAddressSpace(); 1799 unsigned SrcAS = SrcTy.getAddressSpace(); 1800 1801 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1802 // vector element. 1803 assert(!DstTy.isVector()); 1804 1805 const AMDGPUTargetMachine &TM 1806 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1807 1808 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1809 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1810 return true; 1811 } 1812 1813 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1814 // Truncate. 1815 B.buildExtract(Dst, Src, 0); 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1821 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1822 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1823 1824 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1825 // another. Merge operands are required to be the same type, but creating an 1826 // extra ptrtoint would be kind of pointless. 1827 auto HighAddr = B.buildConstant( 1828 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1829 B.buildMerge(Dst, {Src, HighAddr}); 1830 MI.eraseFromParent(); 1831 return true; 1832 } 1833 1834 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1835 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1836 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1837 unsigned NullVal = TM.getNullPointerValue(DestAS); 1838 1839 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1840 auto FlatNull = B.buildConstant(SrcTy, 0); 1841 1842 // Extract low 32-bits of the pointer. 1843 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1844 1845 auto CmpRes = 1846 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1847 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1848 1849 MI.eraseFromParent(); 1850 return true; 1851 } 1852 1853 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1854 return false; 1855 1856 if (!ST.hasFlatAddressSpace()) 1857 return false; 1858 1859 auto SegmentNull = 1860 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1861 auto FlatNull = 1862 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1863 1864 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1865 if (!ApertureReg.isValid()) 1866 return false; 1867 1868 auto CmpRes = 1869 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1870 1871 // Coerce the type of the low half of the result so we can use merge_values. 
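  // Taken together, the pieces built below implement the segment -> flat cast
  // roughly as (a sketch, not the exact MIR):
  //   %lo   = G_PTRTOINT %src                  ; 32-bit segment offset
  //   %flat = G_MERGE_VALUES %lo, %aperture    ; 64-bit flat address
  //   %dst  = G_SELECT (%src != segment_null), %flat, flat_null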
1872 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1873 1874 // TODO: Should we allow mismatched types but matching sizes in merges to 1875 // avoid the ptrtoint? 1876 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1877 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1878 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeFrint( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B) const { 1886 Register Src = MI.getOperand(1).getReg(); 1887 LLT Ty = MRI.getType(Src); 1888 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1889 1890 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1891 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1892 1893 auto C1 = B.buildFConstant(Ty, C1Val); 1894 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1895 1896 // TODO: Should this propagate fast-math-flags? 1897 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1898 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1899 1900 auto C2 = B.buildFConstant(Ty, C2Val); 1901 auto Fabs = B.buildFAbs(Ty, Src); 1902 1903 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1904 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1905 MI.eraseFromParent(); 1906 return true; 1907 } 1908 1909 bool AMDGPULegalizerInfo::legalizeFceil( 1910 MachineInstr &MI, MachineRegisterInfo &MRI, 1911 MachineIRBuilder &B) const { 1912 1913 const LLT S1 = LLT::scalar(1); 1914 const LLT S64 = LLT::scalar(64); 1915 1916 Register Src = MI.getOperand(1).getReg(); 1917 assert(MRI.getType(Src) == S64); 1918 1919 // result = trunc(src) 1920 // if (src > 0.0 && src != result) 1921 // result += 1.0 1922 1923 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1924 1925 const auto Zero = B.buildFConstant(S64, 0.0); 1926 const auto One = B.buildFConstant(S64, 1.0); 1927 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1928 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1929 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1930 auto Add = B.buildSelect(S64, And, One, Zero); 1931 1932 // TODO: Should this propagate fast-math-flags? 
1933 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1934 return true; 1935 } 1936 1937 bool AMDGPULegalizerInfo::legalizeFrem( 1938 MachineInstr &MI, MachineRegisterInfo &MRI, 1939 MachineIRBuilder &B) const { 1940 Register DstReg = MI.getOperand(0).getReg(); 1941 Register Src0Reg = MI.getOperand(1).getReg(); 1942 Register Src1Reg = MI.getOperand(2).getReg(); 1943 auto Flags = MI.getFlags(); 1944 LLT Ty = MRI.getType(DstReg); 1945 1946 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1947 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1948 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1949 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1950 MI.eraseFromParent(); 1951 return true; 1952 } 1953 1954 static MachineInstrBuilder extractF64Exponent(Register Hi, 1955 MachineIRBuilder &B) { 1956 const unsigned FractBits = 52; 1957 const unsigned ExpBits = 11; 1958 LLT S32 = LLT::scalar(32); 1959 1960 auto Const0 = B.buildConstant(S32, FractBits - 32); 1961 auto Const1 = B.buildConstant(S32, ExpBits); 1962 1963 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1964 .addUse(Hi) 1965 .addUse(Const0.getReg(0)) 1966 .addUse(Const1.getReg(0)); 1967 1968 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1969 } 1970 1971 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1972 MachineInstr &MI, MachineRegisterInfo &MRI, 1973 MachineIRBuilder &B) const { 1974 const LLT S1 = LLT::scalar(1); 1975 const LLT S32 = LLT::scalar(32); 1976 const LLT S64 = LLT::scalar(64); 1977 1978 Register Src = MI.getOperand(1).getReg(); 1979 assert(MRI.getType(Src) == S64); 1980 1981 // TODO: Should this use extract since the low half is unused? 1982 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1983 Register Hi = Unmerge.getReg(1); 1984 1985 // Extract the upper half, since this is where we will find the sign and 1986 // exponent. 1987 auto Exp = extractF64Exponent(Hi, B); 1988 1989 const unsigned FractBits = 52; 1990 1991 // Extract the sign bit. 1992 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1993 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1994 1995 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1996 1997 const auto Zero32 = B.buildConstant(S32, 0); 1998 1999 // Extend back to 64-bits. 2000 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2001 2002 auto Shr = B.buildAShr(S64, FractMask, Exp); 2003 auto Not = B.buildNot(S64, Shr); 2004 auto Tmp0 = B.buildAnd(S64, Src, Not); 2005 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2006 2007 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2008 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2009 2010 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2011 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2012 MI.eraseFromParent(); 2013 return true; 2014 } 2015 2016 bool AMDGPULegalizerInfo::legalizeITOFP( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 MachineIRBuilder &B, bool Signed) const { 2019 2020 Register Dst = MI.getOperand(0).getReg(); 2021 Register Src = MI.getOperand(1).getReg(); 2022 2023 const LLT S64 = LLT::scalar(64); 2024 const LLT S32 = LLT::scalar(32); 2025 2026 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2027 2028 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2029 2030 auto CvtHi = Signed ? 
2031 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2032 B.buildUITOFP(S64, Unmerge.getReg(1)); 2033 2034 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2035 2036 auto ThirtyTwo = B.buildConstant(S32, 32); 2037 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2038 .addUse(CvtHi.getReg(0)) 2039 .addUse(ThirtyTwo.getReg(0)); 2040 2041 // TODO: Should this propagate fast-math-flags? 2042 B.buildFAdd(Dst, LdExp, CvtLo); 2043 MI.eraseFromParent(); 2044 return true; 2045 } 2046 2047 // TODO: Copied from DAG implementation. Verify logic and document how this 2048 // actually works. 2049 bool AMDGPULegalizerInfo::legalizeFPTOI( 2050 MachineInstr &MI, MachineRegisterInfo &MRI, 2051 MachineIRBuilder &B, bool Signed) const { 2052 2053 Register Dst = MI.getOperand(0).getReg(); 2054 Register Src = MI.getOperand(1).getReg(); 2055 2056 const LLT S64 = LLT::scalar(64); 2057 const LLT S32 = LLT::scalar(32); 2058 2059 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2060 2061 unsigned Flags = MI.getFlags(); 2062 2063 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2064 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2065 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2066 2067 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2068 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2069 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2070 2071 auto Hi = Signed ? 2072 B.buildFPTOSI(S32, FloorMul) : 2073 B.buildFPTOUI(S32, FloorMul); 2074 auto Lo = B.buildFPTOUI(S32, Fma); 2075 2076 B.buildMerge(Dst, { Lo, Hi }); 2077 MI.eraseFromParent(); 2078 2079 return true; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2083 MachineInstr &MI) const { 2084 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2085 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2086 2087 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2088 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2089 2090 // With ieee_mode disabled, the instructions have the correct behavior 2091 // already for G_FMINNUM/G_FMAXNUM 2092 if (!MFI->getMode().IEEE) 2093 return !IsIEEEOp; 2094 2095 if (IsIEEEOp) 2096 return true; 2097 2098 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2099 } 2100 2101 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2102 MachineInstr &MI, MachineRegisterInfo &MRI, 2103 MachineIRBuilder &B) const { 2104 // TODO: Should move some of this into LegalizerHelper. 2105 2106 // TODO: Promote dynamic indexing of s16 to s32 2107 2108 // FIXME: Artifact combiner probably should have replaced the truncated 2109 // constant before this, so we shouldn't need 2110 // getConstantVRegValWithLookThrough. 2111 Optional<ValueAndVReg> MaybeIdxVal = 2112 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2113 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2114 return true; 2115 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2116 2117 Register Dst = MI.getOperand(0).getReg(); 2118 Register Vec = MI.getOperand(1).getReg(); 2119 2120 LLT VecTy = MRI.getType(Vec); 2121 LLT EltTy = VecTy.getElementType(); 2122 assert(EltTy == MRI.getType(Dst)); 2123 2124 if (IdxVal < VecTy.getNumElements()) 2125 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2126 else 2127 B.buildUndef(Dst); 2128 2129 MI.eraseFromParent(); 2130 return true; 2131 } 2132 2133 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2134 MachineInstr &MI, MachineRegisterInfo &MRI, 2135 MachineIRBuilder &B) const { 2136 // TODO: Should move some of this into LegalizerHelper. 2137 2138 // TODO: Promote dynamic indexing of s16 to s32 2139 2140 // FIXME: Artifact combiner probably should have replaced the truncated 2141 // constant before this, so we shouldn't need 2142 // getConstantVRegValWithLookThrough. 2143 Optional<ValueAndVReg> MaybeIdxVal = 2144 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2145 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2146 return true; 2147 2148 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2149 Register Dst = MI.getOperand(0).getReg(); 2150 Register Vec = MI.getOperand(1).getReg(); 2151 Register Ins = MI.getOperand(2).getReg(); 2152 2153 LLT VecTy = MRI.getType(Vec); 2154 LLT EltTy = VecTy.getElementType(); 2155 assert(EltTy == MRI.getType(Ins)); 2156 2157 if (IdxVal < VecTy.getNumElements()) 2158 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2159 else 2160 B.buildUndef(Dst); 2161 2162 MI.eraseFromParent(); 2163 return true; 2164 } 2165 2166 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2167 MachineInstr &MI, MachineRegisterInfo &MRI, 2168 MachineIRBuilder &B) const { 2169 const LLT V2S16 = LLT::vector(2, 16); 2170 2171 Register Dst = MI.getOperand(0).getReg(); 2172 Register Src0 = MI.getOperand(1).getReg(); 2173 LLT DstTy = MRI.getType(Dst); 2174 LLT SrcTy = MRI.getType(Src0); 2175 2176 if (SrcTy == V2S16 && DstTy == V2S16 && 2177 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2178 return true; 2179 2180 MachineIRBuilder HelperBuilder(MI); 2181 GISelObserverWrapper DummyObserver; 2182 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2183 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2184 } 2185 2186 bool AMDGPULegalizerInfo::legalizeSinCos( 2187 MachineInstr &MI, MachineRegisterInfo &MRI, 2188 MachineIRBuilder &B) const { 2189 2190 Register DstReg = MI.getOperand(0).getReg(); 2191 Register SrcReg = MI.getOperand(1).getReg(); 2192 LLT Ty = MRI.getType(DstReg); 2193 unsigned Flags = MI.getFlags(); 2194 2195 Register TrigVal; 2196 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2197 if (ST.hasTrigReducedRange()) { 2198 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2199 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2200 .addUse(MulVal.getReg(0)) 2201 .setMIFlags(Flags).getReg(0); 2202 } else 2203 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2204 2205 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2206 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2207 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2208 .addUse(TrigVal) 2209 .setMIFlags(Flags); 2210 MI.eraseFromParent(); 2211 return true; 2212 } 2213 2214 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2215 MachineIRBuilder &B, 2216 const GlobalValue *GV, 2217 int64_t Offset, 2218 unsigned GAFlags) const { 2219 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2220 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2221 // to the following code sequence: 2222 // 2223 // For constant address space: 2224 // s_getpc_b64 s[0:1] 2225 // s_add_u32 s0, s0, $symbol 2226 // s_addc_u32 s1, s1, 0 2227 // 2228 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2229 // a fixup or relocation is emitted to replace $symbol with a literal 2230 // constant, which is a pc-relative offset from the encoding of the $symbol 2231 // operand to the global variable. 2232 // 2233 // For global address space: 2234 // s_getpc_b64 s[0:1] 2235 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2236 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2237 // 2238 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2239 // fixups or relocations are emitted to replace $symbol@*@lo and 2240 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2241 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2242 // operand to the global variable. 2243 // 2244 // What we want here is an offset from the value returned by s_getpc 2245 // (which is the address of the s_add_u32 instruction) to the global 2246 // variable, but since the encoding of $symbol starts 4 bytes after the start 2247 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2248 // small. This requires us to add 4 to the global variable offset in order to 2249 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2250 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2251 // instruction. 2252 2253 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2254 2255 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated LDS, so
        // all such variables share the same offset.
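        // For such a declaration the address is not a link-time constant;
        // instead it equals the kernel's total static LDS size, which is what
        // the amdgcn.groupstaticsize intrinsic below resolves to. A
        // hypothetical HIP-level example (not taken from this file):
        //   extern __shared__ float dynLDS[];   // size supplied at launch
        //   // &dynLDS[0] ends up just past all statically allocated LDS.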
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
        DstReg,
        MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const unsigned MemSize = 8 * MMO->getSize();
  const Align MemAlign = MMO->getAlign();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);
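    // One of three strategies is used below, depending on how the widened
    // memory size relates to the result type: keep the load and only grow the
    // MMO, load wide and G_TRUNC a scalar result, or load wide and
    // extract/unmerge a vector result. For example (a sketch), a <3 x s32>
    // load known to be 16-byte aligned becomes a <4 x s32> load followed by a
    // G_EXTRACT of the low 96 bits.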
    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
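  // Roughly: G_FMAD is kept only when the relevant denormals are flushed,
  // which is when it matches the hardware mad/mac behaviour; otherwise the
  // helper expands it into separate multiply and add operations. This is a
  // simplified reading of the checks below, not an additional rule.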
2478 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2479 return true; 2480 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2481 return true; 2482 2483 MachineIRBuilder HelperBuilder(MI); 2484 GISelObserverWrapper DummyObserver; 2485 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2486 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2490 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2491 Register DstReg = MI.getOperand(0).getReg(); 2492 Register PtrReg = MI.getOperand(1).getReg(); 2493 Register CmpVal = MI.getOperand(2).getReg(); 2494 Register NewVal = MI.getOperand(3).getReg(); 2495 2496 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2497 "this should not have been custom lowered"); 2498 2499 LLT ValTy = MRI.getType(CmpVal); 2500 LLT VecTy = LLT::vector(2, ValTy); 2501 2502 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2503 2504 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2505 .addDef(DstReg) 2506 .addUse(PtrReg) 2507 .addUse(PackedVal) 2508 .setMemRefs(MI.memoperands()); 2509 2510 MI.eraseFromParent(); 2511 return true; 2512 } 2513 2514 bool AMDGPULegalizerInfo::legalizeFlog( 2515 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2516 Register Dst = MI.getOperand(0).getReg(); 2517 Register Src = MI.getOperand(1).getReg(); 2518 LLT Ty = B.getMRI()->getType(Dst); 2519 unsigned Flags = MI.getFlags(); 2520 2521 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2522 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2523 2524 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2525 MI.eraseFromParent(); 2526 return true; 2527 } 2528 2529 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2530 MachineIRBuilder &B) const { 2531 Register Dst = MI.getOperand(0).getReg(); 2532 Register Src = MI.getOperand(1).getReg(); 2533 unsigned Flags = MI.getFlags(); 2534 LLT Ty = B.getMRI()->getType(Dst); 2535 2536 auto K = B.buildFConstant(Ty, numbers::log2e); 2537 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2538 B.buildFExp2(Dst, Mul, Flags); 2539 MI.eraseFromParent(); 2540 return true; 2541 } 2542 2543 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2544 MachineIRBuilder &B) const { 2545 Register Dst = MI.getOperand(0).getReg(); 2546 Register Src0 = MI.getOperand(1).getReg(); 2547 Register Src1 = MI.getOperand(2).getReg(); 2548 unsigned Flags = MI.getFlags(); 2549 LLT Ty = B.getMRI()->getType(Dst); 2550 const LLT S16 = LLT::scalar(16); 2551 const LLT S32 = LLT::scalar(32); 2552 2553 if (Ty == S32) { 2554 auto Log = B.buildFLog2(S32, Src0, Flags); 2555 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2556 .addUse(Log.getReg(0)) 2557 .addUse(Src1) 2558 .setMIFlags(Flags); 2559 B.buildFExp2(Dst, Mul, Flags); 2560 } else if (Ty == S16) { 2561 // There's no f16 fmul_legacy, so we need to convert for it. 
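    // Both paths follow pow(x, y) = exp2(y * log2(x)), with the multiply done
    // as fmul_legacy in f32. Sketch of the f16 sequence built below:
    //   %l   = G_FLOG2 %x                        ; s16
    //   %m   = fmul_legacy(fpext %l, fpext %y)   ; s32
    //   %dst = G_FEXP2 (G_FPTRUNC %m)            ; s16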
2562 auto Log = B.buildFLog2(S16, Src0, Flags); 2563 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2564 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2565 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2566 .addUse(Ext0.getReg(0)) 2567 .addUse(Ext1.getReg(0)) 2568 .setMIFlags(Flags); 2569 2570 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2571 } else 2572 return false; 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 // Find a source register, ignoring any possible source modifiers. 2579 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2580 Register ModSrc = OrigSrc; 2581 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2582 ModSrc = SrcFNeg->getOperand(1).getReg(); 2583 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2584 ModSrc = SrcFAbs->getOperand(1).getReg(); 2585 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2586 ModSrc = SrcFAbs->getOperand(1).getReg(); 2587 return ModSrc; 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2591 MachineRegisterInfo &MRI, 2592 MachineIRBuilder &B) const { 2593 2594 const LLT S1 = LLT::scalar(1); 2595 const LLT S64 = LLT::scalar(64); 2596 Register Dst = MI.getOperand(0).getReg(); 2597 Register OrigSrc = MI.getOperand(1).getReg(); 2598 unsigned Flags = MI.getFlags(); 2599 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2600 "this should not have been custom lowered"); 2601 2602 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2603 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2604 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2605 // V_FRACT bug is: 2606 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2607 // 2608 // Convert floor(x) to (x - fract(x)) 2609 2610 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2611 .addUse(OrigSrc) 2612 .setMIFlags(Flags); 2613 2614 // Give source modifier matching some assistance before obscuring a foldable 2615 // pattern. 2616 2617 // TODO: We can avoid the neg on the fract? The input sign to fract 2618 // shouldn't matter? 2619 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2620 2621 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2622 2623 Register Min = MRI.createGenericVirtualRegister(S64); 2624 2625 // We don't need to concern ourselves with the snan handling difference, so 2626 // use the one which will directly select. 2627 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2628 if (MFI->getMode().IEEE) 2629 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2630 else 2631 B.buildFMinNum(Min, Fract, Const, Flags); 2632 2633 Register CorrectedFract = Min; 2634 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2635 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2636 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2637 } 2638 2639 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2640 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2641 2642 MI.eraseFromParent(); 2643 return true; 2644 } 2645 2646 // Turn an illegal packed v2s16 build vector into bit operations. 2647 // TODO: This should probably be a bitcast action in LegalizerHelper. 
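//
// A sketch of the replacement sequence (not the exact textual MIR):
//   %merge:_(s32)     = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge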
2648 bool AMDGPULegalizerInfo::legalizeBuildVector( 2649 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2650 Register Dst = MI.getOperand(0).getReg(); 2651 const LLT S32 = LLT::scalar(32); 2652 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2653 2654 Register Src0 = MI.getOperand(1).getReg(); 2655 Register Src1 = MI.getOperand(2).getReg(); 2656 assert(MRI.getType(Src0) == LLT::scalar(16)); 2657 2658 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2659 B.buildBitcast(Dst, Merge); 2660 2661 MI.eraseFromParent(); 2662 return true; 2663 } 2664 2665 // Check that this is a G_XOR x, -1 2666 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2667 if (MI.getOpcode() != TargetOpcode::G_XOR) 2668 return false; 2669 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2670 return ConstVal && *ConstVal == -1; 2671 } 2672 2673 // Return the use branch instruction, otherwise null if the usage is invalid. 2674 static MachineInstr * 2675 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2676 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2677 Register CondDef = MI.getOperand(0).getReg(); 2678 if (!MRI.hasOneNonDBGUse(CondDef)) 2679 return nullptr; 2680 2681 MachineBasicBlock *Parent = MI.getParent(); 2682 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2683 2684 if (isNot(MRI, *UseMI)) { 2685 Register NegatedCond = UseMI->getOperand(0).getReg(); 2686 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2687 return nullptr; 2688 2689 // We're deleting the def of this value, so we need to remove it. 2690 UseMI->eraseFromParent(); 2691 2692 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2693 Negated = true; 2694 } 2695 2696 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2697 return nullptr; 2698 2699 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2700 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2701 if (Next == Parent->end()) { 2702 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2703 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2704 return nullptr; 2705 UncondBrTarget = &*NextMBB; 2706 } else { 2707 if (Next->getOpcode() != AMDGPU::G_BR) 2708 return nullptr; 2709 Br = &*Next; 2710 UncondBrTarget = Br->getOperand(0).getMBB(); 2711 } 2712 2713 return UseMI; 2714 } 2715 2716 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2717 const ArgDescriptor *Arg, 2718 const TargetRegisterClass *ArgRC, 2719 LLT ArgTy) const { 2720 MCRegister SrcReg = Arg->getRegister(); 2721 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2722 assert(DstReg.isVirtual() && "Virtual register expected"); 2723 2724 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2725 ArgTy); 2726 if (Arg->isMasked()) { 2727 // TODO: Should we try to emit this once in the entry block? 
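    // A masked argument packs several small values into one register; the
    // field is recovered as (Reg & Mask) >> Shift, emitted below as a G_LSHR
    // by the mask's trailing-zero count followed by a G_AND with the shifted
    // mask. (Packed work-item IDs are the usual motivating case; that is an
    // assumption for illustration, not something this function checks.)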
2728 const LLT S32 = LLT::scalar(32); 2729 const unsigned Mask = Arg->getMask(); 2730 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2731 2732 Register AndMaskSrc = LiveIn; 2733 2734 if (Shift != 0) { 2735 auto ShiftAmt = B.buildConstant(S32, Shift); 2736 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2737 } 2738 2739 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2740 } else { 2741 B.buildCopy(DstReg, LiveIn); 2742 } 2743 2744 return true; 2745 } 2746 2747 bool AMDGPULegalizerInfo::loadInputValue( 2748 Register DstReg, MachineIRBuilder &B, 2749 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2750 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2751 const ArgDescriptor *Arg; 2752 const TargetRegisterClass *ArgRC; 2753 LLT ArgTy; 2754 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2755 2756 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2757 return false; // TODO: Handle these 2758 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2759 } 2760 2761 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2762 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2763 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2764 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2765 return false; 2766 2767 MI.eraseFromParent(); 2768 return true; 2769 } 2770 2771 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2772 MachineRegisterInfo &MRI, 2773 MachineIRBuilder &B) const { 2774 Register Dst = MI.getOperand(0).getReg(); 2775 LLT DstTy = MRI.getType(Dst); 2776 LLT S16 = LLT::scalar(16); 2777 LLT S32 = LLT::scalar(32); 2778 LLT S64 = LLT::scalar(64); 2779 2780 if (DstTy == S16) 2781 return legalizeFDIV16(MI, MRI, B); 2782 if (DstTy == S32) 2783 return legalizeFDIV32(MI, MRI, B); 2784 if (DstTy == S64) 2785 return legalizeFDIV64(MI, MRI, B); 2786 2787 return false; 2788 } 2789 2790 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2791 Register DstReg, 2792 Register X, 2793 Register Y, 2794 bool IsDiv) const { 2795 const LLT S1 = LLT::scalar(1); 2796 const LLT S32 = LLT::scalar(32); 2797 2798 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2799 // algorithm used here. 2800 2801 // Initial estimate of inv(y). 2802 auto FloatY = B.buildUITOFP(S32, Y); 2803 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2804 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2805 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2806 auto Z = B.buildFPTOUI(S32, ScaledY); 2807 2808 // One round of UNR. 2809 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2810 auto NegYZ = B.buildMul(S32, NegY, Z); 2811 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2812 2813 // Quotient/remainder estimate. 2814 auto Q = B.buildUMulH(S32, X, Z); 2815 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2816 2817 // First quotient/remainder refinement. 2818 auto One = B.buildConstant(S32, 1); 2819 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2820 if (IsDiv) 2821 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2822 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2823 2824 // Second quotient/remainder refinement. 
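  // (After the single Newton-Raphson style refinement of the reciprocal
  // estimate above, the computed quotient can undershoot the exact value by
  // at most 2, so two conditional corrections suffice. That bound is taken
  // from the reference mentioned at the top of this function rather than
  // re-derived here.)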
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 =
B.buildConstant(S32, 0); 2923 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2924 auto Add2_HiC = 2925 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2926 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2927 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2928 2929 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2930 Register NumerLo = UnmergeNumer.getReg(0); 2931 Register NumerHi = UnmergeNumer.getReg(1); 2932 2933 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2934 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2935 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2936 Register Mul3_Lo = UnmergeMul3.getReg(0); 2937 Register Mul3_Hi = UnmergeMul3.getReg(1); 2938 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2939 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2940 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2941 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2942 2943 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2944 Register DenomLo = UnmergeDenom.getReg(0); 2945 Register DenomHi = UnmergeDenom.getReg(1); 2946 2947 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2948 auto C1 = B.buildSExt(S32, CmpHi); 2949 2950 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2951 auto C2 = B.buildSExt(S32, CmpLo); 2952 2953 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2954 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2955 2956 // TODO: Here and below portions of the code can be enclosed into if/endif. 2957 // Currently control flow is unconditional and we have 4 selects after 2958 // potential endif to substitute PHIs. 2959 2960 // if C3 != 0 ... 2961 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2962 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2963 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2964 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2965 2966 auto One64 = B.buildConstant(S64, 1); 2967 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2968 2969 auto C4 = 2970 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2971 auto C5 = 2972 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2973 auto C6 = B.buildSelect( 2974 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2975 2976 // if (C6 != 0) 2977 auto Add4 = B.buildAdd(S64, Add3, One64); 2978 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2979 2980 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2981 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2982 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2983 2984 // endif C6 2985 // endif C3 2986 2987 if (IsDiv) { 2988 auto Sel1 = B.buildSelect( 2989 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2990 B.buildSelect(DstReg, 2991 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2992 } else { 2993 auto Sel2 = B.buildSelect( 2994 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2995 B.buildSelect(DstReg, 2996 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2997 } 2998 } 2999 3000 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 3001 MachineRegisterInfo &MRI, 3002 MachineIRBuilder &B) const { 3003 const LLT S64 = LLT::scalar(64); 3004 const LLT S32 = LLT::scalar(32); 3005 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 3006 Register DstReg = MI.getOperand(0).getReg(); 3007 Register Num 
= MI.getOperand(1).getReg(); 3008 Register Den = MI.getOperand(2).getReg(); 3009 LLT Ty = MRI.getType(DstReg); 3010 3011 if (Ty == S32) 3012 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3013 else if (Ty == S64) 3014 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3015 else 3016 return false; 3017 3018 MI.eraseFromParent(); 3019 return true; 3020 3021 } 3022 3023 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3024 MachineRegisterInfo &MRI, 3025 MachineIRBuilder &B) const { 3026 const LLT S64 = LLT::scalar(64); 3027 const LLT S32 = LLT::scalar(32); 3028 3029 Register DstReg = MI.getOperand(0).getReg(); 3030 const LLT Ty = MRI.getType(DstReg); 3031 if (Ty != S32 && Ty != S64) 3032 return false; 3033 3034 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3035 3036 Register LHS = MI.getOperand(1).getReg(); 3037 Register RHS = MI.getOperand(2).getReg(); 3038 3039 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3040 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3041 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3042 3043 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3044 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3045 3046 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3047 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3048 3049 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3050 if (Ty == S32) 3051 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3052 else 3053 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3054 3055 Register Sign; 3056 if (IsDiv) 3057 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3058 else 3059 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3060 3061 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3062 B.buildSub(DstReg, UDivRem, Sign); 3063 3064 MI.eraseFromParent(); 3065 return true; 3066 } 3067 3068 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3069 MachineRegisterInfo &MRI, 3070 MachineIRBuilder &B) const { 3071 Register Res = MI.getOperand(0).getReg(); 3072 Register LHS = MI.getOperand(1).getReg(); 3073 Register RHS = MI.getOperand(2).getReg(); 3074 uint16_t Flags = MI.getFlags(); 3075 LLT ResTy = MRI.getType(Res); 3076 3077 const MachineFunction &MF = B.getMF(); 3078 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3079 MI.getFlag(MachineInstr::FmAfn); 3080 3081 if (!AllowInaccurateRcp) 3082 return false; 3083 3084 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3085 // 1 / x -> RCP(x) 3086 if (CLHS->isExactlyValue(1.0)) { 3087 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3088 .addUse(RHS) 3089 .setMIFlags(Flags); 3090 3091 MI.eraseFromParent(); 3092 return true; 3093 } 3094 3095 // -1 / x -> RCP( FNEG(x) ) 3096 if (CLHS->isExactlyValue(-1.0)) { 3097 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3098 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3099 .addUse(FNeg.getReg(0)) 3100 .setMIFlags(Flags); 3101 3102 MI.eraseFromParent(); 3103 return true; 3104 } 3105 } 3106 3107 // x / y -> x * (1.0 / y) 3108 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3109 .addUse(RHS) 3110 .setMIFlags(Flags); 3111 B.buildFMul(Res, LHS, RCP, Flags); 3112 3113 MI.eraseFromParent(); 3114 return true; 3115 } 3116 3117 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3118 MachineRegisterInfo &MRI, 3119 MachineIRBuilder &B) const { 3120 Register Res = MI.getOperand(0).getReg(); 3121 Register X = MI.getOperand(1).getReg(); 3122 Register Y = MI.getOperand(2).getReg(); 3123 uint16_t Flags = MI.getFlags(); 3124 
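  // What follows approximates x / y as x * (1 / y): start from the hardware
  // rcp estimate of 1/y, sharpen it with two Newton-Raphson iterations
  // (r' = r + r * (1 - y * r), expressed as FMAs), then apply one final FMA
  // on the residual of x * r. This path is only taken under unsafe-fp-math or
  // afn, so the reduced accuracy is assumed to be acceptable.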
LLT ResTy = MRI.getType(Res); 3125 3126 const MachineFunction &MF = B.getMF(); 3127 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3128 MI.getFlag(MachineInstr::FmAfn); 3129 3130 if (!AllowInaccurateRcp) 3131 return false; 3132 3133 auto NegY = B.buildFNeg(ResTy, Y); 3134 auto One = B.buildFConstant(ResTy, 1.0); 3135 3136 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3137 .addUse(Y) 3138 .setMIFlags(Flags); 3139 3140 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3141 R = B.buildFMA(ResTy, Tmp0, R, R); 3142 3143 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3144 R = B.buildFMA(ResTy, Tmp1, R, R); 3145 3146 auto Ret = B.buildFMul(ResTy, X, R); 3147 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3148 3149 B.buildFMA(Res, Tmp2, R, Ret); 3150 MI.eraseFromParent(); 3151 return true; 3152 } 3153 3154 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3155 MachineRegisterInfo &MRI, 3156 MachineIRBuilder &B) const { 3157 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3158 return true; 3159 3160 Register Res = MI.getOperand(0).getReg(); 3161 Register LHS = MI.getOperand(1).getReg(); 3162 Register RHS = MI.getOperand(2).getReg(); 3163 3164 uint16_t Flags = MI.getFlags(); 3165 3166 LLT S16 = LLT::scalar(16); 3167 LLT S32 = LLT::scalar(32); 3168 3169 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3170 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3171 3172 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3173 .addUse(RHSExt.getReg(0)) 3174 .setMIFlags(Flags); 3175 3176 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3177 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3178 3179 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3180 .addUse(RDst.getReg(0)) 3181 .addUse(RHS) 3182 .addUse(LHS) 3183 .setMIFlags(Flags); 3184 3185 MI.eraseFromParent(); 3186 return true; 3187 } 3188 3189 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3190 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3191 static void toggleSPDenormMode(bool Enable, 3192 MachineIRBuilder &B, 3193 const GCNSubtarget &ST, 3194 AMDGPU::SIModeRegisterDefaults Mode) { 3195 // Set SP denorm mode to this value. 3196 unsigned SPDenormMode = 3197 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3198 3199 if (ST.hasDenormModeInst()) { 3200 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3201 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3202 3203 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3204 B.buildInstr(AMDGPU::S_DENORM_MODE) 3205 .addImm(NewDenormModeValue); 3206 3207 } else { 3208 // Select FP32 bit field in mode register. 
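    // Without S_DENORM_MODE, fall back to S_SETREG on the MODE hardware
    // register. The constants below select a 2-bit field starting at bit 4,
    // which holds the FP32 denormal controls (this description is inferred
    // from the offset/width encoding used here, not re-checked against the
    // ISA documentation).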
3209 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3210 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3211 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3212 3213 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3214 .addImm(SPDenormMode) 3215 .addImm(SPDenormModeBitField); 3216 } 3217 } 3218 3219 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3220 MachineRegisterInfo &MRI, 3221 MachineIRBuilder &B) const { 3222 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3223 return true; 3224 3225 Register Res = MI.getOperand(0).getReg(); 3226 Register LHS = MI.getOperand(1).getReg(); 3227 Register RHS = MI.getOperand(2).getReg(); 3228 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3229 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3230 3231 uint16_t Flags = MI.getFlags(); 3232 3233 LLT S32 = LLT::scalar(32); 3234 LLT S1 = LLT::scalar(1); 3235 3236 auto One = B.buildFConstant(S32, 1.0f); 3237 3238 auto DenominatorScaled = 3239 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3240 .addUse(LHS) 3241 .addUse(RHS) 3242 .addImm(0) 3243 .setMIFlags(Flags); 3244 auto NumeratorScaled = 3245 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3246 .addUse(LHS) 3247 .addUse(RHS) 3248 .addImm(1) 3249 .setMIFlags(Flags); 3250 3251 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3252 .addUse(DenominatorScaled.getReg(0)) 3253 .setMIFlags(Flags); 3254 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3255 3256 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3257 // aren't modeled as reading it. 3258 if (!Mode.allFP32Denormals()) 3259 toggleSPDenormMode(true, B, ST, Mode); 3260 3261 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3262 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3263 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3264 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3265 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3266 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3267 3268 if (!Mode.allFP32Denormals()) 3269 toggleSPDenormMode(false, B, ST, Mode); 3270 3271 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3272 .addUse(Fma4.getReg(0)) 3273 .addUse(Fma1.getReg(0)) 3274 .addUse(Fma3.getReg(0)) 3275 .addUse(NumeratorScaled.getReg(1)) 3276 .setMIFlags(Flags); 3277 3278 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3279 .addUse(Fmas.getReg(0)) 3280 .addUse(RHS) 3281 .addUse(LHS) 3282 .setMIFlags(Flags); 3283 3284 MI.eraseFromParent(); 3285 return true; 3286 } 3287 3288 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3289 MachineRegisterInfo &MRI, 3290 MachineIRBuilder &B) const { 3291 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3292 return true; 3293 3294 Register Res = MI.getOperand(0).getReg(); 3295 Register LHS = MI.getOperand(1).getReg(); 3296 Register RHS = MI.getOperand(2).getReg(); 3297 3298 uint16_t Flags = MI.getFlags(); 3299 3300 LLT S64 = LLT::scalar(64); 3301 LLT S1 = LLT::scalar(1); 3302 3303 auto One = B.buildFConstant(S64, 1.0); 3304 3305 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3306 .addUse(LHS) 3307 .addUse(RHS) 3308 .addImm(0) 3309 .setMIFlags(Flags); 3310 3311 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3312 3313 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3314 .addUse(DivScale0.getReg(0)) 3315 .setMIFlags(Flags); 3316 3317 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3318 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3319 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3320 3321 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3322 .addUse(LHS) 3323 .addUse(RHS) 3324 .addImm(1) 3325 .setMIFlags(Flags); 3326 3327 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3328 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3329 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3330 3331 Register Scale; 3332 if (!ST.hasUsableDivScaleConditionOutput()) { 3333 // Workaround a hardware bug on SI where the condition output from div_scale 3334 // is not usable. 3335 3336 LLT S32 = LLT::scalar(32); 3337 3338 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3339 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3340 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3341 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3342 3343 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3344 Scale1Unmerge.getReg(1)); 3345 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3346 Scale0Unmerge.getReg(1)); 3347 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3348 } else { 3349 Scale = DivScale1.getReg(1); 3350 } 3351 3352 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3353 .addUse(Fma4.getReg(0)) 3354 .addUse(Fma3.getReg(0)) 3355 .addUse(Mul.getReg(0)) 3356 .addUse(Scale) 3357 .setMIFlags(Flags); 3358 3359 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3360 .addUse(Fmas.getReg(0)) 3361 .addUse(RHS) 3362 .addUse(LHS) 3363 .setMIFlags(Flags); 3364 3365 MI.eraseFromParent(); 3366 return true; 3367 } 3368 3369 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3370 MachineRegisterInfo &MRI, 3371 MachineIRBuilder &B) const { 3372 Register Res = MI.getOperand(0).getReg(); 3373 Register LHS = MI.getOperand(2).getReg(); 3374 Register RHS = MI.getOperand(3).getReg(); 3375 uint16_t Flags = MI.getFlags(); 3376 3377 LLT S32 = LLT::scalar(32); 3378 LLT S1 = LLT::scalar(1); 3379 3380 auto Abs = B.buildFAbs(S32, RHS, Flags); 3381 const APFloat C0Val(1.0f); 3382 3383 auto C0 = B.buildConstant(S32, 0x6f800000); 3384 auto C1 = B.buildConstant(S32, 0x2f800000); 3385 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3386 3387 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3388 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3389 3390 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3391 3392 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3393 .addUse(Mul0.getReg(0)) 3394 .setMIFlags(Flags); 3395 3396 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3397 3398 B.buildFMul(Res, Sel, Mul1, Flags); 3399 3400 MI.eraseFromParent(); 3401 return true; 3402 } 3403 3404 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3405 // FIXME: Why do we handle this one but not other removed instructions? 3406 // 3407 // Reciprocal square root. The clamp prevents infinite results, clamping 3408 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3409 // +-max_float. 
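// A rough sketch of the expansion implemented below (illustrative summary, not
// additional semantics): rsq_clamp(x) becomes max(min(rsq(x), +max_float),
// -max_float), using the IEEE or non-IEEE min/max forms depending on the
// function's FP mode.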
3410 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3411 MachineRegisterInfo &MRI, 3412 MachineIRBuilder &B) const { 3413 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3414 return true; 3415 3416 Register Dst = MI.getOperand(0).getReg(); 3417 Register Src = MI.getOperand(2).getReg(); 3418 auto Flags = MI.getFlags(); 3419 3420 LLT Ty = MRI.getType(Dst); 3421 3422 const fltSemantics *FltSemantics; 3423 if (Ty == LLT::scalar(32)) 3424 FltSemantics = &APFloat::IEEEsingle(); 3425 else if (Ty == LLT::scalar(64)) 3426 FltSemantics = &APFloat::IEEEdouble(); 3427 else 3428 return false; 3429 3430 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3431 .addUse(Src) 3432 .setMIFlags(Flags); 3433 3434 // We don't need to concern ourselves with the snan handling difference, since 3435 // the rsq quieted (or not) so use the one which will directly select. 3436 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3437 const bool UseIEEE = MFI->getMode().IEEE; 3438 3439 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3440 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3441 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3442 3443 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3444 3445 if (UseIEEE) 3446 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3447 else 3448 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3449 MI.eraseFromParent(); 3450 return true; 3451 } 3452 3453 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3454 switch (IID) { 3455 case Intrinsic::amdgcn_ds_fadd: 3456 return AMDGPU::G_ATOMICRMW_FADD; 3457 case Intrinsic::amdgcn_ds_fmin: 3458 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3459 case Intrinsic::amdgcn_ds_fmax: 3460 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3461 default: 3462 llvm_unreachable("not a DS FP intrinsic"); 3463 } 3464 } 3465 3466 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3467 MachineInstr &MI, 3468 Intrinsic::ID IID) const { 3469 GISelChangeObserver &Observer = Helper.Observer; 3470 Observer.changingInstr(MI); 3471 3472 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3473 3474 // The remaining operands were used to set fields in the MemOperand on 3475 // construction. 3476 for (int I = 6; I > 3; --I) 3477 MI.RemoveOperand(I); 3478 3479 MI.RemoveOperand(1); // Remove the intrinsic ID. 
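// With the intrinsic ID and memory-operand arguments stripped, the remaining
// (dst, ptr, value) operands are assumed to match the layout of the generic
// atomic opcodes, so no further rewriting is needed before notifying the
// observer.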
3480 Observer.changedInstr(MI); 3481 return true; 3482 } 3483 3484 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3485 MachineRegisterInfo &MRI, 3486 MachineIRBuilder &B) const { 3487 uint64_t Offset = 3488 ST.getTargetLowering()->getImplicitParameterOffset( 3489 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3490 LLT DstTy = MRI.getType(DstReg); 3491 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3492 3493 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3494 if (!loadInputValue(KernargPtrReg, B, 3495 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3496 return false; 3497 3498 // FIXME: This should be nuw 3499 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3500 return true; 3501 } 3502 3503 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3504 MachineRegisterInfo &MRI, 3505 MachineIRBuilder &B) const { 3506 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3507 if (!MFI->isEntryFunction()) { 3508 return legalizePreloadedArgIntrin(MI, MRI, B, 3509 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3510 } 3511 3512 Register DstReg = MI.getOperand(0).getReg(); 3513 if (!getImplicitArgPtr(DstReg, MRI, B)) 3514 return false; 3515 3516 MI.eraseFromParent(); 3517 return true; 3518 } 3519 3520 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3521 MachineRegisterInfo &MRI, 3522 MachineIRBuilder &B, 3523 unsigned AddrSpace) const { 3524 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3525 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3526 Register Hi32 = Unmerge.getReg(1); 3527 3528 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3529 MI.eraseFromParent(); 3530 return true; 3531 } 3532 3533 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3534 // offset (the offset that is included in bounds checking and swizzling, to be 3535 // split between the instruction's voffset and immoffset fields) and soffset 3536 // (the offset that is excluded from bounds checking and swizzling, to go in 3537 // the instruction's soffset field). This function takes the first kind of 3538 // offset and figures out how to split it between voffset and immoffset. 3539 std::tuple<Register, unsigned, unsigned> 3540 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3541 Register OrigOffset) const { 3542 const unsigned MaxImm = 4095; 3543 Register BaseReg; 3544 unsigned TotalConstOffset; 3545 const LLT S32 = LLT::scalar(32); 3546 MachineRegisterInfo &MRI = *B.getMRI(); 3547 3548 std::tie(BaseReg, TotalConstOffset) = 3549 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3550 3551 unsigned ImmOffset = TotalConstOffset; 3552 3553 // If BaseReg is a pointer, convert it to int. 3554 if (MRI.getType(BaseReg).isPointer()) 3555 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3556 3557 // If the immediate value is too big for the immoffset field, put the value 3558 // and -4096 into the immoffset field so that the value that is copied/added 3559 // for the voffset field is a multiple of 4096, and it stands more chance 3560 // of being CSEd with the copy/add for another similar load/store. 3561 // However, do not do that rounding down to a multiple of 4096 if that is a 3562 // negative number, as it appears to be illegal to have a negative offset 3563 // in the vgpr, even if adding the immediate offset makes it positive. 
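// Worked example (illustrative only): a combined constant offset of 4100
// splits into Overflow = 4096 (4100 & ~4095), which is materialized into the
// voffset register below, and ImmOffset = 4; an offset of 4095 or less stays
// entirely in ImmOffset with Overflow == 0.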
3564 unsigned Overflow = ImmOffset & ~MaxImm; 3565 ImmOffset -= Overflow; 3566 if ((int32_t)Overflow < 0) { 3567 Overflow += ImmOffset; 3568 ImmOffset = 0; 3569 } 3570 3571 if (Overflow != 0) { 3572 if (!BaseReg) { 3573 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3574 } else { 3575 auto OverflowVal = B.buildConstant(S32, Overflow); 3576 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3577 } 3578 } 3579 3580 if (!BaseReg) 3581 BaseReg = B.buildConstant(S32, 0).getReg(0); 3582 3583 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3584 } 3585 3586 /// Handle register layout difference for f16 images for some subtargets. 3587 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3588 MachineRegisterInfo &MRI, 3589 Register Reg, 3590 bool ImageStore) const { 3591 const LLT S16 = LLT::scalar(16); 3592 const LLT S32 = LLT::scalar(32); 3593 LLT StoreVT = MRI.getType(Reg); 3594 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3595 3596 if (ST.hasUnpackedD16VMem()) { 3597 auto Unmerge = B.buildUnmerge(S16, Reg); 3598 3599 SmallVector<Register, 4> WideRegs; 3600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3601 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3602 3603 int NumElts = StoreVT.getNumElements(); 3604 3605 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3606 } 3607 3608 if (ImageStore && ST.hasImageStoreD16Bug()) { 3609 if (StoreVT.getNumElements() == 2) { 3610 SmallVector<Register, 4> PackedRegs; 3611 Reg = B.buildBitcast(S32, Reg).getReg(0); 3612 PackedRegs.push_back(Reg); 3613 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3614 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3615 } 3616 3617 if (StoreVT.getNumElements() == 3) { 3618 SmallVector<Register, 4> PackedRegs; 3619 auto Unmerge = B.buildUnmerge(S16, Reg); 3620 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3621 PackedRegs.push_back(Unmerge.getReg(I)); 3622 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3623 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3624 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3625 } 3626 3627 if (StoreVT.getNumElements() == 4) { 3628 SmallVector<Register, 4> PackedRegs; 3629 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3630 auto Unmerge = B.buildUnmerge(S32, Reg); 3631 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3632 PackedRegs.push_back(Unmerge.getReg(I)); 3633 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3634 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3635 } 3636 3637 llvm_unreachable("invalid data type"); 3638 } 3639 3640 return Reg; 3641 } 3642 3643 Register AMDGPULegalizerInfo::fixStoreSourceType( 3644 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3645 MachineRegisterInfo *MRI = B.getMRI(); 3646 LLT Ty = MRI->getType(VData); 3647 3648 const LLT S16 = LLT::scalar(16); 3649 3650 // Fixup illegal register types for i8 stores. 
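// Scalar s8 and s16 sources are any-extended to s32 here; packed 16-bit
// vector data is instead handled by handleD16VData below when the format path
// requires it.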
3651 if (Ty == LLT::scalar(8) || Ty == S16) { 3652 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3653 return AnyExt; 3654 } 3655 3656 if (Ty.isVector()) { 3657 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3658 if (IsFormat) 3659 return handleD16VData(B, *MRI, VData); 3660 } 3661 } 3662 3663 return VData; 3664 } 3665 3666 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3667 MachineRegisterInfo &MRI, 3668 MachineIRBuilder &B, 3669 bool IsTyped, 3670 bool IsFormat) const { 3671 Register VData = MI.getOperand(1).getReg(); 3672 LLT Ty = MRI.getType(VData); 3673 LLT EltTy = Ty.getScalarType(); 3674 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3675 const LLT S32 = LLT::scalar(32); 3676 3677 VData = fixStoreSourceType(B, VData, IsFormat); 3678 Register RSrc = MI.getOperand(2).getReg(); 3679 3680 MachineMemOperand *MMO = *MI.memoperands_begin(); 3681 const int MemSize = MMO->getSize(); 3682 3683 unsigned ImmOffset; 3684 unsigned TotalOffset; 3685 3686 // The typed intrinsics add an immediate after the registers. 3687 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3688 3689 // The struct intrinsic variants add one additional operand over raw. 3690 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3691 Register VIndex; 3692 int OpOffset = 0; 3693 if (HasVIndex) { 3694 VIndex = MI.getOperand(3).getReg(); 3695 OpOffset = 1; 3696 } 3697 3698 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3699 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3700 3701 unsigned Format = 0; 3702 if (IsTyped) { 3703 Format = MI.getOperand(5 + OpOffset).getImm(); 3704 ++OpOffset; 3705 } 3706 3707 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3708 3709 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3710 if (TotalOffset != 0) 3711 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3712 3713 unsigned Opc; 3714 if (IsTyped) { 3715 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3716 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3717 } else if (IsFormat) { 3718 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3719 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3720 } else { 3721 switch (MemSize) { 3722 case 1: 3723 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3724 break; 3725 case 2: 3726 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3727 break; 3728 default: 3729 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3730 break; 3731 } 3732 } 3733 3734 if (!VIndex) 3735 VIndex = B.buildConstant(S32, 0).getReg(0); 3736 3737 auto MIB = B.buildInstr(Opc) 3738 .addUse(VData) // vdata 3739 .addUse(RSrc) // rsrc 3740 .addUse(VIndex) // vindex 3741 .addUse(VOffset) // voffset 3742 .addUse(SOffset) // soffset 3743 .addImm(ImmOffset); // offset(imm) 3744 3745 if (IsTyped) 3746 MIB.addImm(Format); 3747 3748 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3749 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3750 .addMemOperand(MMO); 3751 3752 MI.eraseFromParent(); 3753 return true; 3754 } 3755 3756 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3757 MachineRegisterInfo &MRI, 3758 MachineIRBuilder &B, 3759 bool IsFormat, 3760 bool IsTyped) const { 3761 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
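// The load path mirrors legalizeBufferStore above: split the offset, choose a
// typed/format/raw pseudo from the element type and memory size, then fix up
// the destination register layout afterwards.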
3762 MachineMemOperand *MMO = *MI.memoperands_begin(); 3763 const int MemSize = MMO->getSize(); 3764 const LLT S32 = LLT::scalar(32); 3765 3766 Register Dst = MI.getOperand(0).getReg(); 3767 Register RSrc = MI.getOperand(2).getReg(); 3768 3769 // The typed intrinsics add an immediate after the registers. 3770 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3771 3772 // The struct intrinsic variants add one additional operand over raw. 3773 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3774 Register VIndex; 3775 int OpOffset = 0; 3776 if (HasVIndex) { 3777 VIndex = MI.getOperand(3).getReg(); 3778 OpOffset = 1; 3779 } 3780 3781 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3782 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3783 3784 unsigned Format = 0; 3785 if (IsTyped) { 3786 Format = MI.getOperand(5 + OpOffset).getImm(); 3787 ++OpOffset; 3788 } 3789 3790 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3791 unsigned ImmOffset; 3792 unsigned TotalOffset; 3793 3794 LLT Ty = MRI.getType(Dst); 3795 LLT EltTy = Ty.getScalarType(); 3796 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3797 const bool Unpacked = ST.hasUnpackedD16VMem(); 3798 3799 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3800 if (TotalOffset != 0) 3801 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3802 3803 unsigned Opc; 3804 3805 if (IsTyped) { 3806 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3807 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3808 } else if (IsFormat) { 3809 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3810 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3811 } else { 3812 switch (MemSize) { 3813 case 1: 3814 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3815 break; 3816 case 2: 3817 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3818 break; 3819 default: 3820 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3821 break; 3822 } 3823 } 3824 3825 Register LoadDstReg; 3826 3827 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3828 LLT UnpackedTy = Ty.changeElementSize(32); 3829 3830 if (IsExtLoad) 3831 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3832 else if (Unpacked && IsD16 && Ty.isVector()) 3833 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3834 else 3835 LoadDstReg = Dst; 3836 3837 if (!VIndex) 3838 VIndex = B.buildConstant(S32, 0).getReg(0); 3839 3840 auto MIB = B.buildInstr(Opc) 3841 .addDef(LoadDstReg) // vdata 3842 .addUse(RSrc) // rsrc 3843 .addUse(VIndex) // vindex 3844 .addUse(VOffset) // voffset 3845 .addUse(SOffset) // soffset 3846 .addImm(ImmOffset); // offset(imm) 3847 3848 if (IsTyped) 3849 MIB.addImm(Format); 3850 3851 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3852 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3853 .addMemOperand(MMO); 3854 3855 if (LoadDstReg != Dst) { 3856 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3857 3858 // The load result register was widened above; truncate or repack it back into the original destination type.
3859 if (IsExtLoad) 3860 B.buildTrunc(Dst, LoadDstReg); 3861 else { 3862 // Repack to original 16-bit vector result 3863 // FIXME: G_TRUNC should work, but legalization currently fails 3864 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3865 SmallVector<Register, 4> Repack; 3866 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3867 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3868 B.buildMerge(Dst, Repack); 3869 } 3870 } 3871 3872 MI.eraseFromParent(); 3873 return true; 3874 } 3875 3876 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3877 MachineIRBuilder &B, 3878 bool IsInc) const { 3879 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3880 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3881 B.buildInstr(Opc) 3882 .addDef(MI.getOperand(0).getReg()) 3883 .addUse(MI.getOperand(2).getReg()) 3884 .addUse(MI.getOperand(3).getReg()) 3885 .cloneMemRefs(MI); 3886 MI.eraseFromParent(); 3887 return true; 3888 } 3889 3890 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3891 switch (IntrID) { 3892 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3893 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3894 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3895 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3896 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3898 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3899 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3901 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3902 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3903 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3904 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3905 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3906 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3907 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3908 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3909 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3910 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3911 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3913 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3914 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3916 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3917 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3918 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3919 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3920 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3921 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3922 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3923 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3925 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3926 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3928 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3929 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3931 case Intrinsic::amdgcn_buffer_atomic_fadd: 3932 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3933 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3934 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3935 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3936 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3937 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3938 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3939 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3940 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3941 default: 3942 llvm_unreachable("unhandled atomic opcode"); 3943 } 3944 } 3945 3946 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3947 MachineIRBuilder &B, 3948 Intrinsic::ID IID) const { 3949 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3950 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3951 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3952 3953 Register Dst; 3954 3955 int OpOffset = 0; 3956 if (HasReturn) { 3957 // A few FP atomics do not support return values. 3958 Dst = MI.getOperand(0).getReg(); 3959 } else { 3960 OpOffset = -1; 3961 } 3962 3963 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3964 Register CmpVal; 3965 3966 if (IsCmpSwap) { 3967 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3968 ++OpOffset; 3969 } 3970 3971 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3972 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3973 3974 // The struct intrinsic variants add one additional operand over raw. 3975 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3976 Register VIndex; 3977 if (HasVIndex) { 3978 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3979 ++OpOffset; 3980 } 3981 3982 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3983 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3984 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3985 3986 MachineMemOperand *MMO = *MI.memoperands_begin(); 3987 3988 unsigned ImmOffset; 3989 unsigned TotalOffset; 3990 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3991 if (TotalOffset != 0) 3992 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3993 3994 if (!VIndex) 3995 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3996 3997 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3998 3999 if (HasReturn) 4000 MIB.addDef(Dst); 4001 4002 MIB.addUse(VData); // vdata 4003 4004 if (IsCmpSwap) 4005 MIB.addReg(CmpVal); 4006 4007 MIB.addUse(RSrc) // rsrc 4008 .addUse(VIndex) // vindex 4009 .addUse(VOffset) // voffset 4010 .addUse(SOffset) // soffset 4011 .addImm(ImmOffset) // offset(imm) 4012 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4013 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4014 .addMemOperand(MMO); 4015 4016 MI.eraseFromParent(); 4017 return true; 4018 } 4019 4020 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4021 /// vector with s16 typed elements. 4022 static void packImageA16AddressToDwords( 4023 MachineIRBuilder &B, MachineInstr &MI, 4024 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4025 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4026 const LLT S16 = LLT::scalar(16); 4027 const LLT V2S16 = LLT::vector(2, 16); 4028 4029 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4030 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4031 if (!SrcOp.isReg()) 4032 continue; // _L to _LZ may have eliminated this. 4033 4034 Register AddrReg = SrcOp.getReg(); 4035 4036 if (I < Intr->GradientStart) { 4037 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4038 PackedAddrs.push_back(AddrReg); 4039 } else { 4040 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4041 // derivatives dx/dh and dx/dv are packed with undef. 
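// Illustrative example (assumed operand layout): a 2D derivative set
// {dx/dh, dy/dh, dx/dv, dy/dv} packs into two v2s16 registers, while the 1D
// set {dx/dh, dx/dv} pairs each lone derivative with undef, as noted above.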
4042 if (((I + 1) >= EndIdx) || 4043 ((Intr->NumGradients / 2) % 2 == 1 && 4044 (I == static_cast<unsigned>(Intr->GradientStart + 4045 (Intr->NumGradients / 2) - 1) || 4046 I == static_cast<unsigned>(Intr->GradientStart + 4047 Intr->NumGradients - 1))) || 4048 // Check for _L to _LZ optimization 4049 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4050 PackedAddrs.push_back( 4051 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4052 .getReg(0)); 4053 } else { 4054 PackedAddrs.push_back( 4055 B.buildBuildVector( 4056 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4057 .getReg(0)); 4058 ++I; 4059 } 4060 } 4061 } 4062 } 4063 4064 /// Convert from separate vaddr components to a single vector address register, 4065 /// and replace the remaining operands with $noreg. 4066 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4067 int DimIdx, int NumVAddrs) { 4068 const LLT S32 = LLT::scalar(32); 4069 4070 SmallVector<Register, 8> AddrRegs; 4071 for (int I = 0; I != NumVAddrs; ++I) { 4072 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4073 if (SrcOp.isReg()) { 4074 AddrRegs.push_back(SrcOp.getReg()); 4075 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4076 } 4077 } 4078 4079 int NumAddrRegs = AddrRegs.size(); 4080 if (NumAddrRegs != 1) { 4081 // Round up to 8 elements for v5-v7 4082 // FIXME: Missing intermediate sized register classes and instructions. 4083 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4084 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4085 auto Undef = B.buildUndef(S32); 4086 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4087 NumAddrRegs = RoundedNumRegs; 4088 } 4089 4090 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4091 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4092 } 4093 4094 for (int I = 1; I != NumVAddrs; ++I) { 4095 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4096 if (SrcOp.isReg()) 4097 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4098 } 4099 } 4100 4101 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4102 /// 4103 /// Depending on the subtarget, loads and stores with 16-bit element data need to be 4104 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4105 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4106 /// registers. 4107 /// 4108 /// We don't want to directly select image instructions just yet, but also want 4109 /// to expose all register repacking to the legalizer/combiners. We also don't 4110 /// want a selected instruction entering RegBankSelect. In order to avoid 4111 /// defining a multitude of intermediate image instructions, directly hack on 4112 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4113 /// now-unnecessary arguments with $noreg. 4114 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4115 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4116 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4117 4118 const unsigned NumDefs = MI.getNumExplicitDefs(); 4119 const unsigned ArgOffset = NumDefs + 1; 4120 bool IsTFE = NumDefs == 2; 4121 // We are only processing the operands of d16 image operations on subtargets 4122 // that use the unpacked register layout, or need to repack the TFE result. 4123 4124 // TODO: Do we need to guard against already legalized intrinsics?
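// The MIMG base opcode table entry supplies the Store/Atomic/Gather4/AtomicX2
// flags that drive the dmask and data-layout handling below.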
4125 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4126 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4127 4128 MachineRegisterInfo *MRI = B.getMRI(); 4129 const LLT S32 = LLT::scalar(32); 4130 const LLT S16 = LLT::scalar(16); 4131 const LLT V2S16 = LLT::vector(2, 16); 4132 4133 unsigned DMask = 0; 4134 4135 // Check for 16 bit addresses and pack if true. 4136 LLT GradTy = 4137 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4138 LLT AddrTy = 4139 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4140 const bool IsG16 = GradTy == S16; 4141 const bool IsA16 = AddrTy == S16; 4142 4143 int DMaskLanes = 0; 4144 if (!BaseOpcode->Atomic) { 4145 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4146 if (BaseOpcode->Gather4) { 4147 DMaskLanes = 4; 4148 } else if (DMask != 0) { 4149 DMaskLanes = countPopulation(DMask); 4150 } else if (!IsTFE && !BaseOpcode->Store) { 4151 // If dmask is 0, this is a no-op load. This can be eliminated. 4152 B.buildUndef(MI.getOperand(0)); 4153 MI.eraseFromParent(); 4154 return true; 4155 } 4156 } 4157 4158 Observer.changingInstr(MI); 4159 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4160 4161 unsigned NewOpcode = NumDefs == 0 ? 4162 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4163 4164 // Track that we legalized this 4165 MI.setDesc(B.getTII().get(NewOpcode)); 4166 4167 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4168 // dmask to be at least 1 otherwise the instruction will fail 4169 if (IsTFE && DMask == 0) { 4170 DMask = 0x1; 4171 DMaskLanes = 1; 4172 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4173 } 4174 4175 if (BaseOpcode->Atomic) { 4176 Register VData0 = MI.getOperand(2).getReg(); 4177 LLT Ty = MRI->getType(VData0); 4178 4179 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4180 if (Ty.isVector()) 4181 return false; 4182 4183 if (BaseOpcode->AtomicX2) { 4184 Register VData1 = MI.getOperand(3).getReg(); 4185 // The two values are packed in one register. 4186 LLT PackedTy = LLT::vector(2, Ty); 4187 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4188 MI.getOperand(2).setReg(Concat.getReg(0)); 4189 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4190 } 4191 } 4192 4193 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4194 4195 // Optimize _L to _LZ when _L is zero 4196 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4197 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4198 const ConstantFP *ConstantLod; 4199 4200 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4201 m_GFCst(ConstantLod))) { 4202 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4203 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4204 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4205 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4206 Intr->Dim); 4207 4208 // The starting indexes should remain in the same place. 
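// Only the operand count changes: operands before the LOD keep their
// positions, so the intrinsic info table entry can simply be swapped for the
// _lz variant.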
4209 --CorrectedNumVAddrs; 4210 4211 MI.getOperand(MI.getNumExplicitDefs()) 4212 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4213 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4214 Intr = NewImageDimIntr; 4215 } 4216 } 4217 } 4218 4219 // Optimize _mip away, when 'lod' is zero 4220 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4221 int64_t ConstantLod; 4222 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4223 m_ICst(ConstantLod))) { 4224 if (ConstantLod == 0) { 4225 // TODO: Change intrinsic opcode and remove operand instead or replacing 4226 // it with 0, as the _L to _LZ handling is done above. 4227 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4228 --CorrectedNumVAddrs; 4229 } 4230 } 4231 } 4232 4233 // Rewrite the addressing register layout before doing anything else. 4234 if (IsA16 || IsG16) { 4235 if (IsA16) { 4236 // Target must support the feature and gradients need to be 16 bit too 4237 if (!ST.hasA16() || !IsG16) 4238 return false; 4239 } else if (!ST.hasG16()) 4240 return false; 4241 4242 if (Intr->NumVAddrs > 1) { 4243 SmallVector<Register, 4> PackedRegs; 4244 // Don't compress addresses for G16 4245 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4246 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4247 PackEndIdx); 4248 4249 if (!IsA16) { 4250 // Add uncompressed address 4251 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4252 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4253 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4254 PackedRegs.push_back(AddrReg); 4255 } 4256 } 4257 4258 // See also below in the non-a16 branch 4259 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4260 4261 if (!UseNSA && PackedRegs.size() > 1) { 4262 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4263 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4264 PackedRegs[0] = Concat.getReg(0); 4265 PackedRegs.resize(1); 4266 } 4267 4268 const unsigned NumPacked = PackedRegs.size(); 4269 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4270 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4271 if (!SrcOp.isReg()) { 4272 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4273 continue; 4274 } 4275 4276 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4277 4278 if (I - Intr->VAddrStart < NumPacked) 4279 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4280 else 4281 SrcOp.setReg(AMDGPU::NoRegister); 4282 } 4283 } 4284 } else { 4285 // If the register allocator cannot place the address registers contiguously 4286 // without introducing moves, then using the non-sequential address encoding 4287 // is always preferable, since it saves VALU instructions and is usually a 4288 // wash in terms of code size or even better. 4289 // 4290 // However, we currently have no way of hinting to the register allocator 4291 // that MIMG addresses should be placed contiguously when it is possible to 4292 // do so, so force non-NSA for the common 2-address case as a heuristic. 4293 // 4294 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4295 // allocation when possible. 
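// Note that CorrectedNumVAddrs reflects the LOD/mip folding done above, so
// this threshold is applied to the reduced address operand count.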
4296 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4297 4298 if (!UseNSA && Intr->NumVAddrs > 1) 4299 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4300 Intr->NumVAddrs); 4301 } 4302 4303 int Flags = 0; 4304 if (IsA16) 4305 Flags |= 1; 4306 if (IsG16) 4307 Flags |= 2; 4308 MI.addOperand(MachineOperand::CreateImm(Flags)); 4309 4310 if (BaseOpcode->Store) { // No TFE for stores? 4311 // TODO: Handle dmask trim 4312 Register VData = MI.getOperand(1).getReg(); 4313 LLT Ty = MRI->getType(VData); 4314 if (!Ty.isVector() || Ty.getElementType() != S16) 4315 return true; 4316 4317 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4318 if (RepackedReg != VData) { 4319 MI.getOperand(1).setReg(RepackedReg); 4320 } 4321 4322 return true; 4323 } 4324 4325 Register DstReg = MI.getOperand(0).getReg(); 4326 LLT Ty = MRI->getType(DstReg); 4327 const LLT EltTy = Ty.getScalarType(); 4328 const bool IsD16 = Ty.getScalarType() == S16; 4329 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4330 4331 // Confirm that the return type is large enough for the dmask specified. 4332 if (NumElts < DMaskLanes) 4333 return false; 4334 4335 if (NumElts > 4 || DMaskLanes > 4) 4336 return false; 4337 4338 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4339 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4340 4341 // The raw dword aligned data component of the load. The only legal cases 4342 // where this matters should be when using the packed D16 format, for 4343 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4344 LLT RoundedTy; 4345 4346 // S32 vector to cover all data, plus TFE result element. 4347 LLT TFETy; 4348 4349 // Register type to use for each loaded component. Will be S32 or V2S16. 4350 LLT RegTy; 4351 4352 if (IsD16 && ST.hasUnpackedD16VMem()) { 4353 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4354 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4355 RegTy = S32; 4356 } else { 4357 unsigned EltSize = EltTy.getSizeInBits(); 4358 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4359 unsigned RoundedSize = 32 * RoundedElts; 4360 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4361 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4362 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4363 } 4364 4365 // The return type does not need adjustment. 4366 // TODO: Should we change s16 case to s32 or <2 x s16>? 4367 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4368 return true; 4369 4370 Register Dst1Reg; 4371 4372 // Insert after the instruction. 4373 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4374 4375 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4376 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4377 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4378 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4379 4380 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4381 4382 MI.getOperand(0).setReg(NewResultReg); 4383 4384 // In the IR, TFE is supposed to be used with a 2 element struct return 4385 // type. The instruction really returns these two values in one contiguous 4386 // register, with one additional dword beyond the loaded data. Rewrite the 4387 // return type to use a single register result. 4388 4389 if (IsTFE) { 4390 Dst1Reg = MI.getOperand(1).getReg(); 4391 if (MRI->getType(Dst1Reg) != S32) 4392 return false; 4393 4394 // TODO: Make sure the TFE operand bit is set.
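// Drop the separate error-flag def; both the loaded data and the TFE dword
// are produced in NewResultReg and are split back apart below.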
4395 MI.RemoveOperand(1); 4396 4397 // Handle the easy case that requires no repack instructions. 4398 if (Ty == S32) { 4399 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4400 return true; 4401 } 4402 } 4403 4404 // Now figure out how to copy the new result register back into the old 4405 // result. 4406 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4407 4408 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4409 4410 if (ResultNumRegs == 1) { 4411 assert(!IsTFE); 4412 ResultRegs[0] = NewResultReg; 4413 } else { 4414 // We have to repack into a new vector of some kind. 4415 for (int I = 0; I != NumDataRegs; ++I) 4416 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4417 B.buildUnmerge(ResultRegs, NewResultReg); 4418 4419 // Drop the final TFE element to get the data part. The TFE result is 4420 // directly written to the right place already. 4421 if (IsTFE) 4422 ResultRegs.resize(NumDataRegs); 4423 } 4424 4425 // For an s16 scalar result, we form an s32 result with a truncate regardless 4426 // of packed vs. unpacked. 4427 if (IsD16 && !Ty.isVector()) { 4428 B.buildTrunc(DstReg, ResultRegs[0]); 4429 return true; 4430 } 4431 4432 // Avoid a build/concat_vector of 1 entry. 4433 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4434 B.buildBitcast(DstReg, ResultRegs[0]); 4435 return true; 4436 } 4437 4438 assert(Ty.isVector()); 4439 4440 if (IsD16) { 4441 // For packed D16 results with TFE enabled, all the data components are 4442 // S32. Cast back to the expected type. 4443 // 4444 // TODO: We don't really need to use load s32 elements. We would only need one 4445 // cast for the TFE result if a multiple of v2s16 was used. 4446 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4447 for (Register &Reg : ResultRegs) 4448 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4449 } else if (ST.hasUnpackedD16VMem()) { 4450 for (Register &Reg : ResultRegs) 4451 Reg = B.buildTrunc(S16, Reg).getReg(0); 4452 } 4453 } 4454 4455 auto padWithUndef = [&](LLT Ty, int NumElts) { 4456 if (NumElts == 0) 4457 return; 4458 Register Undef = B.buildUndef(Ty).getReg(0); 4459 for (int I = 0; I != NumElts; ++I) 4460 ResultRegs.push_back(Undef); 4461 }; 4462 4463 // Pad out any elements eliminated due to the dmask. 4464 LLT ResTy = MRI->getType(ResultRegs[0]); 4465 if (!ResTy.isVector()) { 4466 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4467 B.buildBuildVector(DstReg, ResultRegs); 4468 return true; 4469 } 4470 4471 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4472 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4473 4474 // Deal with the one annoying legal case. 
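// <3 x s16> does not fill a whole number of dwords, so pad out to <6 x s16>
// (three dwords), then unmerge and discard the scratch upper half.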
4475 const LLT V3S16 = LLT::vector(3, 16); 4476 if (Ty == V3S16) { 4477 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4478 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4479 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4480 return true; 4481 } 4482 4483 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4484 B.buildConcatVectors(DstReg, ResultRegs); 4485 return true; 4486 } 4487 4488 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4489 LegalizerHelper &Helper, MachineInstr &MI) const { 4490 MachineIRBuilder &B = Helper.MIRBuilder; 4491 GISelChangeObserver &Observer = Helper.Observer; 4492 4493 Register Dst = MI.getOperand(0).getReg(); 4494 LLT Ty = B.getMRI()->getType(Dst); 4495 unsigned Size = Ty.getSizeInBits(); 4496 MachineFunction &MF = B.getMF(); 4497 4498 Observer.changingInstr(MI); 4499 4500 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4501 Ty = getBitcastRegisterType(Ty); 4502 Helper.bitcastDst(MI, Ty, 0); 4503 Dst = MI.getOperand(0).getReg(); 4504 B.setInsertPt(B.getMBB(), MI); 4505 } 4506 4507 // FIXME: We don't really need this intermediate instruction. The intrinsic 4508 // should be fixed to have a memory operand. Since it's readnone, we're not 4509 // allowed to add one. 4510 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4511 MI.RemoveOperand(1); // Remove intrinsic ID 4512 4513 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4514 // TODO: Should this use datalayout alignment? 4515 const unsigned MemSize = (Size + 7) / 8; 4516 const Align MemAlign(4); 4517 MachineMemOperand *MMO = MF.getMachineMemOperand( 4518 MachinePointerInfo(), 4519 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4520 MachineMemOperand::MOInvariant, 4521 MemSize, MemAlign); 4522 MI.addMemOperand(MF, MMO); 4523 4524 // There are no 96-bit result scalar loads, but widening to 128-bit should 4525 // always be legal. We may need to restore this to a 96-bit result if it turns 4526 // out this needs to be converted to a vector load during RegBankSelect. 4527 if (!isPowerOf2_32(Size)) { 4528 if (Ty.isVector()) 4529 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4530 else 4531 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4532 } 4533 4534 Observer.changedInstr(MI); 4535 return true; 4536 } 4537 4538 // TODO: Move to selection 4539 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4540 MachineRegisterInfo &MRI, 4541 MachineIRBuilder &B) const { 4542 if (!ST.isTrapHandlerEnabled() || 4543 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4544 return legalizeTrapEndpgm(MI, MRI, B); 4545 4546 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4547 switch (*HsaAbiVer) { 4548 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4549 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4550 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4551 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4552 return ST.supportsGetDoorbellID() ? 
4553 legalizeTrapHsa(MI, MRI, B) : 4554 legalizeTrapHsaQueuePtr(MI, MRI, B); 4555 } 4556 } 4557 4558 llvm_unreachable("Unknown trap handler"); 4559 } 4560 4561 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4562 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4563 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4564 MI.eraseFromParent(); 4565 return true; 4566 } 4567 4568 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4569 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4570 // Pass queue pointer to trap handler as input, and insert trap instruction 4571 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4572 Register LiveIn = 4573 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4574 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4575 return false; 4576 4577 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4578 B.buildCopy(SGPR01, LiveIn); 4579 B.buildInstr(AMDGPU::S_TRAP) 4580 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4581 .addReg(SGPR01, RegState::Implicit); 4582 4583 MI.eraseFromParent(); 4584 return true; 4585 } 4586 4587 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4588 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4589 B.buildInstr(AMDGPU::S_TRAP) 4590 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4591 MI.eraseFromParent(); 4592 return true; 4593 } 4594 4595 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4596 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4597 // Is non-HSA path or trap-handler disabled? then, report a warning 4598 // accordingly 4599 if (!ST.isTrapHandlerEnabled() || 4600 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4601 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4602 "debugtrap handler not supported", 4603 MI.getDebugLoc(), DS_Warning); 4604 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4605 Ctx.diagnose(NoTrap); 4606 } else { 4607 // Insert debug-trap instruction 4608 B.buildInstr(AMDGPU::S_TRAP) 4609 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4610 } 4611 4612 MI.eraseFromParent(); 4613 return true; 4614 } 4615 4616 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4617 MachineIRBuilder &B) const { 4618 MachineRegisterInfo &MRI = *B.getMRI(); 4619 const LLT S16 = LLT::scalar(16); 4620 const LLT S32 = LLT::scalar(32); 4621 4622 Register DstReg = MI.getOperand(0).getReg(); 4623 Register NodePtr = MI.getOperand(2).getReg(); 4624 Register RayExtent = MI.getOperand(3).getReg(); 4625 Register RayOrigin = MI.getOperand(4).getReg(); 4626 Register RayDir = MI.getOperand(5).getReg(); 4627 Register RayInvDir = MI.getOperand(6).getReg(); 4628 Register TDescr = MI.getOperand(7).getReg(); 4629 4630 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4631 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4632 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4633 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4634 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4635 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4636 4637 SmallVector<Register, 12> Ops; 4638 if (Is64) { 4639 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4640 Ops.push_back(Unmerge.getReg(0)); 4641 Ops.push_back(Unmerge.getReg(1)); 4642 } else { 4643 Ops.push_back(NodePtr); 4644 } 4645 Ops.push_back(RayExtent); 4646 4647 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4648 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4649 Ops.push_back(Unmerge.getReg(0)); 4650 Ops.push_back(Unmerge.getReg(1)); 4651 Ops.push_back(Unmerge.getReg(2)); 4652 }; 4653 4654 packLanes(RayOrigin); 4655 if (IsA16) { 4656 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4657 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4658 Register R1 = MRI.createGenericVirtualRegister(S32); 4659 Register R2 = MRI.createGenericVirtualRegister(S32); 4660 Register R3 = MRI.createGenericVirtualRegister(S32); 4661 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4662 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4663 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4664 Ops.push_back(R1); 4665 Ops.push_back(R2); 4666 Ops.push_back(R3); 4667 } else { 4668 packLanes(RayDir); 4669 packLanes(RayInvDir); 4670 } 4671 4672 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4673 .addDef(DstReg) 4674 .addImm(Opcode); 4675 4676 for (Register R : Ops) { 4677 MIB.addUse(R); 4678 } 4679 4680 MIB.addUse(TDescr) 4681 .addImm(IsA16 ? 1 : 0) 4682 .cloneMemRefs(MI); 4683 4684 MI.eraseFromParent(); 4685 return true; 4686 } 4687 4688 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4689 MachineInstr &MI) const { 4690 MachineIRBuilder &B = Helper.MIRBuilder; 4691 MachineRegisterInfo &MRI = *B.getMRI(); 4692 4693 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4694 auto IntrID = MI.getIntrinsicID(); 4695 switch (IntrID) { 4696 case Intrinsic::amdgcn_if: 4697 case Intrinsic::amdgcn_else: { 4698 MachineInstr *Br = nullptr; 4699 MachineBasicBlock *UncondBrTarget = nullptr; 4700 bool Negated = false; 4701 if (MachineInstr *BrCond = 4702 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4703 const SIRegisterInfo *TRI 4704 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4705 4706 Register Def = MI.getOperand(1).getReg(); 4707 Register Use = MI.getOperand(3).getReg(); 4708 4709 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4710 4711 if (Negated) 4712 std::swap(CondBrTarget, UncondBrTarget); 4713 4714 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4715 if (IntrID == Intrinsic::amdgcn_if) { 4716 B.buildInstr(AMDGPU::SI_IF) 4717 .addDef(Def) 4718 .addUse(Use) 4719 .addMBB(UncondBrTarget); 4720 } else { 4721 B.buildInstr(AMDGPU::SI_ELSE) 4722 .addDef(Def) 4723 .addUse(Use) 4724 .addMBB(UncondBrTarget); 4725 } 4726 4727 if (Br) { 4728 Br->getOperand(0).setMBB(CondBrTarget); 4729 } else { 4730 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4731 // since we're swapping branch targets it needs to be reinserted. 
4732 // FIXME: IRTranslator should probably not do this 4733 B.buildBr(*CondBrTarget); 4734 } 4735 4736 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4737 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4738 MI.eraseFromParent(); 4739 BrCond->eraseFromParent(); 4740 return true; 4741 } 4742 4743 return false; 4744 } 4745 case Intrinsic::amdgcn_loop: { 4746 MachineInstr *Br = nullptr; 4747 MachineBasicBlock *UncondBrTarget = nullptr; 4748 bool Negated = false; 4749 if (MachineInstr *BrCond = 4750 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4751 const SIRegisterInfo *TRI 4752 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4753 4754 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4755 Register Reg = MI.getOperand(2).getReg(); 4756 4757 if (Negated) 4758 std::swap(CondBrTarget, UncondBrTarget); 4759 4760 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4761 B.buildInstr(AMDGPU::SI_LOOP) 4762 .addUse(Reg) 4763 .addMBB(UncondBrTarget); 4764 4765 if (Br) 4766 Br->getOperand(0).setMBB(CondBrTarget); 4767 else 4768 B.buildBr(*CondBrTarget); 4769 4770 MI.eraseFromParent(); 4771 BrCond->eraseFromParent(); 4772 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4773 return true; 4774 } 4775 4776 return false; 4777 } 4778 case Intrinsic::amdgcn_kernarg_segment_ptr: 4779 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4780 // This only makes sense to call in a kernel, so just lower to null. 4781 B.buildConstant(MI.getOperand(0).getReg(), 0); 4782 MI.eraseFromParent(); 4783 return true; 4784 } 4785 4786 return legalizePreloadedArgIntrin( 4787 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4788 case Intrinsic::amdgcn_implicitarg_ptr: 4789 return legalizeImplicitArgPtr(MI, MRI, B); 4790 case Intrinsic::amdgcn_workitem_id_x: 4791 return legalizePreloadedArgIntrin(MI, MRI, B, 4792 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4793 case Intrinsic::amdgcn_workitem_id_y: 4794 return legalizePreloadedArgIntrin(MI, MRI, B, 4795 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4796 case Intrinsic::amdgcn_workitem_id_z: 4797 return legalizePreloadedArgIntrin(MI, MRI, B, 4798 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4799 case Intrinsic::amdgcn_workgroup_id_x: 4800 return legalizePreloadedArgIntrin(MI, MRI, B, 4801 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4802 case Intrinsic::amdgcn_workgroup_id_y: 4803 return legalizePreloadedArgIntrin(MI, MRI, B, 4804 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4805 case Intrinsic::amdgcn_workgroup_id_z: 4806 return legalizePreloadedArgIntrin(MI, MRI, B, 4807 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4808 case Intrinsic::amdgcn_dispatch_ptr: 4809 return legalizePreloadedArgIntrin(MI, MRI, B, 4810 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4811 case Intrinsic::amdgcn_queue_ptr: 4812 return legalizePreloadedArgIntrin(MI, MRI, B, 4813 AMDGPUFunctionArgInfo::QUEUE_PTR); 4814 case Intrinsic::amdgcn_implicit_buffer_ptr: 4815 return legalizePreloadedArgIntrin( 4816 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4817 case Intrinsic::amdgcn_dispatch_id: 4818 return legalizePreloadedArgIntrin(MI, MRI, B, 4819 AMDGPUFunctionArgInfo::DISPATCH_ID); 4820 case Intrinsic::amdgcn_fdiv_fast: 4821 return legalizeFDIVFastIntrin(MI, MRI, B); 4822 case Intrinsic::amdgcn_is_shared: 4823 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4824 case Intrinsic::amdgcn_is_private: 4825 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4826 case Intrinsic::amdgcn_wavefrontsize: { 4827 
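// The wavefront size is a fixed subtarget property (32 or 64), so the
// intrinsic simply folds to a constant.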
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4828 MI.eraseFromParent(); 4829 return true; 4830 } 4831 case Intrinsic::amdgcn_s_buffer_load: 4832 return legalizeSBufferLoad(Helper, MI); 4833 case Intrinsic::amdgcn_raw_buffer_store: 4834 case Intrinsic::amdgcn_struct_buffer_store: 4835 return legalizeBufferStore(MI, MRI, B, false, false); 4836 case Intrinsic::amdgcn_raw_buffer_store_format: 4837 case Intrinsic::amdgcn_struct_buffer_store_format: 4838 return legalizeBufferStore(MI, MRI, B, false, true); 4839 case Intrinsic::amdgcn_raw_tbuffer_store: 4840 case Intrinsic::amdgcn_struct_tbuffer_store: 4841 return legalizeBufferStore(MI, MRI, B, true, true); 4842 case Intrinsic::amdgcn_raw_buffer_load: 4843 case Intrinsic::amdgcn_struct_buffer_load: 4844 return legalizeBufferLoad(MI, MRI, B, false, false); 4845 case Intrinsic::amdgcn_raw_buffer_load_format: 4846 case Intrinsic::amdgcn_struct_buffer_load_format: 4847 return legalizeBufferLoad(MI, MRI, B, true, false); 4848 case Intrinsic::amdgcn_raw_tbuffer_load: 4849 case Intrinsic::amdgcn_struct_tbuffer_load: 4850 return legalizeBufferLoad(MI, MRI, B, true, true); 4851 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4852 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4853 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4854 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4855 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4856 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4857 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4858 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4859 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4860 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4861 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4862 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4863 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4864 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4865 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4866 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4867 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4868 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4869 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4870 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4871 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4872 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4873 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4874 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4875 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4876 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4877 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4878 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4879 case Intrinsic::amdgcn_buffer_atomic_fadd: 4880 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4881 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4882 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4883 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4884 return legalizeBufferAtomic(MI, B, IntrID); 4885 case Intrinsic::amdgcn_atomic_inc: 4886 return legalizeAtomicIncDec(MI, B, true); 4887 case Intrinsic::amdgcn_atomic_dec: 4888 return legalizeAtomicIncDec(MI, B, false); 4889 case Intrinsic::trap: 4890 return legalizeTrapIntrinsic(MI, MRI, B); 4891 case Intrinsic::debugtrap: 4892 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4893 case Intrinsic::amdgcn_rsq_clamp: 4894 return legalizeRsqClampIntrinsic(MI, MRI, B); 4895 case Intrinsic::amdgcn_ds_fadd: 4896 case Intrinsic::amdgcn_ds_fmin: 4897 case Intrinsic::amdgcn_ds_fmax: 4898 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4899 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4900 return legalizeBVHIntrinsic(MI, B); 4901 default: { 4902 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4903 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4904 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4905 return true; 4906 } 4907 } 4908 4909 return true; 4910 } 4911