1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the MachineLegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use " 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeNumElements(Pow2NElts); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returns true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
172 } 173 174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 178 }; 179 } 180 181 static bool isRegisterSize(unsigned Size) { 182 return Size % 32 == 0 && Size <= MaxRegisterSize; 183 } 184 185 static bool isRegisterVectorElementType(LLT EltTy) { 186 const int EltSize = EltTy.getSizeInBits(); 187 return EltSize == 16 || EltSize % 32 == 0; 188 } 189 190 static bool isRegisterVectorType(LLT Ty) { 191 const int EltSize = Ty.getElementType().getSizeInBits(); 192 return EltSize == 32 || EltSize == 64 || 193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 194 EltSize == 128 || EltSize == 256; 195 } 196 197 static bool isRegisterType(LLT Ty) { 198 if (!isRegisterSize(Ty.getSizeInBits())) 199 return false; 200 201 if (Ty.isVector()) 202 return isRegisterVectorType(Ty); 203 204 return true; 205 } 206 207 // Any combination of 32 or 64-bit elements up to the maximum register size, and 208 // multiples of v2s16. 209 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 210 return [=](const LegalityQuery &Query) { 211 return isRegisterType(Query.Types[TypeIdx]); 212 }; 213 } 214 215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 216 return [=](const LegalityQuery &Query) { 217 const LLT QueryTy = Query.Types[TypeIdx]; 218 if (!QueryTy.isVector()) 219 return false; 220 const LLT EltTy = QueryTy.getElementType(); 221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 222 }; 223 } 224 225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 226 return [=](const LegalityQuery &Query) { 227 const LLT Ty = Query.Types[TypeIdx]; 228 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 229 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 230 }; 231 } 232 233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 234 // handle some operations by just promoting the register during 235 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 237 bool IsLoad) { 238 switch (AS) { 239 case AMDGPUAS::PRIVATE_ADDRESS: 240 // FIXME: Private element size. 241 return ST.enableFlatScratch() ? 128 : 32; 242 case AMDGPUAS::LOCAL_ADDRESS: 243 return ST.useDS128() ? 128 : 64; 244 case AMDGPUAS::GLOBAL_ADDRESS: 245 case AMDGPUAS::CONSTANT_ADDRESS: 246 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 247 // Treat constant and global as identical. SMRD loads are sometimes usable for 248 // global loads (ideally constant address space should be eliminated) 249 // depending on the context. Legality cannot be context dependent, but 250 // RegBankSelect can split the load as necessary depending on the pointer 251 // register bank/uniformity and if the memory is invariant or not written in a 252 // kernel. 253 return IsLoad ? 512 : 128; 254 default: 255 // Flat addresses may contextually need to be split to 32-bit parts if they 256 // may alias scratch depending on the subtarget.
257 return 128; 258 } 259 } 260 261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 262 const LegalityQuery &Query) { 263 const LLT Ty = Query.Types[0]; 264 265 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 266 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 267 268 unsigned RegSize = Ty.getSizeInBits(); 269 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 270 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 271 unsigned AS = Query.Types[1].getAddressSpace(); 272 273 // All of these need to be custom lowered to cast the pointer operand. 274 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 275 return false; 276 277 // TODO: We should be able to widen loads if the alignment is high enough, but 278 // we also need to modify the memory access size. 279 #if 0 280 // Accept widening loads based on alignment. 281 if (IsLoad && MemSize < Size) 282 MemSize = std::max(MemSize, Align); 283 #endif 284 285 // Only 1-byte and 2-byte to 32-bit extloads are valid. 286 if (MemSize != RegSize && RegSize != 32) 287 return false; 288 289 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 290 return false; 291 292 switch (MemSize) { 293 case 8: 294 case 16: 295 case 32: 296 case 64: 297 case 128: 298 break; 299 case 96: 300 if (!ST.hasDwordx3LoadStores()) 301 return false; 302 break; 303 case 256: 304 case 512: 305 // These may contextually need to be broken down. 306 break; 307 default: 308 return false; 309 } 310 311 assert(RegSize >= MemSize); 312 313 if (AlignBits < MemSize) { 314 const SITargetLowering *TLI = ST.getTargetLowering(); 315 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 316 Align(AlignBits / 8))) 317 return false; 318 } 319 320 return true; 321 } 322 323 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 324 // work around this. Eventually it should ignore the type for loads and only care 325 // about the size. Return true in cases where we will work around this for now by 326 // bitcasting. 327 static bool loadStoreBitcastWorkaround(const LLT Ty) { 328 if (EnableNewLegality) 329 return false; 330 331 const unsigned Size = Ty.getSizeInBits(); 332 if (Size <= 64) 333 return false; 334 if (!Ty.isVector()) 335 return true; 336 337 LLT EltTy = Ty.getElementType(); 338 if (EltTy.isPointer()) 339 return true; 340 341 unsigned EltSize = EltTy.getSizeInBits(); 342 return EltSize != 32 && EltSize != 64; 343 } 344 345 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 346 const LLT Ty = Query.Types[0]; 347 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 348 !loadStoreBitcastWorkaround(Ty); 349 } 350 351 /// Return true if a load or store of the type should be lowered with a bitcast 352 /// to a different type. 353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 354 const unsigned MemSizeInBits) { 355 const unsigned Size = Ty.getSizeInBits(); 356 if (Size != MemSizeInBits) 357 return Size <= 32 && Ty.isVector(); 358 359 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 360 return true; 361 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 362 !isRegisterVectorElementType(Ty.getElementType()); 363 } 364 365 /// Return true if we should legalize a load by widening an odd sized memory 366 /// access up to the alignment. Note this is the case where the memory access itself 367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 369 unsigned AlignInBits, unsigned AddrSpace, 370 unsigned Opcode) { 371 // We don't want to widen cases that are naturally legal. 372 if (isPowerOf2_32(SizeInBits)) 373 return false; 374 375 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 376 // end up widening these for a scalar load during RegBankSelect, since there 377 // aren't 96-bit scalar loads. 378 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 379 return false; 380 381 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 382 return false; 383 384 // A load is known dereferenceable up to the alignment, so it's legal to widen 385 // to it. 386 // 387 // TODO: Could check dereferenceable for less aligned cases. 388 unsigned RoundedSize = NextPowerOf2(SizeInBits); 389 if (AlignInBits < RoundedSize) 390 return false; 391 392 // Do not widen if it would introduce a slow unaligned load. 393 const SITargetLowering *TLI = ST.getTargetLowering(); 394 bool Fast = false; 395 return TLI->allowsMisalignedMemoryAccessesImpl( 396 RoundedSize, AddrSpace, Align(AlignInBits / 8), 397 MachineMemOperand::MOLoad, &Fast) && 398 Fast; 399 } 400 401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 402 unsigned Opcode) { 403 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 404 return false; 405 406 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 407 Query.MMODescrs[0].AlignInBits, 408 Query.Types[1].getAddressSpace(), Opcode); 409 } 410 411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 412 const GCNTargetMachine &TM) 413 : ST(ST_) { 414 using namespace TargetOpcode; 415 416 auto GetAddrSpacePtr = [&TM](unsigned AS) { 417 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 418 }; 419 420 const LLT S1 = LLT::scalar(1); 421 const LLT S8 = LLT::scalar(8); 422 const LLT S16 = LLT::scalar(16); 423 const LLT S32 = LLT::scalar(32); 424 const LLT S64 = LLT::scalar(64); 425 const LLT S128 = LLT::scalar(128); 426 const LLT S256 = LLT::scalar(256); 427 const LLT S512 = LLT::scalar(512); 428 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 429 430 const LLT V2S8 = LLT::vector(2, 8); 431 const LLT V2S16 = LLT::vector(2, 16); 432 const LLT V4S16 = LLT::vector(4, 16); 433 434 const LLT V2S32 = LLT::vector(2, 32); 435 const LLT V3S32 = LLT::vector(3, 32); 436 const LLT V4S32 = LLT::vector(4, 32); 437 const LLT V5S32 = LLT::vector(5, 32); 438 const LLT V6S32 = LLT::vector(6, 32); 439 const LLT V7S32 = LLT::vector(7, 32); 440 const LLT V8S32 = LLT::vector(8, 32); 441 const LLT V9S32 = LLT::vector(9, 32); 442 const LLT V10S32 = LLT::vector(10, 32); 443 const LLT V11S32 = LLT::vector(11, 32); 444 const LLT V12S32 = LLT::vector(12, 32); 445 const LLT V13S32 = LLT::vector(13, 32); 446 const LLT V14S32 = LLT::vector(14, 32); 447 const LLT V15S32 = LLT::vector(15, 32); 448 const LLT V16S32 = LLT::vector(16, 32); 449 const LLT V32S32 = LLT::vector(32, 32); 450 451 const LLT V2S64 = LLT::vector(2, 64); 452 const LLT V3S64 = LLT::vector(3, 64); 453 const LLT V4S64 = LLT::vector(4, 64); 454 const LLT V5S64 = LLT::vector(5, 64); 455 const LLT V6S64 = LLT::vector(6, 64); 456 const LLT V7S64 = LLT::vector(7, 64); 457 const LLT V8S64 = LLT::vector(8, 64); 458 const LLT V16S64 = LLT::vector(16, 64); 459 460 std::initializer_list<LLT> AllS32Vectors = 461 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 462 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 463 
std::initializer_list<LLT> AllS64Vectors = 464 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 465 466 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 467 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 468 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 469 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 470 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 471 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 472 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 473 474 const LLT CodePtr = FlatPtr; 475 476 const std::initializer_list<LLT> AddrSpaces64 = { 477 GlobalPtr, ConstantPtr, FlatPtr 478 }; 479 480 const std::initializer_list<LLT> AddrSpaces32 = { 481 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 482 }; 483 484 const std::initializer_list<LLT> FPTypesBase = { 485 S32, S64 486 }; 487 488 const std::initializer_list<LLT> FPTypes16 = { 489 S32, S64, S16 490 }; 491 492 const std::initializer_list<LLT> FPTypesPK16 = { 493 S32, S64, S16, V2S16 494 }; 495 496 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 497 498 setAction({G_BRCOND, S1}, Legal); // VCC branches 499 setAction({G_BRCOND, S32}, Legal); // SCC branches 500 501 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 502 // elements for v3s16 503 getActionDefinitionsBuilder(G_PHI) 504 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 505 .legalFor(AllS32Vectors) 506 .legalFor(AllS64Vectors) 507 .legalFor(AddrSpaces64) 508 .legalFor(AddrSpaces32) 509 .legalIf(isPointer(0)) 510 .clampScalar(0, S16, S256) 511 .widenScalarToNextPow2(0, 32) 512 .clampMaxNumElements(0, S32, 16) 513 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 514 .scalarize(0); 515 516 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 517 // Full set of gfx9 features. 518 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 519 .legalFor({S32, S16, V2S16}) 520 .clampScalar(0, S16, S32) 521 .clampMaxNumElements(0, S16, 2) 522 .scalarize(0) 523 .widenScalarToNextPow2(0, 32); 524 525 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 526 .legalFor({S32, S16, V2S16}) // Clamp modifier 527 .minScalarOrElt(0, S16) 528 .clampMaxNumElements(0, S16, 2) 529 .scalarize(0) 530 .widenScalarToNextPow2(0, 32) 531 .lower(); 532 } else if (ST.has16BitInsts()) { 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16}) 535 .clampScalar(0, S16, S32) 536 .scalarize(0) 537 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 538 539 // Technically the saturating operations require clamp bit support, but this 540 // was introduced at the same time as 16-bit operations. 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 542 .legalFor({S32, S16}) // Clamp modifier 543 .minScalar(0, S16) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 16) 546 .lower(); 547 548 // We're just lowering this, but it helps get a better result to try to 549 // coerce to the desired type first. 550 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 551 .minScalar(0, S16) 552 .scalarize(0) 553 .lower(); 554 } else { 555 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 556 .legalFor({S32}) 557 .clampScalar(0, S32, S32) 558 .scalarize(0); 559 560 if (ST.hasIntClamp()) { 561 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 562 .legalFor({S32}) // Clamp modifier. 
563 .scalarize(0) 564 .minScalarOrElt(0, S32) 565 .lower(); 566 } else { 567 // Clamp bit support was added in VI, along with 16-bit operations. 568 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 569 .minScalar(0, S32) 570 .scalarize(0) 571 .lower(); 572 } 573 574 // FIXME: DAG expansion gets better results. The widening uses the smaller 575 // range values and goes for the min/max lowering directly. 576 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 577 .minScalar(0, S32) 578 .scalarize(0) 579 .lower(); 580 } 581 582 getActionDefinitionsBuilder( 583 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 584 .customFor({S32, S64}) 585 .clampScalar(0, S32, S64) 586 .widenScalarToNextPow2(0, 32) 587 .scalarize(0); 588 589 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 590 .legalFor({S32}) 591 .maxScalarOrElt(0, S32); 592 593 if (ST.hasVOP3PInsts()) { 594 Mulh 595 .clampMaxNumElements(0, S8, 2) 596 .lowerFor({V2S8}); 597 } 598 599 Mulh 600 .scalarize(0) 601 .lower(); 602 603 // Report legal for any types we can handle anywhere. For the cases only legal 604 // on the SALU, RegBankSelect will be able to re-legalize. 605 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 606 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 607 .clampScalar(0, S32, S64) 608 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 609 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 610 .widenScalarToNextPow2(0) 611 .scalarize(0); 612 613 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 614 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 615 .legalFor({{S32, S1}, {S32, S32}}) 616 .minScalar(0, S32) 617 // TODO: .scalarize(0) 618 .lower(); 619 620 getActionDefinitionsBuilder(G_BITCAST) 621 // Don't worry about the size constraint. 622 .legalIf(all(isRegisterType(0), isRegisterType(1))) 623 .lower(); 624 625 626 getActionDefinitionsBuilder(G_CONSTANT) 627 .legalFor({S1, S32, S64, S16, GlobalPtr, 628 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 629 .legalIf(isPointer(0)) 630 .clampScalar(0, S32, S64) 631 .widenScalarToNextPow2(0); 632 633 getActionDefinitionsBuilder(G_FCONSTANT) 634 .legalFor({S32, S64, S16}) 635 .clampScalar(0, S16, S64); 636 637 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 638 .legalIf(isRegisterType(0)) 639 // s1 and s16 are special cases because they have legal operations on 640 // them, but don't really occupy registers in the normal way. 641 .legalFor({S1, S16}) 642 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 643 .clampScalarOrElt(0, S32, MaxScalar) 644 .widenScalarToNextPow2(0, 32) 645 .clampMaxNumElements(0, S32, 16); 646 647 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 648 649 // If the amount is divergent, we have to do a wave reduction to get the 650 // maximum value, so this is expanded during RegBankSelect. 
651 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 652 .legalFor({{PrivatePtr, S32}}); 653 654 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 655 .customIf(typeIsNot(0, PrivatePtr)); 656 657 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 658 659 auto &FPOpActions = getActionDefinitionsBuilder( 660 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 661 .legalFor({S32, S64}); 662 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 663 .customFor({S32, S64}); 664 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 665 .customFor({S32, S64}); 666 667 if (ST.has16BitInsts()) { 668 if (ST.hasVOP3PInsts()) 669 FPOpActions.legalFor({S16, V2S16}); 670 else 671 FPOpActions.legalFor({S16}); 672 673 TrigActions.customFor({S16}); 674 FDIVActions.customFor({S16}); 675 } 676 677 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 678 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 679 680 if (ST.hasVOP3PInsts()) { 681 MinNumMaxNum.customFor(FPTypesPK16) 682 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 683 .clampMaxNumElements(0, S16, 2) 684 .clampScalar(0, S16, S64) 685 .scalarize(0); 686 } else if (ST.has16BitInsts()) { 687 MinNumMaxNum.customFor(FPTypes16) 688 .clampScalar(0, S16, S64) 689 .scalarize(0); 690 } else { 691 MinNumMaxNum.customFor(FPTypesBase) 692 .clampScalar(0, S32, S64) 693 .scalarize(0); 694 } 695 696 if (ST.hasVOP3PInsts()) 697 FPOpActions.clampMaxNumElements(0, S16, 2); 698 699 FPOpActions 700 .scalarize(0) 701 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 702 703 TrigActions 704 .scalarize(0) 705 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 706 707 FDIVActions 708 .scalarize(0) 709 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 710 711 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 712 .legalFor(FPTypesPK16) 713 .clampMaxNumElements(0, S16, 2) 714 .scalarize(0) 715 .clampScalar(0, S16, S64); 716 717 if (ST.has16BitInsts()) { 718 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 719 .legalFor({S32, S64, S16}) 720 .scalarize(0) 721 .clampScalar(0, S16, S64); 722 } else { 723 getActionDefinitionsBuilder(G_FSQRT) 724 .legalFor({S32, S64}) 725 .scalarize(0) 726 .clampScalar(0, S32, S64); 727 728 if (ST.hasFractBug()) { 729 getActionDefinitionsBuilder(G_FFLOOR) 730 .customFor({S64}) 731 .legalFor({S32, S64}) 732 .scalarize(0) 733 .clampScalar(0, S32, S64); 734 } else { 735 getActionDefinitionsBuilder(G_FFLOOR) 736 .legalFor({S32, S64}) 737 .scalarize(0) 738 .clampScalar(0, S32, S64); 739 } 740 } 741 742 getActionDefinitionsBuilder(G_FPTRUNC) 743 .legalFor({{S32, S64}, {S16, S32}}) 744 .scalarize(0) 745 .lower(); 746 747 getActionDefinitionsBuilder(G_FPEXT) 748 .legalFor({{S64, S32}, {S32, S16}}) 749 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 750 .scalarize(0); 751 752 getActionDefinitionsBuilder(G_FSUB) 753 // Use actual fsub instruction 754 .legalFor({S32}) 755 // Must use fadd + fneg 756 .lowerFor({S64, S16, V2S16}) 757 .scalarize(0) 758 .clampScalar(0, S32, S64); 759 760 // Whether this is legal depends on the floating point mode for the function. 
761 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 762 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 763 FMad.customFor({S32, S16}); 764 else if (ST.hasMadMacF32Insts()) 765 FMad.customFor({S32}); 766 else if (ST.hasMadF16()) 767 FMad.customFor({S16}); 768 FMad.scalarize(0) 769 .lower(); 770 771 auto &FRem = getActionDefinitionsBuilder(G_FREM); 772 if (ST.has16BitInsts()) { 773 FRem.customFor({S16, S32, S64}); 774 } else { 775 FRem.minScalar(0, S32) 776 .customFor({S32, S64}); 777 } 778 FRem.scalarize(0); 779 780 // TODO: Do we need to clamp maximum bitwidth? 781 getActionDefinitionsBuilder(G_TRUNC) 782 .legalIf(isScalar(0)) 783 .legalFor({{V2S16, V2S32}}) 784 .clampMaxNumElements(0, S16, 2) 785 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 786 // situations (like an invalid implicit use), we don't want to infinite loop 787 // in the legalizer. 788 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 789 .alwaysLegal(); 790 791 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 792 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 793 {S32, S1}, {S64, S1}, {S16, S1}}) 794 .scalarize(0) 795 .clampScalar(0, S32, S64) 796 .widenScalarToNextPow2(1, 32); 797 798 // TODO: Split s1->s64 during regbankselect for VALU. 799 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 800 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 801 .lowerFor({{S32, S64}}) 802 .lowerIf(typeIs(1, S1)) 803 .customFor({{S64, S64}}); 804 if (ST.has16BitInsts()) 805 IToFP.legalFor({{S16, S16}}); 806 IToFP.clampScalar(1, S32, S64) 807 .minScalar(0, S32) 808 .scalarize(0) 809 .widenScalarToNextPow2(1); 810 811 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 812 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 813 .customFor({{S64, S64}}) 814 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 815 if (ST.has16BitInsts()) 816 FPToI.legalFor({{S16, S16}}); 817 else 818 FPToI.minScalar(1, S32); 819 820 FPToI.minScalar(0, S32) 821 .widenScalarToNextPow2(0, 32) 822 .scalarize(0) 823 .lower(); 824 825 // Lower roundeven into G_FRINT 826 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 827 .scalarize(0) 828 .lower(); 829 830 if (ST.has16BitInsts()) { 831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 832 .legalFor({S16, S32, S64}) 833 .clampScalar(0, S16, S64) 834 .scalarize(0); 835 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 836 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 837 .legalFor({S32, S64}) 838 .clampScalar(0, S32, S64) 839 .scalarize(0); 840 } else { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S32}) 843 .customFor({S64}) 844 .clampScalar(0, S32, S64) 845 .scalarize(0); 846 } 847 848 getActionDefinitionsBuilder(G_PTR_ADD) 849 .legalIf(all(isPointer(0), sameSize(0, 1))) 850 .scalarize(0) 851 .scalarSameSizeAs(1, 0); 852 853 getActionDefinitionsBuilder(G_PTRMASK) 854 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 855 .scalarSameSizeAs(1, 0) 856 .scalarize(0); 857 858 auto &CmpBuilder = 859 getActionDefinitionsBuilder(G_ICMP) 860 // The compare output type differs based on the register bank of the output, 861 // so make both s1 and s32 legal. 862 // 863 // Scalar compares producing output in scc will be promoted to s32, as that 864 // is the allocatable register type that will be needed for the copy from 865 // scc. 
This will be promoted during RegBankSelect, and we assume something 866 // before that won't try to use s32 result types. 867 // 868 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 869 // bank. 870 .legalForCartesianProduct( 871 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 872 .legalForCartesianProduct( 873 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 874 if (ST.has16BitInsts()) { 875 CmpBuilder.legalFor({{S1, S16}}); 876 } 877 878 CmpBuilder 879 .widenScalarToNextPow2(1) 880 .clampScalar(1, S32, S64) 881 .scalarize(0) 882 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 883 884 getActionDefinitionsBuilder(G_FCMP) 885 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 886 .widenScalarToNextPow2(1) 887 .clampScalar(1, S32, S64) 888 .scalarize(0); 889 890 // FIXME: fpow has a selection pattern that should move to custom lowering. 891 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 892 if (ST.has16BitInsts()) 893 Exp2Ops.legalFor({S32, S16}); 894 else 895 Exp2Ops.legalFor({S32}); 896 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 897 Exp2Ops.scalarize(0); 898 899 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 900 if (ST.has16BitInsts()) 901 ExpOps.customFor({{S32}, {S16}}); 902 else 903 ExpOps.customFor({S32}); 904 ExpOps.clampScalar(0, MinScalarFPTy, S32) 905 .scalarize(0); 906 907 getActionDefinitionsBuilder(G_FPOWI) 908 .clampScalar(0, MinScalarFPTy, S32) 909 .lower(); 910 911 // The 64-bit versions produce 32-bit results, but only on the SALU. 912 getActionDefinitionsBuilder(G_CTPOP) 913 .legalFor({{S32, S32}, {S32, S64}}) 914 .clampScalar(0, S32, S32) 915 .clampScalar(1, S32, S64) 916 .scalarize(0) 917 .widenScalarToNextPow2(0, 32) 918 .widenScalarToNextPow2(1, 32); 919 920 // The hardware instructions return a different result on 0 than the generic 921 // instructions expect. The hardware produces -1, but these produce the 922 // bitwidth. 923 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 924 .scalarize(0) 925 .clampScalar(0, S32, S32) 926 .clampScalar(1, S32, S64) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32) 929 .lower(); 930 931 // The 64-bit versions produce 32-bit results, but only on the SALU. 932 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 933 .legalFor({{S32, S32}, {S32, S64}}) 934 .clampScalar(0, S32, S32) 935 .clampScalar(1, S32, S64) 936 .scalarize(0) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32); 939 940 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 941 // RegBankSelect. 942 getActionDefinitionsBuilder(G_BITREVERSE) 943 .legalFor({S32, S64}) 944 .clampScalar(0, S32, S64) 945 .scalarize(0) 946 .widenScalarToNextPow2(0); 947 948 if (ST.has16BitInsts()) { 949 getActionDefinitionsBuilder(G_BSWAP) 950 .legalFor({S16, S32, V2S16}) 951 .clampMaxNumElements(0, S16, 2) 952 // FIXME: Fixing non-power-of-2 before clamp is workaround for 953 // narrowScalar limitation. 
954 .widenScalarToNextPow2(0) 955 .clampScalar(0, S16, S32) 956 .scalarize(0); 957 958 if (ST.hasVOP3PInsts()) { 959 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 960 .legalFor({S32, S16, V2S16}) 961 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 962 .clampMaxNumElements(0, S16, 2) 963 .minScalar(0, S16) 964 .widenScalarToNextPow2(0) 965 .scalarize(0) 966 .lower(); 967 } else { 968 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 969 .legalFor({S32, S16}) 970 .widenScalarToNextPow2(0) 971 .minScalar(0, S16) 972 .scalarize(0) 973 .lower(); 974 } 975 } else { 976 // TODO: Should have same legality without v_perm_b32 977 getActionDefinitionsBuilder(G_BSWAP) 978 .legalFor({S32}) 979 .lowerIf(scalarNarrowerThan(0, 32)) 980 // FIXME: Fixing non-power-of-2 before clamp is workaround for 981 // narrowScalar limitation. 982 .widenScalarToNextPow2(0) 983 .maxScalar(0, S32) 984 .scalarize(0) 985 .lower(); 986 987 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 988 .legalFor({S32}) 989 .minScalar(0, S32) 990 .widenScalarToNextPow2(0) 991 .scalarize(0) 992 .lower(); 993 } 994 995 getActionDefinitionsBuilder(G_INTTOPTR) 996 // List the common cases 997 .legalForCartesianProduct(AddrSpaces64, {S64}) 998 .legalForCartesianProduct(AddrSpaces32, {S32}) 999 .scalarize(0) 1000 // Accept any address space as long as the size matches 1001 .legalIf(sameSize(0, 1)) 1002 .widenScalarIf(smallerThan(1, 0), 1003 [](const LegalityQuery &Query) { 1004 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1005 }) 1006 .narrowScalarIf(largerThan(1, 0), 1007 [](const LegalityQuery &Query) { 1008 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1009 }); 1010 1011 getActionDefinitionsBuilder(G_PTRTOINT) 1012 // List the common cases 1013 .legalForCartesianProduct(AddrSpaces64, {S64}) 1014 .legalForCartesianProduct(AddrSpaces32, {S32}) 1015 .scalarize(0) 1016 // Accept any address space as long as the size matches 1017 .legalIf(sameSize(0, 1)) 1018 .widenScalarIf(smallerThan(0, 1), 1019 [](const LegalityQuery &Query) { 1020 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1021 }) 1022 .narrowScalarIf( 1023 largerThan(0, 1), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1029 .scalarize(0) 1030 .custom(); 1031 1032 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1033 bool IsLoad) -> bool { 1034 const LLT DstTy = Query.Types[0]; 1035 1036 // Split vector extloads. 1037 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1038 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1039 1040 if (MemSize < DstTy.getSizeInBits()) 1041 MemSize = std::max(MemSize, AlignBits); 1042 1043 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1044 return true; 1045 1046 const LLT PtrTy = Query.Types[1]; 1047 unsigned AS = PtrTy.getAddressSpace(); 1048 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1049 return true; 1050 1051 // Catch weird sized loads that don't evenly divide into the access sizes 1052 // TODO: May be able to widen depending on alignment etc. 1053 unsigned NumRegs = (MemSize + 31) / 32; 1054 if (NumRegs == 3) { 1055 if (!ST.hasDwordx3LoadStores()) 1056 return true; 1057 } else { 1058 // If the alignment allows, these should have been widened. 
1059 if (!isPowerOf2_32(NumRegs)) 1060 return true; 1061 } 1062 1063 if (AlignBits < MemSize) { 1064 const SITargetLowering *TLI = ST.getTargetLowering(); 1065 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1066 Align(AlignBits / 8)); 1067 } 1068 1069 return false; 1070 }; 1071 1072 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1073 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1074 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1075 1076 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1077 // LDS 1078 // TODO: Unsupported flat for SI. 1079 1080 for (unsigned Op : {G_LOAD, G_STORE}) { 1081 const bool IsStore = Op == G_STORE; 1082 1083 auto &Actions = getActionDefinitionsBuilder(Op); 1084 // Explicitly list some common cases. 1085 // TODO: Does this help compile time at all? 1086 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1087 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1088 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1089 {S64, GlobalPtr, 64, GlobalAlign32}, 1090 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1091 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1092 {S32, GlobalPtr, 8, GlobalAlign8}, 1093 {S32, GlobalPtr, 16, GlobalAlign16}, 1094 1095 {S32, LocalPtr, 32, 32}, 1096 {S64, LocalPtr, 64, 32}, 1097 {V2S32, LocalPtr, 64, 32}, 1098 {S32, LocalPtr, 8, 8}, 1099 {S32, LocalPtr, 16, 16}, 1100 {V2S16, LocalPtr, 32, 32}, 1101 1102 {S32, PrivatePtr, 32, 32}, 1103 {S32, PrivatePtr, 8, 8}, 1104 {S32, PrivatePtr, 16, 16}, 1105 {V2S16, PrivatePtr, 32, 32}, 1106 1107 {S32, ConstantPtr, 32, GlobalAlign32}, 1108 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1109 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1110 {S64, ConstantPtr, 64, GlobalAlign32}, 1111 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1112 Actions.legalIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return isLoadStoreLegal(ST, Query); 1115 }); 1116 1117 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1118 // 64-bits. 1119 // 1120 // TODO: Should generalize bitcast action into coerce, which will also cover 1121 // inserting addrspacecasts. 1122 Actions.customIf(typeIs(1, Constant32Ptr)); 1123 1124 // Turn any illegal element vectors into something easier to deal 1125 // with. These will ultimately produce 32-bit scalar shifts to extract the 1126 // parts anyway. 1127 // 1128 // For odd 16-bit element vectors, prefer to split those into pieces with 1129 // 16-bit vector parts. 1130 Actions.bitcastIf( 1131 [=](const LegalityQuery &Query) -> bool { 1132 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1133 Query.MMODescrs[0].SizeInBits); 1134 }, bitcastToRegisterType(0)); 1135 1136 if (!IsStore) { 1137 // Widen suitably aligned loads by loading extra bytes. The standard 1138 // legalization actions can't properly express widening memory operands. 
1139 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1140 return shouldWidenLoad(ST, Query, G_LOAD); 1141 }); 1142 } 1143 1144 // FIXME: load/store narrowing should be moved to lower action 1145 Actions 1146 .narrowScalarIf( 1147 [=](const LegalityQuery &Query) -> bool { 1148 return !Query.Types[0].isVector() && 1149 needToSplitMemOp(Query, Op == G_LOAD); 1150 }, 1151 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1152 const LLT DstTy = Query.Types[0]; 1153 const LLT PtrTy = Query.Types[1]; 1154 1155 const unsigned DstSize = DstTy.getSizeInBits(); 1156 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1157 1158 // Split extloads. 1159 if (DstSize > MemSize) 1160 return std::make_pair(0, LLT::scalar(MemSize)); 1161 1162 if (!isPowerOf2_32(DstSize)) { 1163 // We're probably decomposing an odd sized store. Try to split 1164 // to the widest type. TODO: Account for alignment. As-is it 1165 // should be OK, since the new parts will be further legalized. 1166 unsigned FloorSize = PowerOf2Floor(DstSize); 1167 return std::make_pair(0, LLT::scalar(FloorSize)); 1168 } 1169 1170 if (DstSize > 32 && (DstSize % 32 != 0)) { 1171 // FIXME: Need a way to specify non-extload of larger size if 1172 // suitably aligned. 1173 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1174 } 1175 1176 unsigned MaxSize = maxSizeForAddrSpace(ST, 1177 PtrTy.getAddressSpace(), 1178 Op == G_LOAD); 1179 if (MemSize > MaxSize) 1180 return std::make_pair(0, LLT::scalar(MaxSize)); 1181 1182 unsigned Align = Query.MMODescrs[0].AlignInBits; 1183 return std::make_pair(0, LLT::scalar(Align)); 1184 }) 1185 .fewerElementsIf( 1186 [=](const LegalityQuery &Query) -> bool { 1187 return Query.Types[0].isVector() && 1188 needToSplitMemOp(Query, Op == G_LOAD); 1189 }, 1190 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1191 const LLT DstTy = Query.Types[0]; 1192 const LLT PtrTy = Query.Types[1]; 1193 1194 LLT EltTy = DstTy.getElementType(); 1195 unsigned MaxSize = maxSizeForAddrSpace(ST, 1196 PtrTy.getAddressSpace(), 1197 Op == G_LOAD); 1198 1199 // FIXME: Handle widened to power of 2 results better. This ends 1200 // up scalarizing. 1201 // FIXME: 3 element stores scalarized on SI 1202 1203 // Split if it's too large for the address space. 1204 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1205 unsigned NumElts = DstTy.getNumElements(); 1206 unsigned EltSize = EltTy.getSizeInBits(); 1207 1208 if (MaxSize % EltSize == 0) { 1209 return std::make_pair( 1210 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1211 } 1212 1213 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1214 1215 // FIXME: Refine when odd breakdowns handled 1216 // The scalars will need to be re-legalized. 1217 if (NumPieces == 1 || NumPieces >= NumElts || 1218 NumElts % NumPieces != 0) 1219 return std::make_pair(0, EltTy); 1220 1221 return std::make_pair(0, 1222 LLT::vector(NumElts / NumPieces, EltTy)); 1223 } 1224 1225 // FIXME: We could probably handle weird extending loads better. 1226 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1227 if (DstTy.getSizeInBits() > MemSize) 1228 return std::make_pair(0, EltTy); 1229 1230 unsigned EltSize = EltTy.getSizeInBits(); 1231 unsigned DstSize = DstTy.getSizeInBits(); 1232 if (!isPowerOf2_32(DstSize)) { 1233 // We're probably decomposing an odd sized store. Try to split 1234 // to the widest type. TODO: Account for alignment. As-is it 1235 // should be OK, since the new parts will be further legalized. 
1236 unsigned FloorSize = PowerOf2Floor(DstSize); 1237 return std::make_pair( 1238 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1239 } 1240 1241 // Need to split because of alignment. 1242 unsigned Align = Query.MMODescrs[0].AlignInBits; 1243 if (EltSize > Align && 1244 (EltSize / Align < DstTy.getNumElements())) { 1245 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1246 } 1247 1248 // May need relegalization for the scalars. 1249 return std::make_pair(0, EltTy); 1250 }) 1251 .lowerIfMemSizeNotPow2() 1252 .minScalar(0, S32); 1253 1254 if (IsStore) 1255 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1256 1257 Actions 1258 .widenScalarToNextPow2(0) 1259 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1260 .lower(); 1261 } 1262 1263 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1264 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1265 {S32, GlobalPtr, 16, 2 * 8}, 1266 {S32, LocalPtr, 8, 8}, 1267 {S32, LocalPtr, 16, 16}, 1268 {S32, PrivatePtr, 8, 8}, 1269 {S32, PrivatePtr, 16, 16}, 1270 {S32, ConstantPtr, 8, 8}, 1271 {S32, ConstantPtr, 16, 2 * 8}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 ExtLoads.legalForTypesWithMemDesc( 1274 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1275 } 1276 1277 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1278 // 64-bits. 1279 // 1280 // TODO: Should generalize bitcast action into coerce, which will also cover 1281 // inserting addrspacecasts. 1282 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1283 1284 ExtLoads.clampScalar(0, S32, S32) 1285 .widenScalarToNextPow2(0) 1286 .unsupportedIfMemSizeNotPow2() 1287 .lower(); 1288 1289 auto &Atomics = getActionDefinitionsBuilder( 1290 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1291 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1292 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1293 G_ATOMICRMW_UMIN}) 1294 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1295 {S64, GlobalPtr}, {S64, LocalPtr}, 1296 {S32, RegionPtr}, {S64, RegionPtr}}); 1297 if (ST.hasFlatAddressSpace()) { 1298 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1299 } 1300 1301 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1302 if (ST.hasLDSFPAtomics()) { 1303 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1304 if (ST.hasGFX90AInsts()) 1305 Atomic.legalFor({{S64, LocalPtr}}); 1306 } 1307 if (ST.hasAtomicFaddInsts()) 1308 Atomic.legalFor({{S32, GlobalPtr}}); 1309 1310 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1311 // demarshalling 1312 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1313 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1314 {S32, FlatPtr}, {S64, FlatPtr}}) 1315 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1316 {S32, RegionPtr}, {S64, RegionPtr}}); 1317 // TODO: Pointer types, any 32-bit or 64-bit vector 1318 1319 // Condition should be s32 for scalar, s1 for vector. 
1320 getActionDefinitionsBuilder(G_SELECT) 1321 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1322 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1323 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1324 .clampScalar(0, S16, S64) 1325 .scalarize(1) 1326 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1327 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1328 .clampMaxNumElements(0, S32, 2) 1329 .clampMaxNumElements(0, LocalPtr, 2) 1330 .clampMaxNumElements(0, PrivatePtr, 2) 1331 .scalarize(0) 1332 .widenScalarToNextPow2(0) 1333 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1334 1335 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1336 // be more flexible with the shift amount type. 1337 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1338 .legalFor({{S32, S32}, {S64, S32}}); 1339 if (ST.has16BitInsts()) { 1340 if (ST.hasVOP3PInsts()) { 1341 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1342 .clampMaxNumElements(0, S16, 2); 1343 } else 1344 Shifts.legalFor({{S16, S16}}); 1345 1346 // TODO: Support 16-bit shift amounts for all types 1347 Shifts.widenScalarIf( 1348 [=](const LegalityQuery &Query) { 1349 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1350 // 32-bit amount. 1351 const LLT ValTy = Query.Types[0]; 1352 const LLT AmountTy = Query.Types[1]; 1353 return ValTy.getSizeInBits() <= 16 && 1354 AmountTy.getSizeInBits() < 16; 1355 }, changeTo(1, S16)); 1356 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1357 Shifts.clampScalar(1, S32, S32); 1358 Shifts.clampScalar(0, S16, S64); 1359 Shifts.widenScalarToNextPow2(0, 16); 1360 1361 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1362 .minScalar(0, S16) 1363 .scalarize(0) 1364 .lower(); 1365 } else { 1366 // Make sure we legalize the shift amount type first, as the general 1367 // expansion for the shifted type will produce much worse code if it hasn't 1368 // been truncated already. 1369 Shifts.clampScalar(1, S32, S32); 1370 Shifts.clampScalar(0, S32, S64); 1371 Shifts.widenScalarToNextPow2(0, 32); 1372 1373 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1374 .minScalar(0, S32) 1375 .scalarize(0) 1376 .lower(); 1377 } 1378 Shifts.scalarize(0); 1379 1380 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1381 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1382 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1383 unsigned IdxTypeIdx = 2; 1384 1385 getActionDefinitionsBuilder(Op) 1386 .customIf([=](const LegalityQuery &Query) { 1387 const LLT EltTy = Query.Types[EltTypeIdx]; 1388 const LLT VecTy = Query.Types[VecTypeIdx]; 1389 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1390 const unsigned EltSize = EltTy.getSizeInBits(); 1391 return (EltSize == 32 || EltSize == 64) && 1392 VecTy.getSizeInBits() % 32 == 0 && 1393 VecTy.getSizeInBits() <= MaxRegisterSize && 1394 IdxTy.getSizeInBits() == 32; 1395 }) 1396 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1397 bitcastToVectorElement32(VecTypeIdx)) 1398 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1399 .bitcastIf( 1400 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1401 [=](const LegalityQuery &Query) { 1402 // For > 64-bit element types, try to turn this into a 64-bit 1403 // element vector since we may be able to do better indexing 1404 // if this is scalar. If not, fall back to 32. 
const LLT EltTy = Query.Types[EltTypeIdx]; 1406 const LLT VecTy = Query.Types[VecTypeIdx]; 1407 const unsigned DstEltSize = EltTy.getSizeInBits(); 1408 const unsigned VecSize = VecTy.getSizeInBits(); 1409 1410 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1411 return std::make_pair( 1412 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1413 }) 1414 .clampScalar(EltTypeIdx, S32, S64) 1415 .clampScalar(VecTypeIdx, S32, S64) 1416 .clampScalar(IdxTypeIdx, S32, S32) 1417 .clampMaxNumElements(VecTypeIdx, S32, 32) 1418 // TODO: Clamp elements for 64-bit vectors? 1419 // It should only be necessary with variable indexes. 1420 // As a last resort, lower to the stack 1421 .lower(); 1422 } 1423 1424 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1425 .unsupportedIf([=](const LegalityQuery &Query) { 1426 const LLT &EltTy = Query.Types[1].getElementType(); 1427 return Query.Types[0] != EltTy; 1428 }); 1429 1430 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1431 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1432 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1433 1434 // FIXME: Doesn't handle extract of illegal sizes. 1435 getActionDefinitionsBuilder(Op) 1436 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1437 // FIXME: Multiples of 16 should not be legal. 1438 .legalIf([=](const LegalityQuery &Query) { 1439 const LLT BigTy = Query.Types[BigTyIdx]; 1440 const LLT LitTy = Query.Types[LitTyIdx]; 1441 return (BigTy.getSizeInBits() % 32 == 0) && 1442 (LitTy.getSizeInBits() % 16 == 0); 1443 }) 1444 .widenScalarIf( 1445 [=](const LegalityQuery &Query) { 1446 const LLT BigTy = Query.Types[BigTyIdx]; 1447 return (BigTy.getScalarSizeInBits() < 16); 1448 }, 1449 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1450 .widenScalarIf( 1451 [=](const LegalityQuery &Query) { 1452 const LLT LitTy = Query.Types[LitTyIdx]; 1453 return (LitTy.getScalarSizeInBits() < 16); 1454 }, 1455 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1456 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1457 .widenScalarToNextPow2(BigTyIdx, 32); 1458 1459 } 1460 1461 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1462 .legalForCartesianProduct(AllS32Vectors, {S32}) 1463 .legalForCartesianProduct(AllS64Vectors, {S64}) 1464 .clampNumElements(0, V16S32, V32S32) 1465 .clampNumElements(0, V2S64, V16S64) 1466 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1467 1468 if (ST.hasScalarPackInsts()) { 1469 BuildVector 1470 // FIXME: Should probably widen s1 vectors straight to s32 1471 .minScalarOrElt(0, S16) 1472 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1473 .minScalar(1, S32); 1474 1475 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1476 .legalFor({V2S16, S32}) 1477 .lower(); 1478 BuildVector.minScalarOrElt(0, S32); 1479 } else { 1480 BuildVector.customFor({V2S16, S16}); 1481 BuildVector.minScalarOrElt(0, S32); 1482 1483 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1484 .customFor({V2S16, S32}) 1485 .lower(); 1486 } 1487 1488 BuildVector.legalIf(isRegisterType(0)); 1489 1490 // FIXME: Clamp maximum size 1491 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1492 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1493 .clampMaxNumElements(0, S32, 32) 1494 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1495 .clampMaxNumElements(0, S16, 64); 1496 1497 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1498 // pre-legalize.
1499 if (ST.hasVOP3PInsts()) { 1500 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1501 .customFor({V2S16, V2S16}) 1502 .lower(); 1503 } else 1504 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1505 1506 // Merge/Unmerge 1507 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1508 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1509 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1510 1511 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1512 const LLT Ty = Query.Types[TypeIdx]; 1513 if (Ty.isVector()) { 1514 const LLT &EltTy = Ty.getElementType(); 1515 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1516 return true; 1517 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1518 return true; 1519 } 1520 return false; 1521 }; 1522 1523 auto &Builder = getActionDefinitionsBuilder(Op) 1524 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1525 .lowerFor({{S16, V2S16}}) 1526 .lowerIf([=](const LegalityQuery &Query) { 1527 const LLT BigTy = Query.Types[BigTyIdx]; 1528 return BigTy.getSizeInBits() == 32; 1529 }) 1530 // Try to widen to s16 first for small types. 1531 // TODO: Only do this on targets with legal s16 shifts 1532 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1533 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1534 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1535 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1536 elementTypeIs(1, S16)), 1537 changeTo(1, V2S16)) 1538 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1539 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1540 // valid. 1541 .clampScalar(LitTyIdx, S32, S512) 1542 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1543 // Break up vectors with weird elements into scalars 1544 .fewerElementsIf( 1545 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1546 scalarize(0)) 1547 .fewerElementsIf( 1548 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1549 scalarize(1)) 1550 .clampScalar(BigTyIdx, S32, MaxScalar); 1551 1552 if (Op == G_MERGE_VALUES) { 1553 Builder.widenScalarIf( 1554 // TODO: Use 16-bit shifts if legal for 8-bit values? 1555 [=](const LegalityQuery &Query) { 1556 const LLT Ty = Query.Types[LitTyIdx]; 1557 return Ty.getSizeInBits() < 32; 1558 }, 1559 changeTo(LitTyIdx, S32)); 1560 } 1561 1562 Builder.widenScalarIf( 1563 [=](const LegalityQuery &Query) { 1564 const LLT Ty = Query.Types[BigTyIdx]; 1565 return !isPowerOf2_32(Ty.getSizeInBits()) && 1566 Ty.getSizeInBits() % 16 != 0; 1567 }, 1568 [=](const LegalityQuery &Query) { 1569 // Pick the next power of 2, or a multiple of 64 over 128. 1570 // Whichever is smaller. 1571 const LLT &Ty = Query.Types[BigTyIdx]; 1572 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1573 if (NewSizeInBits >= 256) { 1574 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1575 if (RoundedTo < NewSizeInBits) 1576 NewSizeInBits = RoundedTo; 1577 } 1578 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1579 }) 1580 // Any vectors left are the wrong size. Scalarize them. 1581 .scalarize(0) 1582 .scalarize(1); 1583 } 1584 1585 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1586 // RegBankSelect. 
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1588 .legalFor({{S32}, {S64}}); 1589 1590 if (ST.hasVOP3PInsts()) { 1591 SextInReg.lowerFor({{V2S16}}) 1592 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1593 // get more vector shift opportunities, since we'll get those when 1594 // expanded. 1595 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1596 } else if (ST.has16BitInsts()) { 1597 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1598 } else { 1599 // Prefer to promote to s32 before lowering if we don't have 16-bit 1600 // shifts. This avoids a lot of intermediate truncate and extend operations. 1601 SextInReg.lowerFor({{S32}, {S64}}); 1602 } 1603 1604 SextInReg 1605 .scalarize(0) 1606 .clampScalar(0, S32, S64) 1607 .lower(); 1608 1609 // TODO: Only try to form v2s16 with legal packed instructions. 1610 getActionDefinitionsBuilder(G_FSHR) 1611 .legalFor({{S32, S32}}) 1612 .lowerFor({{V2S16, V2S16}}) 1613 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1614 .scalarize(0) 1615 .lower(); 1616 1617 if (ST.hasVOP3PInsts()) { 1618 getActionDefinitionsBuilder(G_FSHL) 1619 .lowerFor({{V2S16, V2S16}}) 1620 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1621 .scalarize(0) 1622 .lower(); 1623 } else { 1624 getActionDefinitionsBuilder(G_FSHL) 1625 .scalarize(0) 1626 .lower(); 1627 } 1628 1629 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1630 .legalFor({S64}); 1631 1632 getActionDefinitionsBuilder(G_FENCE) 1633 .alwaysLegal(); 1634 1635 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1636 .scalarize(0) 1637 .minScalar(0, S32) 1638 .lower(); 1639 1640 getActionDefinitionsBuilder({ 1641 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1642 G_FCOPYSIGN, 1643 1644 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1645 G_ATOMICRMW_NAND, 1646 G_ATOMICRMW_FSUB, 1647 G_READ_REGISTER, 1648 G_WRITE_REGISTER, 1649 1650 G_SADDO, G_SSUBO, 1651 1652 // TODO: Implement 1653 G_FMINIMUM, G_FMAXIMUM}).lower(); 1654 1655 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1656 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1657 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1658 .unsupported(); 1659 1660 computeTables(); 1661 verify(*ST.getInstrInfo()); 1662 } 1663 1664 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1665 MachineInstr &MI) const { 1666 MachineIRBuilder &B = Helper.MIRBuilder; 1667 MachineRegisterInfo &MRI = *B.getMRI(); 1668 1669 switch (MI.getOpcode()) { 1670 case TargetOpcode::G_ADDRSPACE_CAST: 1671 return legalizeAddrSpaceCast(MI, MRI, B); 1672 case TargetOpcode::G_FRINT: 1673 return legalizeFrint(MI, MRI, B); 1674 case TargetOpcode::G_FCEIL: 1675 return legalizeFceil(MI, MRI, B); 1676 case TargetOpcode::G_FREM: 1677 return legalizeFrem(MI, MRI, B); 1678 case TargetOpcode::G_INTRINSIC_TRUNC: 1679 return legalizeIntrinsicTrunc(MI, MRI, B); 1680 case TargetOpcode::G_SITOFP: 1681 return legalizeITOFP(MI, MRI, B, true); 1682 case TargetOpcode::G_UITOFP: 1683 return legalizeITOFP(MI, MRI, B, false); 1684 case TargetOpcode::G_FPTOSI: 1685 return legalizeFPTOI(MI, MRI, B, true); 1686 case TargetOpcode::G_FPTOUI: 1687 return legalizeFPTOI(MI, MRI, B, false); 1688 case TargetOpcode::G_FMINNUM: 1689 case TargetOpcode::G_FMAXNUM: 1690 case TargetOpcode::G_FMINNUM_IEEE: 1691 case TargetOpcode::G_FMAXNUM_IEEE: 1692 return legalizeMinNumMaxNum(Helper, MI); 1693 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1694 return legalizeExtractVectorElt(MI, MRI, B); 1695 case TargetOpcode::G_INSERT_VECTOR_ELT: 1696 return
legalizeInsertVectorElt(MI, MRI, B); 1697 case TargetOpcode::G_SHUFFLE_VECTOR: 1698 return legalizeShuffleVector(MI, MRI, B); 1699 case TargetOpcode::G_FSIN: 1700 case TargetOpcode::G_FCOS: 1701 return legalizeSinCos(MI, MRI, B); 1702 case TargetOpcode::G_GLOBAL_VALUE: 1703 return legalizeGlobalValue(MI, MRI, B); 1704 case TargetOpcode::G_LOAD: 1705 case TargetOpcode::G_SEXTLOAD: 1706 case TargetOpcode::G_ZEXTLOAD: 1707 return legalizeLoad(Helper, MI); 1708 case TargetOpcode::G_FMAD: 1709 return legalizeFMad(MI, MRI, B); 1710 case TargetOpcode::G_FDIV: 1711 return legalizeFDIV(MI, MRI, B); 1712 case TargetOpcode::G_UDIV: 1713 case TargetOpcode::G_UREM: 1714 case TargetOpcode::G_UDIVREM: 1715 return legalizeUnsignedDIV_REM(MI, MRI, B); 1716 case TargetOpcode::G_SDIV: 1717 case TargetOpcode::G_SREM: 1718 case TargetOpcode::G_SDIVREM: 1719 return legalizeSignedDIV_REM(MI, MRI, B); 1720 case TargetOpcode::G_ATOMIC_CMPXCHG: 1721 return legalizeAtomicCmpXChg(MI, MRI, B); 1722 case TargetOpcode::G_FLOG: 1723 return legalizeFlog(MI, B, numbers::ln2f); 1724 case TargetOpcode::G_FLOG10: 1725 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1726 case TargetOpcode::G_FEXP: 1727 return legalizeFExp(MI, B); 1728 case TargetOpcode::G_FPOW: 1729 return legalizeFPow(MI, B); 1730 case TargetOpcode::G_FFLOOR: 1731 return legalizeFFloor(MI, MRI, B); 1732 case TargetOpcode::G_BUILD_VECTOR: 1733 return legalizeBuildVector(MI, MRI, B); 1734 default: 1735 return false; 1736 } 1737 1738 llvm_unreachable("expected switch to return"); 1739 } 1740 1741 Register AMDGPULegalizerInfo::getSegmentAperture( 1742 unsigned AS, 1743 MachineRegisterInfo &MRI, 1744 MachineIRBuilder &B) const { 1745 MachineFunction &MF = B.getMF(); 1746 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1747 const LLT S32 = LLT::scalar(32); 1748 1749 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1750 1751 if (ST.hasApertureRegs()) { 1752 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1753 // getreg. 1754 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1755 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1756 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1757 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1758 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1759 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1760 unsigned Encoding = 1761 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1762 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1763 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1764 1765 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1766 1767 B.buildInstr(AMDGPU::S_GETREG_B32) 1768 .addDef(GetReg) 1769 .addImm(Encoding); 1770 MRI.setType(GetReg, S32); 1771 1772 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1773 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1774 } 1775 1776 Register QueuePtr = MRI.createGenericVirtualRegister( 1777 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1778 1779 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1780 return Register(); 1781 1782 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1783 // private_segment_aperture_base_hi. 1784 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1785 1786 // TODO: can we be smarter about machine pointer info? 
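  // Illustrative sketch of what is built below, not literal output: the
  // aperture is read from the preloaded queue pointer with an invariant
  // load, roughly:
  //   %queue_ptr:p4 = <QUEUE_PTR input SGPR pair>
  //   %addr:p4      = G_PTR_ADD %queue_ptr, 0x40   ; 0x44 for private
  //   %aperture:s32 = G_LOAD %addr                 ; dereferenceable, invariant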
1787 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1788 MachineMemOperand *MMO = MF.getMachineMemOperand( 1789 PtrInfo, 1790 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1791 MachineMemOperand::MOInvariant, 1792 4, commonAlignment(Align(64), StructOffset)); 1793 1794 Register LoadAddr; 1795 1796 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1797 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1798 } 1799 1800 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1801 MachineInstr &MI, MachineRegisterInfo &MRI, 1802 MachineIRBuilder &B) const { 1803 MachineFunction &MF = B.getMF(); 1804 1805 const LLT S32 = LLT::scalar(32); 1806 Register Dst = MI.getOperand(0).getReg(); 1807 Register Src = MI.getOperand(1).getReg(); 1808 1809 LLT DstTy = MRI.getType(Dst); 1810 LLT SrcTy = MRI.getType(Src); 1811 unsigned DestAS = DstTy.getAddressSpace(); 1812 unsigned SrcAS = SrcTy.getAddressSpace(); 1813 1814 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1815 // vector element. 1816 assert(!DstTy.isVector()); 1817 1818 const AMDGPUTargetMachine &TM 1819 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1820 1821 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1822 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1823 return true; 1824 } 1825 1826 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1827 // Truncate. 1828 B.buildExtract(Dst, Src, 0); 1829 MI.eraseFromParent(); 1830 return true; 1831 } 1832 1833 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1834 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1835 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1836 1837 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1838 // another. Merge operands are required to be the same type, but creating an 1839 // extra ptrtoint would be kind of pointless. 1840 auto HighAddr = B.buildConstant( 1841 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1842 B.buildMerge(Dst, {Src, HighAddr}); 1843 MI.eraseFromParent(); 1844 return true; 1845 } 1846 1847 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1848 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1849 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1850 unsigned NullVal = TM.getNullPointerValue(DestAS); 1851 1852 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1853 auto FlatNull = B.buildConstant(SrcTy, 0); 1854 1855 // Extract low 32-bits of the pointer. 1856 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1857 1858 auto CmpRes = 1859 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1860 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1861 1862 MI.eraseFromParent(); 1863 return true; 1864 } 1865 1866 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1867 return false; 1868 1869 if (!ST.hasFlatAddressSpace()) 1870 return false; 1871 1872 auto SegmentNull = 1873 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1874 auto FlatNull = 1875 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1876 1877 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1878 if (!ApertureReg.isValid()) 1879 return false; 1880 1881 auto CmpRes = 1882 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1883 1884 // Coerce the type of the low half of the result so we can use merge_values. 
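  // Illustrative sketch of the local/private -> flat expansion built here,
  // not literal output:
  //   %lo:s32 = G_PTRTOINT %src
  //   %ptr:p0 = G_MERGE_VALUES %lo, %aperture
  //   %cmp:s1 = G_ICMP ne %src, <segment null>
  //   %dst:p0 = G_SELECT %cmp, %ptr, <flat null>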
1885 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1886 1887 // TODO: Should we allow mismatched types but matching sizes in merges to 1888 // avoid the ptrtoint? 1889 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1890 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1891 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 bool AMDGPULegalizerInfo::legalizeFrint( 1897 MachineInstr &MI, MachineRegisterInfo &MRI, 1898 MachineIRBuilder &B) const { 1899 Register Src = MI.getOperand(1).getReg(); 1900 LLT Ty = MRI.getType(Src); 1901 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1902 1903 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1904 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1905 1906 auto C1 = B.buildFConstant(Ty, C1Val); 1907 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1908 1909 // TODO: Should this propagate fast-math-flags? 1910 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1911 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1912 1913 auto C2 = B.buildFConstant(Ty, C2Val); 1914 auto Fabs = B.buildFAbs(Ty, Src); 1915 1916 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1917 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1918 MI.eraseFromParent(); 1919 return true; 1920 } 1921 1922 bool AMDGPULegalizerInfo::legalizeFceil( 1923 MachineInstr &MI, MachineRegisterInfo &MRI, 1924 MachineIRBuilder &B) const { 1925 1926 const LLT S1 = LLT::scalar(1); 1927 const LLT S64 = LLT::scalar(64); 1928 1929 Register Src = MI.getOperand(1).getReg(); 1930 assert(MRI.getType(Src) == S64); 1931 1932 // result = trunc(src) 1933 // if (src > 0.0 && src != result) 1934 // result += 1.0 1935 1936 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1937 1938 const auto Zero = B.buildFConstant(S64, 0.0); 1939 const auto One = B.buildFConstant(S64, 1.0); 1940 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1941 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1942 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1943 auto Add = B.buildSelect(S64, And, One, Zero); 1944 1945 // TODO: Should this propagate fast-math-flags? 
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2044 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2045 B.buildUITOFP(S64, Unmerge.getReg(1)); 2046 2047 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2048 2049 auto ThirtyTwo = B.buildConstant(S32, 32); 2050 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2051 .addUse(CvtHi.getReg(0)) 2052 .addUse(ThirtyTwo.getReg(0)); 2053 2054 // TODO: Should this propagate fast-math-flags? 2055 B.buildFAdd(Dst, LdExp, CvtLo); 2056 MI.eraseFromParent(); 2057 return true; 2058 } 2059 2060 // TODO: Copied from DAG implementation. Verify logic and document how this 2061 // actually works. 2062 bool AMDGPULegalizerInfo::legalizeFPTOI( 2063 MachineInstr &MI, MachineRegisterInfo &MRI, 2064 MachineIRBuilder &B, bool Signed) const { 2065 2066 Register Dst = MI.getOperand(0).getReg(); 2067 Register Src = MI.getOperand(1).getReg(); 2068 2069 const LLT S64 = LLT::scalar(64); 2070 const LLT S32 = LLT::scalar(32); 2071 2072 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2073 2074 unsigned Flags = MI.getFlags(); 2075 2076 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2077 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2078 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2079 2080 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2081 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2082 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2083 2084 auto Hi = Signed ? 2085 B.buildFPTOSI(S32, FloorMul) : 2086 B.buildFPTOUI(S32, FloorMul); 2087 auto Lo = B.buildFPTOUI(S32, Fma); 2088 2089 B.buildMerge(Dst, { Lo, Hi }); 2090 MI.eraseFromParent(); 2091 2092 return true; 2093 } 2094 2095 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2096 MachineInstr &MI) const { 2097 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2098 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2099 2100 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2101 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2102 2103 // With ieee_mode disabled, the instructions have the correct behavior 2104 // already for G_FMINNUM/G_FMAXNUM 2105 if (!MFI->getMode().IEEE) 2106 return !IsIEEEOp; 2107 2108 if (IsIEEEOp) 2109 return true; 2110 2111 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2112 } 2113 2114 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2115 MachineInstr &MI, MachineRegisterInfo &MRI, 2116 MachineIRBuilder &B) const { 2117 // TODO: Should move some of this into LegalizerHelper. 2118 2119 // TODO: Promote dynamic indexing of s16 to s32 2120 2121 // FIXME: Artifact combiner probably should have replaced the truncated 2122 // constant before this, so we shouldn't need 2123 // getConstantVRegValWithLookThrough. 2124 Optional<ValueAndVReg> MaybeIdxVal = 2125 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2126 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2127 return true; 2128 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2129 2130 Register Dst = MI.getOperand(0).getReg(); 2131 Register Vec = MI.getOperand(1).getReg(); 2132 2133 LLT VecTy = MRI.getType(Vec); 2134 LLT EltTy = VecTy.getElementType(); 2135 assert(EltTy == MRI.getType(Dst)); 2136 2137 if (IdxVal < VecTy.getNumElements()) 2138 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2139 else 2140 B.buildUndef(Dst); 2141 2142 MI.eraseFromParent(); 2143 return true; 2144 } 2145 2146 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2147 MachineInstr &MI, MachineRegisterInfo &MRI, 2148 MachineIRBuilder &B) const { 2149 // TODO: Should move some of this into LegalizerHelper. 2150 2151 // TODO: Promote dynamic indexing of s16 to s32 2152 2153 // FIXME: Artifact combiner probably should have replaced the truncated 2154 // constant before this, so we shouldn't need 2155 // getConstantVRegValWithLookThrough. 2156 Optional<ValueAndVReg> MaybeIdxVal = 2157 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2158 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2159 return true; 2160 2161 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2162 Register Dst = MI.getOperand(0).getReg(); 2163 Register Vec = MI.getOperand(1).getReg(); 2164 Register Ins = MI.getOperand(2).getReg(); 2165 2166 LLT VecTy = MRI.getType(Vec); 2167 LLT EltTy = VecTy.getElementType(); 2168 assert(EltTy == MRI.getType(Ins)); 2169 2170 if (IdxVal < VecTy.getNumElements()) 2171 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2172 else 2173 B.buildUndef(Dst); 2174 2175 MI.eraseFromParent(); 2176 return true; 2177 } 2178 2179 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2180 MachineInstr &MI, MachineRegisterInfo &MRI, 2181 MachineIRBuilder &B) const { 2182 const LLT V2S16 = LLT::vector(2, 16); 2183 2184 Register Dst = MI.getOperand(0).getReg(); 2185 Register Src0 = MI.getOperand(1).getReg(); 2186 LLT DstTy = MRI.getType(Dst); 2187 LLT SrcTy = MRI.getType(Src0); 2188 2189 if (SrcTy == V2S16 && DstTy == V2S16 && 2190 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2191 return true; 2192 2193 MachineIRBuilder HelperBuilder(MI); 2194 GISelObserverWrapper DummyObserver; 2195 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2196 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2197 } 2198 2199 bool AMDGPULegalizerInfo::legalizeSinCos( 2200 MachineInstr &MI, MachineRegisterInfo &MRI, 2201 MachineIRBuilder &B) const { 2202 2203 Register DstReg = MI.getOperand(0).getReg(); 2204 Register SrcReg = MI.getOperand(1).getReg(); 2205 LLT Ty = MRI.getType(DstReg); 2206 unsigned Flags = MI.getFlags(); 2207 2208 Register TrigVal; 2209 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2210 if (ST.hasTrigReducedRange()) { 2211 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2212 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2213 .addUse(MulVal.getReg(0)) 2214 .setMIFlags(Flags).getReg(0); 2215 } else 2216 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2217 2218 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2219 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2220 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2221 .addUse(TrigVal) 2222 .setMIFlags(Flags); 2223 MI.eraseFromParent(); 2224 return true; 2225 } 2226 2227 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2228 MachineIRBuilder &B, 2229 const GlobalValue *GV, 2230 int64_t Offset, 2231 unsigned GAFlags) const { 2232 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2233 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2234 // to the following code sequence: 2235 // 2236 // For constant address space: 2237 // s_getpc_b64 s[0:1] 2238 // s_add_u32 s0, s0, $symbol 2239 // s_addc_u32 s1, s1, 0 2240 // 2241 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2242 // a fixup or relocation is emitted to replace $symbol with a literal 2243 // constant, which is a pc-relative offset from the encoding of the $symbol 2244 // operand to the global variable. 2245 // 2246 // For global address space: 2247 // s_getpc_b64 s[0:1] 2248 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2249 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2250 // 2251 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2252 // fixups or relocations are emitted to replace $symbol@*@lo and 2253 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2254 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2255 // operand to the global variable. 2256 // 2257 // What we want here is an offset from the value returned by s_getpc 2258 // (which is the address of the s_add_u32 instruction) to the global 2259 // variable, but since the encoding of $symbol starts 4 bytes after the start 2260 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2261 // small. This requires us to add 4 to the global variable offset in order to 2262 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2263 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2264 // instruction. 2265 2266 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2267 2268 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2269 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2270 2271 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2272 .addDef(PCReg); 2273 2274 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2275 if (GAFlags == SIInstrInfo::MO_NONE) 2276 MIB.addImm(0); 2277 else 2278 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2279 2280 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2281 2282 if (PtrTy.getSizeInBits() == 32) 2283 B.buildExtract(DstReg, PCReg, 0); 2284 return true; 2285 } 2286 2287 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2288 MachineInstr &MI, MachineRegisterInfo &MRI, 2289 MachineIRBuilder &B) const { 2290 Register DstReg = MI.getOperand(0).getReg(); 2291 LLT Ty = MRI.getType(DstReg); 2292 unsigned AS = Ty.getAddressSpace(); 2293 2294 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2295 MachineFunction &MF = B.getMF(); 2296 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2297 2298 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2299 if (!MFI->isModuleEntryFunction() && 2300 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2301 const Function &Fn = MF.getFunction(); 2302 DiagnosticInfoUnsupported BadLDSDecl( 2303 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2304 DS_Warning); 2305 Fn.getContext().diagnose(BadLDSDecl); 2306 2307 // We currently don't have a way to correctly allocate LDS objects that 2308 // aren't directly associated with a kernel. We do force inlining of 2309 // functions that use local objects. However, if these dead functions are 2310 // not eliminated, we don't want a compile time error. Just emit a warning 2311 // and a trap, since there should be no callable path here. 2312 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2313 B.buildUndef(DstReg); 2314 MI.eraseFromParent(); 2315 return true; 2316 } 2317 2318 // TODO: We could emit code to handle the initialization somewhere. 2319 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2320 const SITargetLowering *TLI = ST.getTargetLowering(); 2321 if (!TLI->shouldUseLDSConstAddress(GV)) { 2322 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2323 return true; // Leave in place; 2324 } 2325 2326 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2327 Type *Ty = GV->getValueType(); 2328 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2329 // zero-sized type in other languages to declare the dynamic shared 2330 // memory which size is not known at the compile time. They will be 2331 // allocated by the runtime and placed directly after the static 2332 // allocated ones. They all share the same offset. 2333 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2334 // Adjust alignment for that dynamic shared memory array. 
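          // Illustrative note: the value produced for such a symbol is just
          // the size of the statically allocated LDS, roughly
          //   %size:s32 = G_INTRINSIC llvm.amdgcn.groupstaticsize
          //   %dst:p3   = G_INTTOPTR %size
          // so the dynamic array begins right after the static allocations.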
2335 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2336 LLT S32 = LLT::scalar(32); 2337 auto Sz = 2338 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2339 B.buildIntToPtr(DstReg, Sz); 2340 MI.eraseFromParent(); 2341 return true; 2342 } 2343 } 2344 2345 B.buildConstant( 2346 DstReg, 2347 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2348 MI.eraseFromParent(); 2349 return true; 2350 } 2351 2352 const Function &Fn = MF.getFunction(); 2353 DiagnosticInfoUnsupported BadInit( 2354 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2355 Fn.getContext().diagnose(BadInit); 2356 return true; 2357 } 2358 2359 const SITargetLowering *TLI = ST.getTargetLowering(); 2360 2361 if (TLI->shouldEmitFixup(GV)) { 2362 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 if (TLI->shouldEmitPCReloc(GV)) { 2368 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2369 MI.eraseFromParent(); 2370 return true; 2371 } 2372 2373 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2374 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2375 2376 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2377 MachinePointerInfo::getGOT(MF), 2378 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2379 MachineMemOperand::MOInvariant, 2380 8 /*Size*/, Align(8)); 2381 2382 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2383 2384 if (Ty.getSizeInBits() == 32) { 2385 // Truncate if this is a 32-bit constant adrdess. 2386 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2387 B.buildExtract(DstReg, Load, 0); 2388 } else 2389 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2390 2391 MI.eraseFromParent(); 2392 return true; 2393 } 2394 2395 static LLT widenToNextPowerOf2(LLT Ty) { 2396 if (Ty.isVector()) 2397 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2398 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2402 MachineInstr &MI) const { 2403 MachineIRBuilder &B = Helper.MIRBuilder; 2404 MachineRegisterInfo &MRI = *B.getMRI(); 2405 GISelChangeObserver &Observer = Helper.Observer; 2406 2407 Register PtrReg = MI.getOperand(1).getReg(); 2408 LLT PtrTy = MRI.getType(PtrReg); 2409 unsigned AddrSpace = PtrTy.getAddressSpace(); 2410 2411 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2412 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2413 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2414 Observer.changingInstr(MI); 2415 MI.getOperand(1).setReg(Cast.getReg(0)); 2416 Observer.changedInstr(MI); 2417 return true; 2418 } 2419 2420 if (MI.getOpcode() != AMDGPU::G_LOAD) 2421 return false; 2422 2423 Register ValReg = MI.getOperand(0).getReg(); 2424 LLT ValTy = MRI.getType(ValReg); 2425 2426 MachineMemOperand *MMO = *MI.memoperands_begin(); 2427 const unsigned ValSize = ValTy.getSizeInBits(); 2428 const unsigned MemSize = 8 * MMO->getSize(); 2429 const Align MemAlign = MMO->getAlign(); 2430 const unsigned AlignInBits = 8 * MemAlign.value(); 2431 2432 // Widen non-power-of-2 loads to the alignment if needed 2433 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2434 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2435 2436 // This was already the correct extending load result type, so just adjust 2437 // the memory type. 
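    // For example, a typical case for illustration only: an (s32) = G_LOAD
    // with a 3-byte memory operand keeps its result type and simply gets a
    // 4-byte MMO; no new instructions are needed.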
2438 if (WideMemSize == ValSize) { 2439 MachineFunction &MF = B.getMF(); 2440 2441 MachineMemOperand *WideMMO = 2442 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2443 Observer.changingInstr(MI); 2444 MI.setMemRefs(MF, {WideMMO}); 2445 Observer.changedInstr(MI); 2446 return true; 2447 } 2448 2449 // Don't bother handling edge case that should probably never be produced. 2450 if (ValSize > WideMemSize) 2451 return false; 2452 2453 LLT WideTy = widenToNextPowerOf2(ValTy); 2454 2455 Register WideLoad; 2456 if (!WideTy.isVector()) { 2457 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2458 B.buildTrunc(ValReg, WideLoad).getReg(0); 2459 } else { 2460 // Extract the subvector. 2461 2462 if (isRegisterType(ValTy)) { 2463 // If this a case where G_EXTRACT is legal, use it. 2464 // (e.g. <3 x s32> -> <4 x s32>) 2465 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2466 B.buildExtract(ValReg, WideLoad, 0); 2467 } else { 2468 // For cases where the widened type isn't a nice register value, unmerge 2469 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2470 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2471 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2472 B.setInsertPt(B.getMBB(), MI.getIterator()); 2473 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2474 } 2475 } 2476 2477 MI.eraseFromParent(); 2478 return true; 2479 } 2480 2481 return false; 2482 } 2483 2484 bool AMDGPULegalizerInfo::legalizeFMad( 2485 MachineInstr &MI, MachineRegisterInfo &MRI, 2486 MachineIRBuilder &B) const { 2487 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2488 assert(Ty.isScalar()); 2489 2490 MachineFunction &MF = B.getMF(); 2491 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2492 2493 // TODO: Always legal with future ftz flag. 2494 // FIXME: Do we need just output? 
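  // Illustrative note: when lowering is required (denormals enabled for the
  // type), LegalizerHelper::lowerFMad splits the fused operation, roughly:
  //   %mul = G_FMUL %a, %b
  //   %dst = G_FADD %mul, %c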
2495 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2496 return true; 2497 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2498 return true; 2499 2500 MachineIRBuilder HelperBuilder(MI); 2501 GISelObserverWrapper DummyObserver; 2502 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2503 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2504 } 2505 2506 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2507 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2508 Register DstReg = MI.getOperand(0).getReg(); 2509 Register PtrReg = MI.getOperand(1).getReg(); 2510 Register CmpVal = MI.getOperand(2).getReg(); 2511 Register NewVal = MI.getOperand(3).getReg(); 2512 2513 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2514 "this should not have been custom lowered"); 2515 2516 LLT ValTy = MRI.getType(CmpVal); 2517 LLT VecTy = LLT::vector(2, ValTy); 2518 2519 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2520 2521 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2522 .addDef(DstReg) 2523 .addUse(PtrReg) 2524 .addUse(PackedVal) 2525 .setMemRefs(MI.memoperands()); 2526 2527 MI.eraseFromParent(); 2528 return true; 2529 } 2530 2531 bool AMDGPULegalizerInfo::legalizeFlog( 2532 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2533 Register Dst = MI.getOperand(0).getReg(); 2534 Register Src = MI.getOperand(1).getReg(); 2535 LLT Ty = B.getMRI()->getType(Dst); 2536 unsigned Flags = MI.getFlags(); 2537 2538 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2539 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2540 2541 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2547 MachineIRBuilder &B) const { 2548 Register Dst = MI.getOperand(0).getReg(); 2549 Register Src = MI.getOperand(1).getReg(); 2550 unsigned Flags = MI.getFlags(); 2551 LLT Ty = B.getMRI()->getType(Dst); 2552 2553 auto K = B.buildFConstant(Ty, numbers::log2e); 2554 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2555 B.buildFExp2(Dst, Mul, Flags); 2556 MI.eraseFromParent(); 2557 return true; 2558 } 2559 2560 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2561 MachineIRBuilder &B) const { 2562 Register Dst = MI.getOperand(0).getReg(); 2563 Register Src0 = MI.getOperand(1).getReg(); 2564 Register Src1 = MI.getOperand(2).getReg(); 2565 unsigned Flags = MI.getFlags(); 2566 LLT Ty = B.getMRI()->getType(Dst); 2567 const LLT S16 = LLT::scalar(16); 2568 const LLT S32 = LLT::scalar(32); 2569 2570 if (Ty == S32) { 2571 auto Log = B.buildFLog2(S32, Src0, Flags); 2572 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2573 .addUse(Log.getReg(0)) 2574 .addUse(Src1) 2575 .setMIFlags(Flags); 2576 B.buildFExp2(Dst, Mul, Flags); 2577 } else if (Ty == S16) { 2578 // There's no f16 fmul_legacy, so we need to convert for it. 
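    // Illustrative sketch of the f16 expansion built below, not literal
    // output:
    //   %log:s16 = G_FLOG2 %x
    //   %mul:s32 = llvm.amdgcn.fmul.legacy(fpext %log, fpext %y)
    //   %dst:s16 = G_FEXP2 (fptrunc %mul)
    // i.e. pow(x, y) is computed as exp2(y * log2(x)).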
2579 auto Log = B.buildFLog2(S16, Src0, Flags); 2580 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2581 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2582 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2583 .addUse(Ext0.getReg(0)) 2584 .addUse(Ext1.getReg(0)) 2585 .setMIFlags(Flags); 2586 2587 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2588 } else 2589 return false; 2590 2591 MI.eraseFromParent(); 2592 return true; 2593 } 2594 2595 // Find a source register, ignoring any possible source modifiers. 2596 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2597 Register ModSrc = OrigSrc; 2598 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2599 ModSrc = SrcFNeg->getOperand(1).getReg(); 2600 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2601 ModSrc = SrcFAbs->getOperand(1).getReg(); 2602 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2603 ModSrc = SrcFAbs->getOperand(1).getReg(); 2604 return ModSrc; 2605 } 2606 2607 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2608 MachineRegisterInfo &MRI, 2609 MachineIRBuilder &B) const { 2610 2611 const LLT S1 = LLT::scalar(1); 2612 const LLT S64 = LLT::scalar(64); 2613 Register Dst = MI.getOperand(0).getReg(); 2614 Register OrigSrc = MI.getOperand(1).getReg(); 2615 unsigned Flags = MI.getFlags(); 2616 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2617 "this should not have been custom lowered"); 2618 2619 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2620 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2621 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2622 // V_FRACT bug is: 2623 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2624 // 2625 // Convert floor(x) to (x - fract(x)) 2626 2627 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2628 .addUse(OrigSrc) 2629 .setMIFlags(Flags); 2630 2631 // Give source modifier matching some assistance before obscuring a foldable 2632 // pattern. 2633 2634 // TODO: We can avoid the neg on the fract? The input sign to fract 2635 // shouldn't matter? 2636 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2637 2638 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2639 2640 Register Min = MRI.createGenericVirtualRegister(S64); 2641 2642 // We don't need to concern ourselves with the snan handling difference, so 2643 // use the one which will directly select. 2644 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2645 if (MFI->getMode().IEEE) 2646 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2647 else 2648 B.buildFMinNum(Min, Fract, Const, Flags); 2649 2650 Register CorrectedFract = Min; 2651 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2652 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2653 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2654 } 2655 2656 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2657 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 // Turn an illegal packed v2s16 build vector into bit operations. 2664 // TODO: This should probably be a bitcast action in LegalizerHelper. 
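// Illustrative sketch of the rewrite, not literal output:
//   %packed:s32    = G_MERGE_VALUES %lo:s16, %hi:s16
//   %dst:<2 x s16> = G_BITCAST %packed
// The merge is what later turns into the shift/or bit operations (or a pack)
// mentioned above.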
2665 bool AMDGPULegalizerInfo::legalizeBuildVector( 2666 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2667 Register Dst = MI.getOperand(0).getReg(); 2668 const LLT S32 = LLT::scalar(32); 2669 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2670 2671 Register Src0 = MI.getOperand(1).getReg(); 2672 Register Src1 = MI.getOperand(2).getReg(); 2673 assert(MRI.getType(Src0) == LLT::scalar(16)); 2674 2675 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2676 B.buildBitcast(Dst, Merge); 2677 2678 MI.eraseFromParent(); 2679 return true; 2680 } 2681 2682 // Check that this is a G_XOR x, -1 2683 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2684 if (MI.getOpcode() != TargetOpcode::G_XOR) 2685 return false; 2686 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2687 return ConstVal && *ConstVal == -1; 2688 } 2689 2690 // Return the use branch instruction, otherwise null if the usage is invalid. 2691 static MachineInstr * 2692 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2693 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2694 Register CondDef = MI.getOperand(0).getReg(); 2695 if (!MRI.hasOneNonDBGUse(CondDef)) 2696 return nullptr; 2697 2698 MachineBasicBlock *Parent = MI.getParent(); 2699 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2700 2701 if (isNot(MRI, *UseMI)) { 2702 Register NegatedCond = UseMI->getOperand(0).getReg(); 2703 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2704 return nullptr; 2705 2706 // We're deleting the def of this value, so we need to remove it. 2707 UseMI->eraseFromParent(); 2708 2709 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2710 Negated = true; 2711 } 2712 2713 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2714 return nullptr; 2715 2716 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2717 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2718 if (Next == Parent->end()) { 2719 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2720 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2721 return nullptr; 2722 UncondBrTarget = &*NextMBB; 2723 } else { 2724 if (Next->getOpcode() != AMDGPU::G_BR) 2725 return nullptr; 2726 Br = &*Next; 2727 UncondBrTarget = Br->getOperand(0).getMBB(); 2728 } 2729 2730 return UseMI; 2731 } 2732 2733 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2734 const ArgDescriptor *Arg, 2735 const TargetRegisterClass *ArgRC, 2736 LLT ArgTy) const { 2737 MCRegister SrcReg = Arg->getRegister(); 2738 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2739 assert(DstReg.isVirtual() && "Virtual register expected"); 2740 2741 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2742 ArgTy); 2743 if (Arg->isMasked()) { 2744 // TODO: Should we try to emit this once in the entry block? 
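    // Illustrative sketch of the field extraction below, not literal output,
    // e.g. for a packed workitem ID input:
    //   %shifted = G_LSHR %live_in, countTrailingZeros(Mask)
    //   %dst     = G_AND  %shifted, (Mask >> Shift)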
2745 const LLT S32 = LLT::scalar(32); 2746 const unsigned Mask = Arg->getMask(); 2747 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2748 2749 Register AndMaskSrc = LiveIn; 2750 2751 if (Shift != 0) { 2752 auto ShiftAmt = B.buildConstant(S32, Shift); 2753 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2754 } 2755 2756 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2757 } else { 2758 B.buildCopy(DstReg, LiveIn); 2759 } 2760 2761 return true; 2762 } 2763 2764 bool AMDGPULegalizerInfo::loadInputValue( 2765 Register DstReg, MachineIRBuilder &B, 2766 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2767 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2768 const ArgDescriptor *Arg; 2769 const TargetRegisterClass *ArgRC; 2770 LLT ArgTy; 2771 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2772 2773 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2774 return false; // TODO: Handle these 2775 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2776 } 2777 2778 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2779 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2780 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2781 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2782 return false; 2783 2784 MI.eraseFromParent(); 2785 return true; 2786 } 2787 2788 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2789 MachineRegisterInfo &MRI, 2790 MachineIRBuilder &B) const { 2791 Register Dst = MI.getOperand(0).getReg(); 2792 LLT DstTy = MRI.getType(Dst); 2793 LLT S16 = LLT::scalar(16); 2794 LLT S32 = LLT::scalar(32); 2795 LLT S64 = LLT::scalar(64); 2796 2797 if (DstTy == S16) 2798 return legalizeFDIV16(MI, MRI, B); 2799 if (DstTy == S32) 2800 return legalizeFDIV32(MI, MRI, B); 2801 if (DstTy == S64) 2802 return legalizeFDIV64(MI, MRI, B); 2803 2804 return false; 2805 } 2806 2807 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2808 Register DstDivReg, 2809 Register DstRemReg, 2810 Register X, 2811 Register Y) const { 2812 const LLT S1 = LLT::scalar(1); 2813 const LLT S32 = LLT::scalar(32); 2814 2815 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2816 // algorithm used here. 2817 2818 // Initial estimate of inv(y). 2819 auto FloatY = B.buildUITOFP(S32, Y); 2820 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2821 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2822 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2823 auto Z = B.buildFPTOUI(S32, ScaledY); 2824 2825 // One round of UNR. 2826 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2827 auto NegYZ = B.buildMul(S32, NegY, Z); 2828 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2829 2830 // Quotient/remainder estimate. 2831 auto Q = B.buildUMulH(S32, X, Z); 2832 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2833 2834 // First quotient/remainder refinement. 2835 auto One = B.buildConstant(S32, 1); 2836 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2837 if (DstDivReg) 2838 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2839 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2840 2841 // Second quotient/remainder refinement. 
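  // Informal note: the float-based reciprocal estimate above can leave the
  // quotient slightly short, which is why this expansion applies exactly two
  // of these conditional corrections; see the reference to
  // AMDGPUCodeGenPrepare::expandDivRem32 above for the derivation.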
2842 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2843 if (DstDivReg) 2844 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2845 2846 if (DstRemReg) 2847 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2848 } 2849 2850 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2851 // 2852 // Return lo, hi of result 2853 // 2854 // %cvt.lo = G_UITOFP Val.lo 2855 // %cvt.hi = G_UITOFP Val.hi 2856 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2857 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2858 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2859 // %mul2 = G_FMUL %mul1, 2**(-32) 2860 // %trunc = G_INTRINSIC_TRUNC %mul2 2861 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2862 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2863 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2864 Register Val) { 2865 const LLT S32 = LLT::scalar(32); 2866 auto Unmerge = B.buildUnmerge(S32, Val); 2867 2868 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2869 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2870 2871 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2872 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2873 2874 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2875 auto Mul1 = 2876 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2877 2878 // 2**(-32) 2879 auto Mul2 = 2880 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2881 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2882 2883 // -(2**32) 2884 auto Mad2 = B.buildFMAD(S32, Trunc, 2885 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2886 2887 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2888 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2889 2890 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2891 } 2892 2893 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2894 Register DstDivReg, 2895 Register DstRemReg, 2896 Register Numer, 2897 Register Denom) const { 2898 const LLT S32 = LLT::scalar(32); 2899 const LLT S64 = LLT::scalar(64); 2900 const LLT S1 = LLT::scalar(1); 2901 Register RcpLo, RcpHi; 2902 2903 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2904 2905 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2906 2907 auto Zero64 = B.buildConstant(S64, 0); 2908 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2909 2910 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2911 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2912 2913 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2914 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2915 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2916 2917 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2918 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2919 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2920 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2921 2922 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2923 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2924 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2925 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2926 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2927 2928 auto Zero32 = B.buildConstant(S32, 0); 2929 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2930 auto Add2_HiC = 2931 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2932 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2933 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2934 2935 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2936 Register NumerLo = 
UnmergeNumer.getReg(0); 2937 Register NumerHi = UnmergeNumer.getReg(1); 2938 2939 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2940 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2941 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2942 Register Mul3_Lo = UnmergeMul3.getReg(0); 2943 Register Mul3_Hi = UnmergeMul3.getReg(1); 2944 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2945 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2946 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2947 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2948 2949 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2950 Register DenomLo = UnmergeDenom.getReg(0); 2951 Register DenomHi = UnmergeDenom.getReg(1); 2952 2953 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2954 auto C1 = B.buildSExt(S32, CmpHi); 2955 2956 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2957 auto C2 = B.buildSExt(S32, CmpLo); 2958 2959 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2960 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2961 2962 // TODO: Here and below portions of the code can be enclosed into if/endif. 2963 // Currently control flow is unconditional and we have 4 selects after 2964 // potential endif to substitute PHIs. 2965 2966 // if C3 != 0 ... 2967 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2968 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2969 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2970 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2971 2972 auto One64 = B.buildConstant(S64, 1); 2973 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2974 2975 auto C4 = 2976 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2977 auto C5 = 2978 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2979 auto C6 = B.buildSelect( 2980 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2981 2982 // if (C6 != 0) 2983 auto Add4 = B.buildAdd(S64, Add3, One64); 2984 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2985 2986 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2987 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2988 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2989 2990 // endif C6 2991 // endif C3 2992 2993 if (DstDivReg) { 2994 auto Sel1 = B.buildSelect( 2995 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2996 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 2997 Sel1, MulHi3); 2998 } 2999 3000 if (DstRemReg) { 3001 auto Sel2 = B.buildSelect( 3002 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3003 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3004 Sel2, Sub1); 3005 } 3006 } 3007 3008 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3009 MachineRegisterInfo &MRI, 3010 MachineIRBuilder &B) const { 3011 Register DstDivReg, DstRemReg; 3012 switch (MI.getOpcode()) { 3013 default: 3014 llvm_unreachable("Unexpected opcode!"); 3015 case AMDGPU::G_UDIV: { 3016 DstDivReg = MI.getOperand(0).getReg(); 3017 break; 3018 } 3019 case AMDGPU::G_UREM: { 3020 DstRemReg = MI.getOperand(0).getReg(); 3021 break; 3022 } 3023 case AMDGPU::G_UDIVREM: { 3024 DstDivReg = MI.getOperand(0).getReg(); 3025 DstRemReg = MI.getOperand(1).getReg(); 3026 break; 3027 } 3028 } 3029 3030 const LLT S64 = LLT::scalar(64); 3031 const LLT S32 = LLT::scalar(32); 3032 const unsigned 
FirstSrcOpIdx = MI.getNumExplicitDefs(); 3033 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3034 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3035 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3036 3037 if (Ty == S32) 3038 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3039 else if (Ty == S64) 3040 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3041 else 3042 return false; 3043 3044 MI.eraseFromParent(); 3045 return true; 3046 } 3047 3048 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3049 MachineRegisterInfo &MRI, 3050 MachineIRBuilder &B) const { 3051 const LLT S64 = LLT::scalar(64); 3052 const LLT S32 = LLT::scalar(32); 3053 3054 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3055 if (Ty != S32 && Ty != S64) 3056 return false; 3057 3058 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3059 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3060 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3061 3062 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3063 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3064 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3065 3066 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3067 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3068 3069 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3070 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3071 3072 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3073 switch (MI.getOpcode()) { 3074 default: 3075 llvm_unreachable("Unexpected opcode!"); 3076 case AMDGPU::G_SDIV: { 3077 DstDivReg = MI.getOperand(0).getReg(); 3078 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3079 break; 3080 } 3081 case AMDGPU::G_SREM: { 3082 DstRemReg = MI.getOperand(0).getReg(); 3083 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3084 break; 3085 } 3086 case AMDGPU::G_SDIVREM: { 3087 DstDivReg = MI.getOperand(0).getReg(); 3088 DstRemReg = MI.getOperand(1).getReg(); 3089 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3090 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3091 break; 3092 } 3093 } 3094 3095 if (Ty == S32) 3096 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3097 else 3098 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3099 3100 if (DstDivReg) { 3101 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3102 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3103 B.buildSub(DstDivReg, SignXor, Sign); 3104 } 3105 3106 if (DstRemReg) { 3107 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3108 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3109 B.buildSub(DstRemReg, SignXor, Sign); 3110 } 3111 3112 MI.eraseFromParent(); 3113 return true; 3114 } 3115 3116 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3117 MachineRegisterInfo &MRI, 3118 MachineIRBuilder &B) const { 3119 Register Res = MI.getOperand(0).getReg(); 3120 Register LHS = MI.getOperand(1).getReg(); 3121 Register RHS = MI.getOperand(2).getReg(); 3122 uint16_t Flags = MI.getFlags(); 3123 LLT ResTy = MRI.getType(Res); 3124 3125 const MachineFunction &MF = B.getMF(); 3126 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3127 MI.getFlag(MachineInstr::FmAfn); 3128 3129 if (!AllowInaccurateRcp) 3130 return false; 3131 3132 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3133 // 1 / x -> RCP(x) 3134 if (CLHS->isExactlyValue(1.0)) { 3135 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3136 .addUse(RHS) 3137 .setMIFlags(Flags); 3138 3139 
MI.eraseFromParent(); 3140 return true; 3141 } 3142 3143 // -1 / x -> RCP( FNEG(x) ) 3144 if (CLHS->isExactlyValue(-1.0)) { 3145 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3146 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3147 .addUse(FNeg.getReg(0)) 3148 .setMIFlags(Flags); 3149 3150 MI.eraseFromParent(); 3151 return true; 3152 } 3153 } 3154 3155 // x / y -> x * (1.0 / y) 3156 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3157 .addUse(RHS) 3158 .setMIFlags(Flags); 3159 B.buildFMul(Res, LHS, RCP, Flags); 3160 3161 MI.eraseFromParent(); 3162 return true; 3163 } 3164 3165 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3166 MachineRegisterInfo &MRI, 3167 MachineIRBuilder &B) const { 3168 Register Res = MI.getOperand(0).getReg(); 3169 Register X = MI.getOperand(1).getReg(); 3170 Register Y = MI.getOperand(2).getReg(); 3171 uint16_t Flags = MI.getFlags(); 3172 LLT ResTy = MRI.getType(Res); 3173 3174 const MachineFunction &MF = B.getMF(); 3175 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3176 MI.getFlag(MachineInstr::FmAfn); 3177 3178 if (!AllowInaccurateRcp) 3179 return false; 3180 3181 auto NegY = B.buildFNeg(ResTy, Y); 3182 auto One = B.buildFConstant(ResTy, 1.0); 3183 3184 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3185 .addUse(Y) 3186 .setMIFlags(Flags); 3187 3188 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3189 R = B.buildFMA(ResTy, Tmp0, R, R); 3190 3191 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3192 R = B.buildFMA(ResTy, Tmp1, R, R); 3193 3194 auto Ret = B.buildFMul(ResTy, X, R); 3195 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3196 3197 B.buildFMA(Res, Tmp2, R, Ret); 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3203 MachineRegisterInfo &MRI, 3204 MachineIRBuilder &B) const { 3205 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3206 return true; 3207 3208 Register Res = MI.getOperand(0).getReg(); 3209 Register LHS = MI.getOperand(1).getReg(); 3210 Register RHS = MI.getOperand(2).getReg(); 3211 3212 uint16_t Flags = MI.getFlags(); 3213 3214 LLT S16 = LLT::scalar(16); 3215 LLT S32 = LLT::scalar(32); 3216 3217 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3218 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3219 3220 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3221 .addUse(RHSExt.getReg(0)) 3222 .setMIFlags(Flags); 3223 3224 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3225 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3226 3227 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3228 .addUse(RDst.getReg(0)) 3229 .addUse(RHS) 3230 .addUse(LHS) 3231 .setMIFlags(Flags); 3232 3233 MI.eraseFromParent(); 3234 return true; 3235 } 3236 3237 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3238 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3239 static void toggleSPDenormMode(bool Enable, 3240 MachineIRBuilder &B, 3241 const GCNSubtarget &ST, 3242 AMDGPU::SIModeRegisterDefaults Mode) { 3243 // Set SP denorm mode to this value. 3244 unsigned SPDenormMode = 3245 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3246 3247 if (ST.hasDenormModeInst()) { 3248 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
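    // Illustrative note, based on the packing below: the 4-bit immediate
    // holds both fields, the FP32 denorm mode in bits [1:0] and the shared
    // FP64/FP16 mode in bits [3:2], hence the shift by two.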
3249 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3250 3251 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3252 B.buildInstr(AMDGPU::S_DENORM_MODE) 3253 .addImm(NewDenormModeValue); 3254 3255 } else { 3256 // Select FP32 bit field in mode register. 3257 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3258 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3259 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3260 3261 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3262 .addImm(SPDenormMode) 3263 .addImm(SPDenormModeBitField); 3264 } 3265 } 3266 3267 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3268 MachineRegisterInfo &MRI, 3269 MachineIRBuilder &B) const { 3270 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3271 return true; 3272 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register LHS = MI.getOperand(1).getReg(); 3275 Register RHS = MI.getOperand(2).getReg(); 3276 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3277 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3278 3279 uint16_t Flags = MI.getFlags(); 3280 3281 LLT S32 = LLT::scalar(32); 3282 LLT S1 = LLT::scalar(1); 3283 3284 auto One = B.buildFConstant(S32, 1.0f); 3285 3286 auto DenominatorScaled = 3287 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3288 .addUse(LHS) 3289 .addUse(RHS) 3290 .addImm(0) 3291 .setMIFlags(Flags); 3292 auto NumeratorScaled = 3293 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3294 .addUse(LHS) 3295 .addUse(RHS) 3296 .addImm(1) 3297 .setMIFlags(Flags); 3298 3299 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3300 .addUse(DenominatorScaled.getReg(0)) 3301 .setMIFlags(Flags); 3302 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3303 3304 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3305 // aren't modeled as reading it. 
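  // Summary of the sequence built below, for orientation only: scale the
  // denominator and numerator with amdgcn.div.scale, refine an amdgcn.rcp
  // estimate with a chain of FMAs (with FP32 denormals forced on around them
  // when needed), then finish with amdgcn.div.fmas and amdgcn.div.fixup.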
3306 if (!Mode.allFP32Denormals()) 3307 toggleSPDenormMode(true, B, ST, Mode); 3308 3309 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3310 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3311 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3312 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3313 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3314 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3315 3316 if (!Mode.allFP32Denormals()) 3317 toggleSPDenormMode(false, B, ST, Mode); 3318 3319 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3320 .addUse(Fma4.getReg(0)) 3321 .addUse(Fma1.getReg(0)) 3322 .addUse(Fma3.getReg(0)) 3323 .addUse(NumeratorScaled.getReg(1)) 3324 .setMIFlags(Flags); 3325 3326 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3327 .addUse(Fmas.getReg(0)) 3328 .addUse(RHS) 3329 .addUse(LHS) 3330 .setMIFlags(Flags); 3331 3332 MI.eraseFromParent(); 3333 return true; 3334 } 3335 3336 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3337 MachineRegisterInfo &MRI, 3338 MachineIRBuilder &B) const { 3339 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3340 return true; 3341 3342 Register Res = MI.getOperand(0).getReg(); 3343 Register LHS = MI.getOperand(1).getReg(); 3344 Register RHS = MI.getOperand(2).getReg(); 3345 3346 uint16_t Flags = MI.getFlags(); 3347 3348 LLT S64 = LLT::scalar(64); 3349 LLT S1 = LLT::scalar(1); 3350 3351 auto One = B.buildFConstant(S64, 1.0); 3352 3353 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3354 .addUse(LHS) 3355 .addUse(RHS) 3356 .addImm(0) 3357 .setMIFlags(Flags); 3358 3359 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3360 3361 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3362 .addUse(DivScale0.getReg(0)) 3363 .setMIFlags(Flags); 3364 3365 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3366 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3367 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3368 3369 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3370 .addUse(LHS) 3371 .addUse(RHS) 3372 .addImm(1) 3373 .setMIFlags(Flags); 3374 3375 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3376 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3377 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3378 3379 Register Scale; 3380 if (!ST.hasUsableDivScaleConditionOutput()) { 3381 // Workaround a hardware bug on SI where the condition output from div_scale 3382 // is not usable. 
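    // Recreate the missing condition bit by hand: compare the high dwords of
    // the numerator and denominator against the high dwords of the div_scale
    // results, then XOR the two comparisons to form the value fed to div_fmas
    // below.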
3383 3384 LLT S32 = LLT::scalar(32); 3385 3386 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3387 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3388 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3389 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3390 3391 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3392 Scale1Unmerge.getReg(1)); 3393 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3394 Scale0Unmerge.getReg(1)); 3395 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3396 } else { 3397 Scale = DivScale1.getReg(1); 3398 } 3399 3400 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3401 .addUse(Fma4.getReg(0)) 3402 .addUse(Fma3.getReg(0)) 3403 .addUse(Mul.getReg(0)) 3404 .addUse(Scale) 3405 .setMIFlags(Flags); 3406 3407 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3408 .addUse(Fmas.getReg(0)) 3409 .addUse(RHS) 3410 .addUse(LHS) 3411 .setMIFlags(Flags); 3412 3413 MI.eraseFromParent(); 3414 return true; 3415 } 3416 3417 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3418 MachineRegisterInfo &MRI, 3419 MachineIRBuilder &B) const { 3420 Register Res = MI.getOperand(0).getReg(); 3421 Register LHS = MI.getOperand(2).getReg(); 3422 Register RHS = MI.getOperand(3).getReg(); 3423 uint16_t Flags = MI.getFlags(); 3424 3425 LLT S32 = LLT::scalar(32); 3426 LLT S1 = LLT::scalar(1); 3427 3428 auto Abs = B.buildFAbs(S32, RHS, Flags); 3429 const APFloat C0Val(1.0f); 3430 3431 auto C0 = B.buildConstant(S32, 0x6f800000); 3432 auto C1 = B.buildConstant(S32, 0x2f800000); 3433 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3434 3435 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3436 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3437 3438 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3439 3440 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3441 .addUse(Mul0.getReg(0)) 3442 .setMIFlags(Flags); 3443 3444 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3445 3446 B.buildFMul(Res, Sel, Mul1, Flags); 3447 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3453 // FIXME: Why do we handle this one but not other removed instructions? 3454 // 3455 // Reciprocal square root. The clamp prevents infinite results, clamping 3456 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3457 // +-max_float. 3458 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3459 MachineRegisterInfo &MRI, 3460 MachineIRBuilder &B) const { 3461 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3462 return true; 3463 3464 Register Dst = MI.getOperand(0).getReg(); 3465 Register Src = MI.getOperand(2).getReg(); 3466 auto Flags = MI.getFlags(); 3467 3468 LLT Ty = MRI.getType(Dst); 3469 3470 const fltSemantics *FltSemantics; 3471 if (Ty == LLT::scalar(32)) 3472 FltSemantics = &APFloat::IEEEsingle(); 3473 else if (Ty == LLT::scalar(64)) 3474 FltSemantics = &APFloat::IEEEdouble(); 3475 else 3476 return false; 3477 3478 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3479 .addUse(Src) 3480 .setMIFlags(Flags); 3481 3482 // We don't need to concern ourselves with the snan handling difference, since 3483 // the rsq quieted (or not) so use the one which will directly select. 
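  // Clamp the result to +/-max_float: min against +max_float, then max against
  // -max_float, using the IEEE variants of min/max when the function runs in
  // IEEE mode.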
3484 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3485 const bool UseIEEE = MFI->getMode().IEEE; 3486 3487 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3488 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3489 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3490 3491 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3492 3493 if (UseIEEE) 3494 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3495 else 3496 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3497 MI.eraseFromParent(); 3498 return true; 3499 } 3500 3501 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3502 switch (IID) { 3503 case Intrinsic::amdgcn_ds_fadd: 3504 return AMDGPU::G_ATOMICRMW_FADD; 3505 case Intrinsic::amdgcn_ds_fmin: 3506 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3507 case Intrinsic::amdgcn_ds_fmax: 3508 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3509 default: 3510 llvm_unreachable("not a DS FP intrinsic"); 3511 } 3512 } 3513 3514 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3515 MachineInstr &MI, 3516 Intrinsic::ID IID) const { 3517 GISelChangeObserver &Observer = Helper.Observer; 3518 Observer.changingInstr(MI); 3519 3520 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3521 3522 // The remaining operands were used to set fields in the MemOperand on 3523 // construction. 3524 for (int I = 6; I > 3; --I) 3525 MI.RemoveOperand(I); 3526 3527 MI.RemoveOperand(1); // Remove the intrinsic ID. 3528 Observer.changedInstr(MI); 3529 return true; 3530 } 3531 3532 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3533 MachineRegisterInfo &MRI, 3534 MachineIRBuilder &B) const { 3535 uint64_t Offset = 3536 ST.getTargetLowering()->getImplicitParameterOffset( 3537 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3538 LLT DstTy = MRI.getType(DstReg); 3539 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3540 3541 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3542 if (!loadInputValue(KernargPtrReg, B, 3543 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3544 return false; 3545 3546 // FIXME: This should be nuw 3547 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3548 return true; 3549 } 3550 3551 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3552 MachineRegisterInfo &MRI, 3553 MachineIRBuilder &B) const { 3554 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3555 if (!MFI->isEntryFunction()) { 3556 return legalizePreloadedArgIntrin(MI, MRI, B, 3557 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3558 } 3559 3560 Register DstReg = MI.getOperand(0).getReg(); 3561 if (!getImplicitArgPtr(DstReg, MRI, B)) 3562 return false; 3563 3564 MI.eraseFromParent(); 3565 return true; 3566 } 3567 3568 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3569 MachineRegisterInfo &MRI, 3570 MachineIRBuilder &B, 3571 unsigned AddrSpace) const { 3572 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3573 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3574 Register Hi32 = Unmerge.getReg(1); 3575 3576 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3577 MI.eraseFromParent(); 3578 return true; 3579 } 3580 3581 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3582 // offset (the offset that is included in bounds checking and swizzling, to be 3583 // split between the instruction's voffset and immoffset fields) 
and soffset 3584 // (the offset that is excluded from bounds checking and swizzling, to go in 3585 // the instruction's soffset field). This function takes the first kind of 3586 // offset and figures out how to split it between voffset and immoffset. 3587 std::tuple<Register, unsigned, unsigned> 3588 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3589 Register OrigOffset) const { 3590 const unsigned MaxImm = 4095; 3591 Register BaseReg; 3592 unsigned TotalConstOffset; 3593 const LLT S32 = LLT::scalar(32); 3594 MachineRegisterInfo &MRI = *B.getMRI(); 3595 3596 std::tie(BaseReg, TotalConstOffset) = 3597 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3598 3599 unsigned ImmOffset = TotalConstOffset; 3600 3601 // If BaseReg is a pointer, convert it to int. 3602 if (MRI.getType(BaseReg).isPointer()) 3603 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3604 3605 // If the immediate value is too big for the immoffset field, put the value 3606 // and -4096 into the immoffset field so that the value that is copied/added 3607 // for the voffset field is a multiple of 4096, and it stands more chance 3608 // of being CSEd with the copy/add for another similar load/store. 3609 // However, do not do that rounding down to a multiple of 4096 if that is a 3610 // negative number, as it appears to be illegal to have a negative offset 3611 // in the vgpr, even if adding the immediate offset makes it positive. 3612 unsigned Overflow = ImmOffset & ~MaxImm; 3613 ImmOffset -= Overflow; 3614 if ((int32_t)Overflow < 0) { 3615 Overflow += ImmOffset; 3616 ImmOffset = 0; 3617 } 3618 3619 if (Overflow != 0) { 3620 if (!BaseReg) { 3621 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3622 } else { 3623 auto OverflowVal = B.buildConstant(S32, Overflow); 3624 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3625 } 3626 } 3627 3628 if (!BaseReg) 3629 BaseReg = B.buildConstant(S32, 0).getReg(0); 3630 3631 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3632 } 3633 3634 /// Handle register layout difference for f16 images for some subtargets. 
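///
/// On subtargets with unpacked D16 memory instructions each 16-bit element
/// lives in the low half of a 32-bit register, so the source vector is widened
/// element by element. Subtargets with the image store D16 bug instead need
/// the packed data padded out with undef elements to a wider supported type.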
3635 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3636 MachineRegisterInfo &MRI, 3637 Register Reg, 3638 bool ImageStore) const { 3639 const LLT S16 = LLT::scalar(16); 3640 const LLT S32 = LLT::scalar(32); 3641 LLT StoreVT = MRI.getType(Reg); 3642 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3643 3644 if (ST.hasUnpackedD16VMem()) { 3645 auto Unmerge = B.buildUnmerge(S16, Reg); 3646 3647 SmallVector<Register, 4> WideRegs; 3648 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3649 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3650 3651 int NumElts = StoreVT.getNumElements(); 3652 3653 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3654 } 3655 3656 if (ImageStore && ST.hasImageStoreD16Bug()) { 3657 if (StoreVT.getNumElements() == 2) { 3658 SmallVector<Register, 4> PackedRegs; 3659 Reg = B.buildBitcast(S32, Reg).getReg(0); 3660 PackedRegs.push_back(Reg); 3661 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3662 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3663 } 3664 3665 if (StoreVT.getNumElements() == 3) { 3666 SmallVector<Register, 4> PackedRegs; 3667 auto Unmerge = B.buildUnmerge(S16, Reg); 3668 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3669 PackedRegs.push_back(Unmerge.getReg(I)); 3670 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3671 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3672 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3673 } 3674 3675 if (StoreVT.getNumElements() == 4) { 3676 SmallVector<Register, 4> PackedRegs; 3677 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3678 auto Unmerge = B.buildUnmerge(S32, Reg); 3679 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3680 PackedRegs.push_back(Unmerge.getReg(I)); 3681 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3682 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3683 } 3684 3685 llvm_unreachable("invalid data type"); 3686 } 3687 3688 return Reg; 3689 } 3690 3691 Register AMDGPULegalizerInfo::fixStoreSourceType( 3692 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3693 MachineRegisterInfo *MRI = B.getMRI(); 3694 LLT Ty = MRI->getType(VData); 3695 3696 const LLT S16 = LLT::scalar(16); 3697 3698 // Fixup illegal register types for i8 stores. 3699 if (Ty == LLT::scalar(8) || Ty == S16) { 3700 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3701 return AnyExt; 3702 } 3703 3704 if (Ty.isVector()) { 3705 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3706 if (IsFormat) 3707 return handleD16VData(B, *MRI, VData); 3708 } 3709 } 3710 3711 return VData; 3712 } 3713 3714 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3715 MachineRegisterInfo &MRI, 3716 MachineIRBuilder &B, 3717 bool IsTyped, 3718 bool IsFormat) const { 3719 Register VData = MI.getOperand(1).getReg(); 3720 LLT Ty = MRI.getType(VData); 3721 LLT EltTy = Ty.getScalarType(); 3722 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3723 const LLT S32 = LLT::scalar(32); 3724 3725 VData = fixStoreSourceType(B, VData, IsFormat); 3726 Register RSrc = MI.getOperand(2).getReg(); 3727 3728 MachineMemOperand *MMO = *MI.memoperands_begin(); 3729 const int MemSize = MMO->getSize(); 3730 3731 unsigned ImmOffset; 3732 unsigned TotalOffset; 3733 3734 // The typed intrinsics add an immediate after the registers. 3735 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3736 3737 // The struct intrinsic variants add one additional operand over raw. 3738 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3739 Register VIndex; 3740 int OpOffset = 0; 3741 if (HasVIndex) { 3742 VIndex = MI.getOperand(3).getReg(); 3743 OpOffset = 1; 3744 } 3745 3746 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3747 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3748 3749 unsigned Format = 0; 3750 if (IsTyped) { 3751 Format = MI.getOperand(5 + OpOffset).getImm(); 3752 ++OpOffset; 3753 } 3754 3755 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3756 3757 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3758 if (TotalOffset != 0) 3759 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3760 3761 unsigned Opc; 3762 if (IsTyped) { 3763 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3764 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3765 } else if (IsFormat) { 3766 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3767 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3768 } else { 3769 switch (MemSize) { 3770 case 1: 3771 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3772 break; 3773 case 2: 3774 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3775 break; 3776 default: 3777 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3778 break; 3779 } 3780 } 3781 3782 if (!VIndex) 3783 VIndex = B.buildConstant(S32, 0).getReg(0); 3784 3785 auto MIB = B.buildInstr(Opc) 3786 .addUse(VData) // vdata 3787 .addUse(RSrc) // rsrc 3788 .addUse(VIndex) // vindex 3789 .addUse(VOffset) // voffset 3790 .addUse(SOffset) // soffset 3791 .addImm(ImmOffset); // offset(imm) 3792 3793 if (IsTyped) 3794 MIB.addImm(Format); 3795 3796 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3797 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3798 .addMemOperand(MMO); 3799 3800 MI.eraseFromParent(); 3801 return true; 3802 } 3803 3804 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3805 MachineRegisterInfo &MRI, 3806 MachineIRBuilder &B, 3807 bool IsFormat, 3808 bool IsTyped) const { 3809 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3810 MachineMemOperand *MMO = *MI.memoperands_begin(); 3811 const int MemSize = MMO->getSize(); 3812 const LLT S32 = LLT::scalar(32); 3813 3814 Register Dst = MI.getOperand(0).getReg(); 3815 Register RSrc = MI.getOperand(2).getReg(); 3816 3817 // The typed intrinsics add an immediate after the registers. 3818 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3819 3820 // The struct intrinsic variants add one additional operand over raw. 
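  // The extra operand is the vindex, so its presence is inferred from the
  // total operand count and every later operand index is shifted by OpOffset.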
3821 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3822 Register VIndex; 3823 int OpOffset = 0; 3824 if (HasVIndex) { 3825 VIndex = MI.getOperand(3).getReg(); 3826 OpOffset = 1; 3827 } 3828 3829 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3830 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3831 3832 unsigned Format = 0; 3833 if (IsTyped) { 3834 Format = MI.getOperand(5 + OpOffset).getImm(); 3835 ++OpOffset; 3836 } 3837 3838 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3839 unsigned ImmOffset; 3840 unsigned TotalOffset; 3841 3842 LLT Ty = MRI.getType(Dst); 3843 LLT EltTy = Ty.getScalarType(); 3844 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3845 const bool Unpacked = ST.hasUnpackedD16VMem(); 3846 3847 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3848 if (TotalOffset != 0) 3849 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3850 3851 unsigned Opc; 3852 3853 if (IsTyped) { 3854 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3855 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3856 } else if (IsFormat) { 3857 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3858 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3859 } else { 3860 switch (MemSize) { 3861 case 1: 3862 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3863 break; 3864 case 2: 3865 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3866 break; 3867 default: 3868 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3869 break; 3870 } 3871 } 3872 3873 Register LoadDstReg; 3874 3875 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3876 LLT UnpackedTy = Ty.changeElementSize(32); 3877 3878 if (IsExtLoad) 3879 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3880 else if (Unpacked && IsD16 && Ty.isVector()) 3881 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3882 else 3883 LoadDstReg = Dst; 3884 3885 if (!VIndex) 3886 VIndex = B.buildConstant(S32, 0).getReg(0); 3887 3888 auto MIB = B.buildInstr(Opc) 3889 .addDef(LoadDstReg) // vdata 3890 .addUse(RSrc) // rsrc 3891 .addUse(VIndex) // vindex 3892 .addUse(VOffset) // voffset 3893 .addUse(SOffset) // soffset 3894 .addImm(ImmOffset); // offset(imm) 3895 3896 if (IsTyped) 3897 MIB.addImm(Format); 3898 3899 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3900 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3901 .addMemOperand(MMO); 3902 3903 if (LoadDstReg != Dst) { 3904 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3905 3906 // Widen result for extending loads was widened. 3907 if (IsExtLoad) 3908 B.buildTrunc(Dst, LoadDstReg); 3909 else { 3910 // Repack to original 16-bit vector result 3911 // FIXME: G_TRUNC should work, but legalization currently fails 3912 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3913 SmallVector<Register, 4> Repack; 3914 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3915 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3916 B.buildMerge(Dst, Repack); 3917 } 3918 } 3919 3920 MI.eraseFromParent(); 3921 return true; 3922 } 3923 3924 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3925 MachineIRBuilder &B, 3926 bool IsInc) const { 3927 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 3928 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3929 B.buildInstr(Opc) 3930 .addDef(MI.getOperand(0).getReg()) 3931 .addUse(MI.getOperand(2).getReg()) 3932 .addUse(MI.getOperand(3).getReg()) 3933 .cloneMemRefs(MI); 3934 MI.eraseFromParent(); 3935 return true; 3936 } 3937 3938 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3939 switch (IntrID) { 3940 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3941 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3942 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3943 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3944 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3946 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3947 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3948 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3949 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3950 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3951 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3952 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3953 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3954 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3955 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3956 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3957 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3958 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3959 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3961 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3962 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3963 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3964 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3965 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3966 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3967 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3968 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3969 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3970 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3971 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3972 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3973 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3974 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3976 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3977 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3979 case Intrinsic::amdgcn_buffer_atomic_fadd: 3980 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3981 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3982 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3983 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3984 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3985 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3986 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3987 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3989 default: 3990 llvm_unreachable("unhandled atomic opcode"); 3991 } 3992 } 3993 3994 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3995 MachineIRBuilder &B, 3996 Intrinsic::ID IID) const { 3997 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3998 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3999 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4000 4001 Register Dst; 4002 4003 int OpOffset = 0; 4004 if (HasReturn) { 4005 // A few FP atomics do not support return values. 
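    // When a result is present it is def 0; otherwise every source operand
    // index used below shifts down by one, which the negative OpOffset
    // accounts for.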
4006 Dst = MI.getOperand(0).getReg(); 4007 } else { 4008 OpOffset = -1; 4009 } 4010 4011 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4012 Register CmpVal; 4013 4014 if (IsCmpSwap) { 4015 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4016 ++OpOffset; 4017 } 4018 4019 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4020 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4021 4022 // The struct intrinsic variants add one additional operand over raw. 4023 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4024 Register VIndex; 4025 if (HasVIndex) { 4026 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4027 ++OpOffset; 4028 } 4029 4030 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4031 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4032 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4033 4034 MachineMemOperand *MMO = *MI.memoperands_begin(); 4035 4036 unsigned ImmOffset; 4037 unsigned TotalOffset; 4038 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4039 if (TotalOffset != 0) 4040 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4041 4042 if (!VIndex) 4043 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4044 4045 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4046 4047 if (HasReturn) 4048 MIB.addDef(Dst); 4049 4050 MIB.addUse(VData); // vdata 4051 4052 if (IsCmpSwap) 4053 MIB.addReg(CmpVal); 4054 4055 MIB.addUse(RSrc) // rsrc 4056 .addUse(VIndex) // vindex 4057 .addUse(VOffset) // voffset 4058 .addUse(SOffset) // soffset 4059 .addImm(ImmOffset) // offset(imm) 4060 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4061 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4062 .addMemOperand(MMO); 4063 4064 MI.eraseFromParent(); 4065 return true; 4066 } 4067 4068 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4069 /// vector with s16 typed elements. 4070 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4071 SmallVectorImpl<Register> &PackedAddrs, 4072 unsigned ArgOffset, 4073 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4074 bool IsA16, bool IsG16) { 4075 const LLT S16 = LLT::scalar(16); 4076 const LLT V2S16 = LLT::vector(2, 16); 4077 auto EndIdx = Intr->VAddrEnd; 4078 4079 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4080 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4081 if (!SrcOp.isReg()) 4082 continue; // _L to _LZ may have eliminated this. 4083 4084 Register AddrReg = SrcOp.getReg(); 4085 4086 if (I < Intr->GradientStart) { 4087 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4088 PackedAddrs.push_back(AddrReg); 4089 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4090 (I >= Intr->CoordStart && !IsA16)) { 4091 // Handle any gradient or coordinate operands that should not be packed 4092 PackedAddrs.push_back(AddrReg); 4093 } else { 4094 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4095 // derivatives dx/dh and dx/dv are packed with undef. 
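      // Pair this component with the following one in a single dword, or with
      // undef when this is the last component, the end of an odd-length
      // gradient group, or the next operand was already removed by the _L to
      // _LZ optimization.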
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
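  // Look up the MIMG base opcode so its recorded properties (atomic, gather4,
  // store, gradients) can drive the operand rewrites below.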
4179 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4180 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4181 4182 MachineRegisterInfo *MRI = B.getMRI(); 4183 const LLT S32 = LLT::scalar(32); 4184 const LLT S16 = LLT::scalar(16); 4185 const LLT V2S16 = LLT::vector(2, 16); 4186 4187 unsigned DMask = 0; 4188 4189 // Check for 16 bit addresses and pack if true. 4190 LLT GradTy = 4191 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4192 LLT AddrTy = 4193 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4194 const bool IsG16 = GradTy == S16; 4195 const bool IsA16 = AddrTy == S16; 4196 4197 int DMaskLanes = 0; 4198 if (!BaseOpcode->Atomic) { 4199 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4200 if (BaseOpcode->Gather4) { 4201 DMaskLanes = 4; 4202 } else if (DMask != 0) { 4203 DMaskLanes = countPopulation(DMask); 4204 } else if (!IsTFE && !BaseOpcode->Store) { 4205 // If dmask is 0, this is a no-op load. This can be eliminated. 4206 B.buildUndef(MI.getOperand(0)); 4207 MI.eraseFromParent(); 4208 return true; 4209 } 4210 } 4211 4212 Observer.changingInstr(MI); 4213 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4214 4215 unsigned NewOpcode = NumDefs == 0 ? 4216 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4217 4218 // Track that we legalized this 4219 MI.setDesc(B.getTII().get(NewOpcode)); 4220 4221 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4222 // dmask to be at least 1 otherwise the instruction will fail 4223 if (IsTFE && DMask == 0) { 4224 DMask = 0x1; 4225 DMaskLanes = 1; 4226 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4227 } 4228 4229 if (BaseOpcode->Atomic) { 4230 Register VData0 = MI.getOperand(2).getReg(); 4231 LLT Ty = MRI->getType(VData0); 4232 4233 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4234 if (Ty.isVector()) 4235 return false; 4236 4237 if (BaseOpcode->AtomicX2) { 4238 Register VData1 = MI.getOperand(3).getReg(); 4239 // The two values are packed in one register. 4240 LLT PackedTy = LLT::vector(2, Ty); 4241 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4242 MI.getOperand(2).setReg(Concat.getReg(0)); 4243 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4244 } 4245 } 4246 4247 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4248 4249 // Optimize _L to _LZ when _L is zero 4250 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4251 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4252 const ConstantFP *ConstantLod; 4253 4254 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4255 m_GFCst(ConstantLod))) { 4256 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4257 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4258 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4259 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4260 Intr->Dim); 4261 4262 // The starting indexes should remain in the same place. 
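        // Switch to the _lz intrinsic, drop the now-redundant lod operand, and
        // continue with the updated dim info; only the vaddr count changes.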
4263 --CorrectedNumVAddrs; 4264 4265 MI.getOperand(MI.getNumExplicitDefs()) 4266 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4267 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4268 Intr = NewImageDimIntr; 4269 } 4270 } 4271 } 4272 4273 // Optimize _mip away, when 'lod' is zero 4274 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4275 int64_t ConstantLod; 4276 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4277 m_ICst(ConstantLod))) { 4278 if (ConstantLod == 0) { 4279 // TODO: Change intrinsic opcode and remove operand instead or replacing 4280 // it with 0, as the _L to _LZ handling is done above. 4281 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4282 --CorrectedNumVAddrs; 4283 } 4284 } 4285 } 4286 4287 // Rewrite the addressing register layout before doing anything else. 4288 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4289 // 16 bit gradients are supported, but are tied to the A16 control 4290 // so both gradients and addresses must be 16 bit 4291 return false; 4292 } 4293 4294 if (IsA16 && !ST.hasA16()) { 4295 // A16 not supported 4296 return false; 4297 } 4298 4299 if (IsA16 || IsG16) { 4300 if (Intr->NumVAddrs > 1) { 4301 SmallVector<Register, 4> PackedRegs; 4302 4303 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4304 IsG16); 4305 4306 // See also below in the non-a16 branch 4307 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4308 4309 if (!UseNSA && PackedRegs.size() > 1) { 4310 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4311 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4312 PackedRegs[0] = Concat.getReg(0); 4313 PackedRegs.resize(1); 4314 } 4315 4316 const unsigned NumPacked = PackedRegs.size(); 4317 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4318 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4319 if (!SrcOp.isReg()) { 4320 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4321 continue; 4322 } 4323 4324 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4325 4326 if (I - Intr->VAddrStart < NumPacked) 4327 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4328 else 4329 SrcOp.setReg(AMDGPU::NoRegister); 4330 } 4331 } 4332 } else { 4333 // If the register allocator cannot place the address registers contiguously 4334 // without introducing moves, then using the non-sequential address encoding 4335 // is always preferable, since it saves VALU instructions and is usually a 4336 // wash in terms of code size or even better. 4337 // 4338 // However, we currently have no way of hinting to the register allocator 4339 // that MIMG addresses should be placed contiguously when it is possible to 4340 // do so, so force non-NSA for the common 2-address case as a heuristic. 4341 // 4342 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4343 // allocation when possible. 4344 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4345 4346 if (!UseNSA && Intr->NumVAddrs > 1) 4347 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4348 Intr->NumVAddrs); 4349 } 4350 4351 int Flags = 0; 4352 if (IsA16) 4353 Flags |= 1; 4354 if (IsG16) 4355 Flags |= 2; 4356 MI.addOperand(MachineOperand::CreateImm(Flags)); 4357 4358 if (BaseOpcode->Store) { // No TFE for stores? 
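    // For stores the only rewrite needed here is repacking a D16 data operand
    // into the subtarget's register layout; the dmask itself is left alone for
    // now.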
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4457 4458 if (ResultNumRegs == 1) { 4459 assert(!IsTFE); 4460 ResultRegs[0] = NewResultReg; 4461 } else { 4462 // We have to repack into a new vector of some kind. 4463 for (int I = 0; I != NumDataRegs; ++I) 4464 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4465 B.buildUnmerge(ResultRegs, NewResultReg); 4466 4467 // Drop the final TFE element to get the data part. The TFE result is 4468 // directly written to the right place already. 4469 if (IsTFE) 4470 ResultRegs.resize(NumDataRegs); 4471 } 4472 4473 // For an s16 scalar result, we form an s32 result with a truncate regardless 4474 // of packed vs. unpacked. 4475 if (IsD16 && !Ty.isVector()) { 4476 B.buildTrunc(DstReg, ResultRegs[0]); 4477 return true; 4478 } 4479 4480 // Avoid a build/concat_vector of 1 entry. 4481 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4482 B.buildBitcast(DstReg, ResultRegs[0]); 4483 return true; 4484 } 4485 4486 assert(Ty.isVector()); 4487 4488 if (IsD16) { 4489 // For packed D16 results with TFE enabled, all the data components are 4490 // S32. Cast back to the expected type. 4491 // 4492 // TODO: We don't really need to use load s32 elements. We would only need one 4493 // cast for the TFE result if a multiple of v2s16 was used. 4494 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4495 for (Register &Reg : ResultRegs) 4496 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4497 } else if (ST.hasUnpackedD16VMem()) { 4498 for (Register &Reg : ResultRegs) 4499 Reg = B.buildTrunc(S16, Reg).getReg(0); 4500 } 4501 } 4502 4503 auto padWithUndef = [&](LLT Ty, int NumElts) { 4504 if (NumElts == 0) 4505 return; 4506 Register Undef = B.buildUndef(Ty).getReg(0); 4507 for (int I = 0; I != NumElts; ++I) 4508 ResultRegs.push_back(Undef); 4509 }; 4510 4511 // Pad out any elements eliminated due to the dmask. 4512 LLT ResTy = MRI->getType(ResultRegs[0]); 4513 if (!ResTy.isVector()) { 4514 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4515 B.buildBuildVector(DstReg, ResultRegs); 4516 return true; 4517 } 4518 4519 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4520 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4521 4522 // Deal with the one annoying legal case. 4523 const LLT V3S16 = LLT::vector(3, 16); 4524 if (Ty == V3S16) { 4525 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4526 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4527 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4528 return true; 4529 } 4530 4531 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4532 B.buildConcatVectors(DstReg, ResultRegs); 4533 return true; 4534 } 4535 4536 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4537 LegalizerHelper &Helper, MachineInstr &MI) const { 4538 MachineIRBuilder &B = Helper.MIRBuilder; 4539 GISelChangeObserver &Observer = Helper.Observer; 4540 4541 Register Dst = MI.getOperand(0).getReg(); 4542 LLT Ty = B.getMRI()->getType(Dst); 4543 unsigned Size = Ty.getSizeInBits(); 4544 MachineFunction &MF = B.getMF(); 4545 4546 Observer.changingInstr(MI); 4547 4548 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4549 Ty = getBitcastRegisterType(Ty); 4550 Helper.bitcastDst(MI, Ty, 0); 4551 Dst = MI.getOperand(0).getReg(); 4552 B.setInsertPt(B.getMBB(), MI); 4553 } 4554 4555 // FIXME: We don't really need this intermediate instruction. The intrinsic 4556 // should be fixed to have a memory operand. Since it's readnone, we're not 4557 // allowed to add one. 
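  // Rewrite the intrinsic in place to the target pseudo, drop the intrinsic ID
  // operand, and attach a synthesized invariant, dereferenceable load MMO so
  // later passes can reason about the access.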
4558 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4559 MI.RemoveOperand(1); // Remove intrinsic ID 4560 4561 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4562 // TODO: Should this use datalayout alignment? 4563 const unsigned MemSize = (Size + 7) / 8; 4564 const Align MemAlign(4); 4565 MachineMemOperand *MMO = MF.getMachineMemOperand( 4566 MachinePointerInfo(), 4567 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4568 MachineMemOperand::MOInvariant, 4569 MemSize, MemAlign); 4570 MI.addMemOperand(MF, MMO); 4571 4572 // There are no 96-bit result scalar loads, but widening to 128-bit should 4573 // always be legal. We may need to restore this to a 96-bit result if it turns 4574 // out this needs to be converted to a vector load during RegBankSelect. 4575 if (!isPowerOf2_32(Size)) { 4576 if (Ty.isVector()) 4577 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4578 else 4579 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4580 } 4581 4582 Observer.changedInstr(MI); 4583 return true; 4584 } 4585 4586 // TODO: Move to selection 4587 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4588 MachineRegisterInfo &MRI, 4589 MachineIRBuilder &B) const { 4590 if (!ST.isTrapHandlerEnabled() || 4591 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4592 return legalizeTrapEndpgm(MI, MRI, B); 4593 4594 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4595 switch (*HsaAbiVer) { 4596 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4597 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4598 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4599 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4600 return ST.supportsGetDoorbellID() ? 4601 legalizeTrapHsa(MI, MRI, B) : 4602 legalizeTrapHsaQueuePtr(MI, MRI, B); 4603 } 4604 } 4605 4606 llvm_unreachable("Unknown trap handler"); 4607 } 4608 4609 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4610 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4611 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4612 MI.eraseFromParent(); 4613 return true; 4614 } 4615 4616 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4617 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4618 // Pass queue pointer to trap handler as input, and insert trap instruction 4619 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4620 Register LiveIn = 4621 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4622 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4623 return false; 4624 4625 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4626 B.buildCopy(SGPR01, LiveIn); 4627 B.buildInstr(AMDGPU::S_TRAP) 4628 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4629 .addReg(SGPR01, RegState::Implicit); 4630 4631 MI.eraseFromParent(); 4632 return true; 4633 } 4634 4635 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4636 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4637 B.buildInstr(AMDGPU::S_TRAP) 4638 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4639 MI.eraseFromParent(); 4640 return true; 4641 } 4642 4643 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4644 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4645 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4646 // accordingly 4647 if (!ST.isTrapHandlerEnabled() || 4648 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4649 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4650 "debugtrap handler not supported", 4651 MI.getDebugLoc(), DS_Warning); 4652 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4653 Ctx.diagnose(NoTrap); 4654 } else { 4655 // Insert debug-trap instruction 4656 B.buildInstr(AMDGPU::S_TRAP) 4657 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4658 } 4659 4660 MI.eraseFromParent(); 4661 return true; 4662 } 4663 4664 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4665 MachineIRBuilder &B) const { 4666 MachineRegisterInfo &MRI = *B.getMRI(); 4667 const LLT S16 = LLT::scalar(16); 4668 const LLT S32 = LLT::scalar(32); 4669 4670 Register DstReg = MI.getOperand(0).getReg(); 4671 Register NodePtr = MI.getOperand(2).getReg(); 4672 Register RayExtent = MI.getOperand(3).getReg(); 4673 Register RayOrigin = MI.getOperand(4).getReg(); 4674 Register RayDir = MI.getOperand(5).getReg(); 4675 Register RayInvDir = MI.getOperand(6).getReg(); 4676 Register TDescr = MI.getOperand(7).getReg(); 4677 4678 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4679 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4680 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4681 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4682 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4683 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4684 4685 SmallVector<Register, 12> Ops; 4686 if (Is64) { 4687 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4688 Ops.push_back(Unmerge.getReg(0)); 4689 Ops.push_back(Unmerge.getReg(1)); 4690 } else { 4691 Ops.push_back(NodePtr); 4692 } 4693 Ops.push_back(RayExtent); 4694 4695 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4696 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4697 Ops.push_back(Unmerge.getReg(0)); 4698 Ops.push_back(Unmerge.getReg(1)); 4699 Ops.push_back(Unmerge.getReg(2)); 4700 }; 4701 4702 packLanes(RayOrigin); 4703 if (IsA16) { 4704 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4705 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4706 Register R1 = MRI.createGenericVirtualRegister(S32); 4707 Register R2 = MRI.createGenericVirtualRegister(S32); 4708 Register R3 = MRI.createGenericVirtualRegister(S32); 4709 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4710 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4711 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4712 Ops.push_back(R1); 4713 Ops.push_back(R2); 4714 Ops.push_back(R3); 4715 } else { 4716 packLanes(RayDir); 4717 packLanes(RayInvDir); 4718 } 4719 4720 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4721 .addDef(DstReg) 4722 .addImm(Opcode); 4723 4724 for (Register R : Ops) { 4725 MIB.addUse(R); 4726 } 4727 4728 MIB.addUse(TDescr) 4729 .addImm(IsA16 ? 1 : 0) 4730 .cloneMemRefs(MI); 4731 4732 MI.eraseFromParent(); 4733 return true; 4734 } 4735 4736 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4737 MachineInstr &MI) const { 4738 MachineIRBuilder &B = Helper.MIRBuilder; 4739 MachineRegisterInfo &MRI = *B.getMRI(); 4740 4741 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
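  // For amdgcn.if/else/loop, verifyCFIntrinsic locates the G_BRCOND (and any
  // trailing G_BR) that consumes the intrinsic's boolean result; that pair is
  // rewritten into SI_IF/SI_ELSE/SI_LOOP plus an explicit branch to the
  // condition target.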
4742 auto IntrID = MI.getIntrinsicID(); 4743 switch (IntrID) { 4744 case Intrinsic::amdgcn_if: 4745 case Intrinsic::amdgcn_else: { 4746 MachineInstr *Br = nullptr; 4747 MachineBasicBlock *UncondBrTarget = nullptr; 4748 bool Negated = false; 4749 if (MachineInstr *BrCond = 4750 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4751 const SIRegisterInfo *TRI 4752 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4753 4754 Register Def = MI.getOperand(1).getReg(); 4755 Register Use = MI.getOperand(3).getReg(); 4756 4757 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4758 4759 if (Negated) 4760 std::swap(CondBrTarget, UncondBrTarget); 4761 4762 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4763 if (IntrID == Intrinsic::amdgcn_if) { 4764 B.buildInstr(AMDGPU::SI_IF) 4765 .addDef(Def) 4766 .addUse(Use) 4767 .addMBB(UncondBrTarget); 4768 } else { 4769 B.buildInstr(AMDGPU::SI_ELSE) 4770 .addDef(Def) 4771 .addUse(Use) 4772 .addMBB(UncondBrTarget); 4773 } 4774 4775 if (Br) { 4776 Br->getOperand(0).setMBB(CondBrTarget); 4777 } else { 4778 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4779 // since we're swapping branch targets it needs to be reinserted. 4780 // FIXME: IRTranslator should probably not do this 4781 B.buildBr(*CondBrTarget); 4782 } 4783 4784 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4785 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4786 MI.eraseFromParent(); 4787 BrCond->eraseFromParent(); 4788 return true; 4789 } 4790 4791 return false; 4792 } 4793 case Intrinsic::amdgcn_loop: { 4794 MachineInstr *Br = nullptr; 4795 MachineBasicBlock *UncondBrTarget = nullptr; 4796 bool Negated = false; 4797 if (MachineInstr *BrCond = 4798 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4799 const SIRegisterInfo *TRI 4800 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4801 4802 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4803 Register Reg = MI.getOperand(2).getReg(); 4804 4805 if (Negated) 4806 std::swap(CondBrTarget, UncondBrTarget); 4807 4808 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4809 B.buildInstr(AMDGPU::SI_LOOP) 4810 .addUse(Reg) 4811 .addMBB(UncondBrTarget); 4812 4813 if (Br) 4814 Br->getOperand(0).setMBB(CondBrTarget); 4815 else 4816 B.buildBr(*CondBrTarget); 4817 4818 MI.eraseFromParent(); 4819 BrCond->eraseFromParent(); 4820 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4821 return true; 4822 } 4823 4824 return false; 4825 } 4826 case Intrinsic::amdgcn_kernarg_segment_ptr: 4827 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4828 // This only makes sense to call in a kernel, so just lower to null. 
4829 B.buildConstant(MI.getOperand(0).getReg(), 0); 4830 MI.eraseFromParent(); 4831 return true; 4832 } 4833 4834 return legalizePreloadedArgIntrin( 4835 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4836 case Intrinsic::amdgcn_implicitarg_ptr: 4837 return legalizeImplicitArgPtr(MI, MRI, B); 4838 case Intrinsic::amdgcn_workitem_id_x: 4839 return legalizePreloadedArgIntrin(MI, MRI, B, 4840 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4841 case Intrinsic::amdgcn_workitem_id_y: 4842 return legalizePreloadedArgIntrin(MI, MRI, B, 4843 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4844 case Intrinsic::amdgcn_workitem_id_z: 4845 return legalizePreloadedArgIntrin(MI, MRI, B, 4846 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4847 case Intrinsic::amdgcn_workgroup_id_x: 4848 return legalizePreloadedArgIntrin(MI, MRI, B, 4849 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4850 case Intrinsic::amdgcn_workgroup_id_y: 4851 return legalizePreloadedArgIntrin(MI, MRI, B, 4852 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4853 case Intrinsic::amdgcn_workgroup_id_z: 4854 return legalizePreloadedArgIntrin(MI, MRI, B, 4855 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4856 case Intrinsic::amdgcn_dispatch_ptr: 4857 return legalizePreloadedArgIntrin(MI, MRI, B, 4858 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4859 case Intrinsic::amdgcn_queue_ptr: 4860 return legalizePreloadedArgIntrin(MI, MRI, B, 4861 AMDGPUFunctionArgInfo::QUEUE_PTR); 4862 case Intrinsic::amdgcn_implicit_buffer_ptr: 4863 return legalizePreloadedArgIntrin( 4864 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4865 case Intrinsic::amdgcn_dispatch_id: 4866 return legalizePreloadedArgIntrin(MI, MRI, B, 4867 AMDGPUFunctionArgInfo::DISPATCH_ID); 4868 case Intrinsic::amdgcn_fdiv_fast: 4869 return legalizeFDIVFastIntrin(MI, MRI, B); 4870 case Intrinsic::amdgcn_is_shared: 4871 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4872 case Intrinsic::amdgcn_is_private: 4873 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4874 case Intrinsic::amdgcn_wavefrontsize: { 4875 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4876 MI.eraseFromParent(); 4877 return true; 4878 } 4879 case Intrinsic::amdgcn_s_buffer_load: 4880 return legalizeSBufferLoad(Helper, MI); 4881 case Intrinsic::amdgcn_raw_buffer_store: 4882 case Intrinsic::amdgcn_struct_buffer_store: 4883 return legalizeBufferStore(MI, MRI, B, false, false); 4884 case Intrinsic::amdgcn_raw_buffer_store_format: 4885 case Intrinsic::amdgcn_struct_buffer_store_format: 4886 return legalizeBufferStore(MI, MRI, B, false, true); 4887 case Intrinsic::amdgcn_raw_tbuffer_store: 4888 case Intrinsic::amdgcn_struct_tbuffer_store: 4889 return legalizeBufferStore(MI, MRI, B, true, true); 4890 case Intrinsic::amdgcn_raw_buffer_load: 4891 case Intrinsic::amdgcn_struct_buffer_load: 4892 return legalizeBufferLoad(MI, MRI, B, false, false); 4893 case Intrinsic::amdgcn_raw_buffer_load_format: 4894 case Intrinsic::amdgcn_struct_buffer_load_format: 4895 return legalizeBufferLoad(MI, MRI, B, true, false); 4896 case Intrinsic::amdgcn_raw_tbuffer_load: 4897 case Intrinsic::amdgcn_struct_tbuffer_load: 4898 return legalizeBufferLoad(MI, MRI, B, true, true); 4899 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4900 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4901 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4902 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4903 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4904 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4905 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4906 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4907 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4908 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4909 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4910 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4911 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4912 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4913 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4914 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4915 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4916 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4917 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4918 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4919 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4920 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4921 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4922 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4923 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4924 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4925 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4926 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4927 case Intrinsic::amdgcn_buffer_atomic_fadd: 4928 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4929 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4930 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4931 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4932 return legalizeBufferAtomic(MI, B, IntrID); 4933 case Intrinsic::amdgcn_atomic_inc: 4934 return legalizeAtomicIncDec(MI, B, true); 4935 case Intrinsic::amdgcn_atomic_dec: 4936 return legalizeAtomicIncDec(MI, B, false); 4937 case Intrinsic::trap: 4938 return legalizeTrapIntrinsic(MI, MRI, B); 4939 case Intrinsic::debugtrap: 4940 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4941 case Intrinsic::amdgcn_rsq_clamp: 4942 return legalizeRsqClampIntrinsic(MI, MRI, B); 4943 case Intrinsic::amdgcn_ds_fadd: 4944 case Intrinsic::amdgcn_ds_fmin: 4945 case Intrinsic::amdgcn_ds_fmax: 4946 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4947 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4948 return legalizeBVHIntrinsic(MI, B); 4949 default: { 4950 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4951 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4952 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4953 return true; 4954 } 4955 } 4956 4957 return true; 4958 } 4959