//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
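    // 128 bits corresponds to dwordx4, the widest single flat access.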
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
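/// For example, a 96-bit load with 128-bit alignment may be widened to a
/// 128-bit load, since the extra bytes are known dereferenceable from the
/// alignment.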
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
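  // G_FMAD is kept only when the relevant denormal mode flushes denormals;
  // legalizeFMad checks the function's mode and otherwise expands it.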
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc.
    // This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
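  // An s32 condition is used for uniform (SALU) selects, while divergent
  // selects use an s1 condition in VCC.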
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
legalizeInsertVectorElt(MI, MRI, B); 1704 case TargetOpcode::G_SHUFFLE_VECTOR: 1705 return legalizeShuffleVector(MI, MRI, B); 1706 case TargetOpcode::G_FSIN: 1707 case TargetOpcode::G_FCOS: 1708 return legalizeSinCos(MI, MRI, B); 1709 case TargetOpcode::G_GLOBAL_VALUE: 1710 return legalizeGlobalValue(MI, MRI, B); 1711 case TargetOpcode::G_LOAD: 1712 case TargetOpcode::G_SEXTLOAD: 1713 case TargetOpcode::G_ZEXTLOAD: 1714 return legalizeLoad(Helper, MI); 1715 case TargetOpcode::G_FMAD: 1716 return legalizeFMad(MI, MRI, B); 1717 case TargetOpcode::G_FDIV: 1718 return legalizeFDIV(MI, MRI, B); 1719 case TargetOpcode::G_UDIV: 1720 case TargetOpcode::G_UREM: 1721 case TargetOpcode::G_UDIVREM: 1722 return legalizeUnsignedDIV_REM(MI, MRI, B); 1723 case TargetOpcode::G_SDIV: 1724 case TargetOpcode::G_SREM: 1725 case TargetOpcode::G_SDIVREM: 1726 return legalizeSignedDIV_REM(MI, MRI, B); 1727 case TargetOpcode::G_ATOMIC_CMPXCHG: 1728 return legalizeAtomicCmpXChg(MI, MRI, B); 1729 case TargetOpcode::G_FLOG: 1730 return legalizeFlog(MI, B, numbers::ln2f); 1731 case TargetOpcode::G_FLOG10: 1732 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1733 case TargetOpcode::G_FEXP: 1734 return legalizeFExp(MI, B); 1735 case TargetOpcode::G_FPOW: 1736 return legalizeFPow(MI, B); 1737 case TargetOpcode::G_FFLOOR: 1738 return legalizeFFloor(MI, MRI, B); 1739 case TargetOpcode::G_BUILD_VECTOR: 1740 return legalizeBuildVector(MI, MRI, B); 1741 default: 1742 return false; 1743 } 1744 1745 llvm_unreachable("expected switch to return"); 1746 } 1747 1748 Register AMDGPULegalizerInfo::getSegmentAperture( 1749 unsigned AS, 1750 MachineRegisterInfo &MRI, 1751 MachineIRBuilder &B) const { 1752 MachineFunction &MF = B.getMF(); 1753 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1754 const LLT S32 = LLT::scalar(32); 1755 1756 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1757 1758 if (ST.hasApertureRegs()) { 1759 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1760 // getreg. 1761 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1762 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1763 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1764 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1765 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1766 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1767 unsigned Encoding = 1768 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1769 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1770 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1771 1772 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1773 1774 B.buildInstr(AMDGPU::S_GETREG_B32) 1775 .addDef(GetReg) 1776 .addImm(Encoding); 1777 MRI.setType(GetReg, S32); 1778 1779 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1780 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1781 } 1782 1783 Register QueuePtr = MRI.createGenericVirtualRegister( 1784 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1785 1786 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1787 return Register(); 1788 1789 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1790 // private_segment_aperture_base_hi. 1791 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1792 1793 // TODO: can we be smarter about machine pointer info? 
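  // Sketch of the queue-pointer path built here (illustrative only, not from
  // the original comments):
  //   %queue_ptr:_(p4) = <QUEUE_PTR preloaded argument>
  //   %addr:_(p4)      = G_PTR_ADD %queue_ptr, 0x40  ; 0x44 for private
  //   %aperture:_(s32) = G_LOAD %addr :: (dereferenceable invariant load 4)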
1794 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1795 MachineMemOperand *MMO = MF.getMachineMemOperand( 1796 PtrInfo, 1797 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1798 MachineMemOperand::MOInvariant, 1799 4, commonAlignment(Align(64), StructOffset)); 1800 1801 Register LoadAddr; 1802 1803 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1804 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1805 } 1806 1807 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1808 MachineInstr &MI, MachineRegisterInfo &MRI, 1809 MachineIRBuilder &B) const { 1810 MachineFunction &MF = B.getMF(); 1811 1812 const LLT S32 = LLT::scalar(32); 1813 Register Dst = MI.getOperand(0).getReg(); 1814 Register Src = MI.getOperand(1).getReg(); 1815 1816 LLT DstTy = MRI.getType(Dst); 1817 LLT SrcTy = MRI.getType(Src); 1818 unsigned DestAS = DstTy.getAddressSpace(); 1819 unsigned SrcAS = SrcTy.getAddressSpace(); 1820 1821 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1822 // vector element. 1823 assert(!DstTy.isVector()); 1824 1825 const AMDGPUTargetMachine &TM 1826 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1827 1828 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1829 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1830 return true; 1831 } 1832 1833 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1834 // Truncate. 1835 B.buildExtract(Dst, Src, 0); 1836 MI.eraseFromParent(); 1837 return true; 1838 } 1839 1840 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1841 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1842 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1843 1844 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1845 // another. Merge operands are required to be the same type, but creating an 1846 // extra ptrtoint would be kind of pointless. 1847 auto HighAddr = B.buildConstant( 1848 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1849 B.buildMerge(Dst, {Src, HighAddr}); 1850 MI.eraseFromParent(); 1851 return true; 1852 } 1853 1854 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1855 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1856 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1857 unsigned NullVal = TM.getNullPointerValue(DestAS); 1858 1859 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1860 auto FlatNull = B.buildConstant(SrcTy, 0); 1861 1862 // Extract low 32-bits of the pointer. 1863 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1864 1865 auto CmpRes = 1866 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1867 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1868 1869 MI.eraseFromParent(); 1870 return true; 1871 } 1872 1873 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1874 return false; 1875 1876 if (!ST.hasFlatAddressSpace()) 1877 return false; 1878 1879 auto SegmentNull = 1880 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1881 auto FlatNull = 1882 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1883 1884 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1885 if (!ApertureReg.isValid()) 1886 return false; 1887 1888 auto CmpRes = 1889 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1890 1891 // Coerce the type of the low half of the result so we can use merge_values. 
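  // Sketch of the segment -> flat cast built around this point (illustrative):
  //   %lo:_(s32)  = G_PTRTOINT %src
  //   %flat:_(p0) = G_MERGE_VALUES %lo, %aperture
  //   %nonnull    = G_ICMP ne, %src, <segment null>
  //   %dst:_(p0)  = G_SELECT %nonnull, %flat, <flat null>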
1892 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1893 1894 // TODO: Should we allow mismatched types but matching sizes in merges to 1895 // avoid the ptrtoint? 1896 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1897 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1898 1899 MI.eraseFromParent(); 1900 return true; 1901 } 1902 1903 bool AMDGPULegalizerInfo::legalizeFrint( 1904 MachineInstr &MI, MachineRegisterInfo &MRI, 1905 MachineIRBuilder &B) const { 1906 Register Src = MI.getOperand(1).getReg(); 1907 LLT Ty = MRI.getType(Src); 1908 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1909 1910 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1911 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1912 1913 auto C1 = B.buildFConstant(Ty, C1Val); 1914 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1915 1916 // TODO: Should this propagate fast-math-flags? 1917 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1918 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1919 1920 auto C2 = B.buildFConstant(Ty, C2Val); 1921 auto Fabs = B.buildFAbs(Ty, Src); 1922 1923 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1924 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1925 MI.eraseFromParent(); 1926 return true; 1927 } 1928 1929 bool AMDGPULegalizerInfo::legalizeFceil( 1930 MachineInstr &MI, MachineRegisterInfo &MRI, 1931 MachineIRBuilder &B) const { 1932 1933 const LLT S1 = LLT::scalar(1); 1934 const LLT S64 = LLT::scalar(64); 1935 1936 Register Src = MI.getOperand(1).getReg(); 1937 assert(MRI.getType(Src) == S64); 1938 1939 // result = trunc(src) 1940 // if (src > 0.0 && src != result) 1941 // result += 1.0 1942 1943 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1944 1945 const auto Zero = B.buildFConstant(S64, 0.0); 1946 const auto One = B.buildFConstant(S64, 1.0); 1947 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1948 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1949 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1950 auto Add = B.buildSelect(S64, And, One, Zero); 1951 1952 // TODO: Should this propagate fast-math-flags? 
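  // Worked example (illustrative): for src = 2.3 the trunc is 2.0, src > 0.0
  // and src != trunc, so Add selects 1.0 and the fadd below yields 3.0. For
  // src = -2.3 the compare against zero fails, Add selects 0.0, and the
  // result stays at -2.0.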
1953 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1954 return true; 1955 } 1956 1957 bool AMDGPULegalizerInfo::legalizeFrem( 1958 MachineInstr &MI, MachineRegisterInfo &MRI, 1959 MachineIRBuilder &B) const { 1960 Register DstReg = MI.getOperand(0).getReg(); 1961 Register Src0Reg = MI.getOperand(1).getReg(); 1962 Register Src1Reg = MI.getOperand(2).getReg(); 1963 auto Flags = MI.getFlags(); 1964 LLT Ty = MRI.getType(DstReg); 1965 1966 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1967 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1968 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1969 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1970 MI.eraseFromParent(); 1971 return true; 1972 } 1973 1974 static MachineInstrBuilder extractF64Exponent(Register Hi, 1975 MachineIRBuilder &B) { 1976 const unsigned FractBits = 52; 1977 const unsigned ExpBits = 11; 1978 LLT S32 = LLT::scalar(32); 1979 1980 auto Const0 = B.buildConstant(S32, FractBits - 32); 1981 auto Const1 = B.buildConstant(S32, ExpBits); 1982 1983 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1984 .addUse(Hi) 1985 .addUse(Const0.getReg(0)) 1986 .addUse(Const1.getReg(0)); 1987 1988 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1989 } 1990 1991 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1992 MachineInstr &MI, MachineRegisterInfo &MRI, 1993 MachineIRBuilder &B) const { 1994 const LLT S1 = LLT::scalar(1); 1995 const LLT S32 = LLT::scalar(32); 1996 const LLT S64 = LLT::scalar(64); 1997 1998 Register Src = MI.getOperand(1).getReg(); 1999 assert(MRI.getType(Src) == S64); 2000 2001 // TODO: Should this use extract since the low half is unused? 2002 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2003 Register Hi = Unmerge.getReg(1); 2004 2005 // Extract the upper half, since this is where we will find the sign and 2006 // exponent. 2007 auto Exp = extractF64Exponent(Hi, B); 2008 2009 const unsigned FractBits = 52; 2010 2011 // Extract the sign bit. 2012 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2013 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2014 2015 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2016 2017 const auto Zero32 = B.buildConstant(S32, 0); 2018 2019 // Extend back to 64-bits. 2020 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2021 2022 auto Shr = B.buildAShr(S64, FractMask, Exp); 2023 auto Not = B.buildNot(S64, Shr); 2024 auto Tmp0 = B.buildAnd(S64, Src, Not); 2025 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2026 2027 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2028 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2029 2030 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2031 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2032 MI.eraseFromParent(); 2033 return true; 2034 } 2035 2036 bool AMDGPULegalizerInfo::legalizeITOFP( 2037 MachineInstr &MI, MachineRegisterInfo &MRI, 2038 MachineIRBuilder &B, bool Signed) const { 2039 2040 Register Dst = MI.getOperand(0).getReg(); 2041 Register Src = MI.getOperand(1).getReg(); 2042 2043 const LLT S64 = LLT::scalar(64); 2044 const LLT S32 = LLT::scalar(32); 2045 2046 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2047 2048 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2049 2050 auto CvtHi = Signed ?
2051 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2052 B.buildUITOFP(S64, Unmerge.getReg(1)); 2053 2054 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2055 2056 auto ThirtyTwo = B.buildConstant(S32, 32); 2057 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2058 .addUse(CvtHi.getReg(0)) 2059 .addUse(ThirtyTwo.getReg(0)); 2060 2061 // TODO: Should this propagate fast-math-flags? 2062 B.buildFAdd(Dst, LdExp, CvtLo); 2063 MI.eraseFromParent(); 2064 return true; 2065 } 2066 2067 // TODO: Copied from DAG implementation. Verify logic and document how this 2068 // actually works. 2069 bool AMDGPULegalizerInfo::legalizeFPTOI( 2070 MachineInstr &MI, MachineRegisterInfo &MRI, 2071 MachineIRBuilder &B, bool Signed) const { 2072 2073 Register Dst = MI.getOperand(0).getReg(); 2074 Register Src = MI.getOperand(1).getReg(); 2075 2076 const LLT S64 = LLT::scalar(64); 2077 const LLT S32 = LLT::scalar(32); 2078 2079 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2080 2081 unsigned Flags = MI.getFlags(); 2082 2083 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2084 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2085 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2086 2087 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2088 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2089 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2090 2091 auto Hi = Signed ? 2092 B.buildFPTOSI(S32, FloorMul) : 2093 B.buildFPTOUI(S32, FloorMul); 2094 auto Lo = B.buildFPTOUI(S32, Fma); 2095 2096 B.buildMerge(Dst, { Lo, Hi }); 2097 MI.eraseFromParent(); 2098 2099 return true; 2100 } 2101 2102 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2103 MachineInstr &MI) const { 2104 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2105 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2106 2107 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2108 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2109 2110 // With ieee_mode disabled, the instructions have the correct behavior 2111 // already for G_FMINNUM/G_FMAXNUM 2112 if (!MFI->getMode().IEEE) 2113 return !IsIEEEOp; 2114 2115 if (IsIEEEOp) 2116 return true; 2117 2118 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2119 } 2120 2121 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2122 MachineInstr &MI, MachineRegisterInfo &MRI, 2123 MachineIRBuilder &B) const { 2124 // TODO: Should move some of this into LegalizerHelper. 2125 2126 // TODO: Promote dynamic indexing of s16 to s32 2127 2128 // FIXME: Artifact combiner probably should have replaced the truncated 2129 // constant before this, so we shouldn't need 2130 // getConstantVRegValWithLookThrough. 2131 Optional<ValueAndVReg> MaybeIdxVal = 2132 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2133 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2134 return true; 2135 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2136 2137 Register Dst = MI.getOperand(0).getReg(); 2138 Register Vec = MI.getOperand(1).getReg(); 2139 2140 LLT VecTy = MRI.getType(Vec); 2141 LLT EltTy = VecTy.getElementType(); 2142 assert(EltTy == MRI.getType(Dst)); 2143 2144 if (IdxVal < VecTy.getNumElements()) 2145 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2146 else 2147 B.buildUndef(Dst); 2148 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2154 MachineInstr &MI, MachineRegisterInfo &MRI, 2155 MachineIRBuilder &B) const { 2156 // TODO: Should move some of this into LegalizerHelper. 2157 2158 // TODO: Promote dynamic indexing of s16 to s32 2159 2160 // FIXME: Artifact combiner probably should have replaced the truncated 2161 // constant before this, so we shouldn't need 2162 // getConstantVRegValWithLookThrough. 2163 Optional<ValueAndVReg> MaybeIdxVal = 2164 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2165 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2166 return true; 2167 2168 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2169 Register Dst = MI.getOperand(0).getReg(); 2170 Register Vec = MI.getOperand(1).getReg(); 2171 Register Ins = MI.getOperand(2).getReg(); 2172 2173 LLT VecTy = MRI.getType(Vec); 2174 LLT EltTy = VecTy.getElementType(); 2175 assert(EltTy == MRI.getType(Ins)); 2176 2177 if (IdxVal < VecTy.getNumElements()) 2178 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2179 else 2180 B.buildUndef(Dst); 2181 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2187 MachineInstr &MI, MachineRegisterInfo &MRI, 2188 MachineIRBuilder &B) const { 2189 const LLT V2S16 = LLT::vector(2, 16); 2190 2191 Register Dst = MI.getOperand(0).getReg(); 2192 Register Src0 = MI.getOperand(1).getReg(); 2193 LLT DstTy = MRI.getType(Dst); 2194 LLT SrcTy = MRI.getType(Src0); 2195 2196 if (SrcTy == V2S16 && DstTy == V2S16 && 2197 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2198 return true; 2199 2200 MachineIRBuilder HelperBuilder(MI); 2201 GISelObserverWrapper DummyObserver; 2202 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2203 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2204 } 2205 2206 bool AMDGPULegalizerInfo::legalizeSinCos( 2207 MachineInstr &MI, MachineRegisterInfo &MRI, 2208 MachineIRBuilder &B) const { 2209 2210 Register DstReg = MI.getOperand(0).getReg(); 2211 Register SrcReg = MI.getOperand(1).getReg(); 2212 LLT Ty = MRI.getType(DstReg); 2213 unsigned Flags = MI.getFlags(); 2214 2215 Register TrigVal; 2216 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2217 if (ST.hasTrigReducedRange()) { 2218 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2219 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2220 .addUse(MulVal.getReg(0)) 2221 .setMIFlags(Flags).getReg(0); 2222 } else 2223 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2224 2225 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2226 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2227 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2228 .addUse(TrigVal) 2229 .setMIFlags(Flags); 2230 MI.eraseFromParent(); 2231 return true; 2232 } 2233 2234 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2235 MachineIRBuilder &B, 2236 const GlobalValue *GV, 2237 int64_t Offset, 2238 unsigned GAFlags) const { 2239 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2240 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2241 // to the following code sequence: 2242 // 2243 // For constant address space: 2244 // s_getpc_b64 s[0:1] 2245 // s_add_u32 s0, s0, $symbol 2246 // s_addc_u32 s1, s1, 0 2247 // 2248 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2249 // a fixup or relocation is emitted to replace $symbol with a literal 2250 // constant, which is a pc-relative offset from the encoding of the $symbol 2251 // operand to the global variable. 2252 // 2253 // For global address space: 2254 // s_getpc_b64 s[0:1] 2255 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2256 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2257 // 2258 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2259 // fixups or relocations are emitted to replace $symbol@*@lo and 2260 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2261 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2262 // operand to the global variable. 2263 // 2264 // What we want here is an offset from the value returned by s_getpc 2265 // (which is the address of the s_add_u32 instruction) to the global 2266 // variable, but since the encoding of $symbol starts 4 bytes after the start 2267 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2268 // small. This requires us to add 4 to the global variable offset in order to 2269 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2270 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2271 // instruction. 2272 2273 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2274 2275 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2276 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2277 2278 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2279 .addDef(PCReg); 2280 2281 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2282 if (GAFlags == SIInstrInfo::MO_NONE) 2283 MIB.addImm(0); 2284 else 2285 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2286 2287 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2288 2289 if (PtrTy.getSizeInBits() == 32) 2290 B.buildExtract(DstReg, PCReg, 0); 2291 return true; 2292 } 2293 2294 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2295 MachineInstr &MI, MachineRegisterInfo &MRI, 2296 MachineIRBuilder &B) const { 2297 Register DstReg = MI.getOperand(0).getReg(); 2298 LLT Ty = MRI.getType(DstReg); 2299 unsigned AS = Ty.getAddressSpace(); 2300 2301 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2302 MachineFunction &MF = B.getMF(); 2303 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2304 2305 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2306 if (!MFI->isModuleEntryFunction() && 2307 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2308 const Function &Fn = MF.getFunction(); 2309 DiagnosticInfoUnsupported BadLDSDecl( 2310 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2311 DS_Warning); 2312 Fn.getContext().diagnose(BadLDSDecl); 2313 2314 // We currently don't have a way to correctly allocate LDS objects that 2315 // aren't directly associated with a kernel. We do force inlining of 2316 // functions that use local objects. However, if these dead functions are 2317 // not eliminated, we don't want a compile time error. Just emit a warning 2318 // and a trap, since there should be no callable path here. 2319 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2320 B.buildUndef(DstReg); 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 // TODO: We could emit code to handle the initialization somewhere. 2326 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2327 const SITargetLowering *TLI = ST.getTargetLowering(); 2328 if (!TLI->shouldUseLDSConstAddress(GV)) { 2329 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2330 return true; // Leave in place. 2331 } 2332 2333 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2334 Type *Ty = GV->getValueType(); 2335 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 2336 // zero-sized type in other languages to declare dynamic shared memory 2337 // whose size is not known at compile time. These arrays are allocated 2338 // by the runtime and placed directly after the statically allocated 2339 // ones, and they all share the same offset. 2340 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2341 // Adjust alignment for that dynamic shared memory array.
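          // Illustrative example (not from the original): a HIP
          // `extern __shared__ float s[];` declaration reaches this point as a
          // zero-sized global, and its address ends up being the total static
          // LDS size produced by the groupstaticsize intrinsic below.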
2342 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2343 LLT S32 = LLT::scalar(32); 2344 auto Sz = 2345 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2346 B.buildIntToPtr(DstReg, Sz); 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 } 2351 2352 B.buildConstant( 2353 DstReg, 2354 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2355 MI.eraseFromParent(); 2356 return true; 2357 } 2358 2359 const Function &Fn = MF.getFunction(); 2360 DiagnosticInfoUnsupported BadInit( 2361 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2362 Fn.getContext().diagnose(BadInit); 2363 return true; 2364 } 2365 2366 const SITargetLowering *TLI = ST.getTargetLowering(); 2367 2368 if (TLI->shouldEmitFixup(GV)) { 2369 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2370 MI.eraseFromParent(); 2371 return true; 2372 } 2373 2374 if (TLI->shouldEmitPCReloc(GV)) { 2375 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2376 MI.eraseFromParent(); 2377 return true; 2378 } 2379 2380 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2381 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2382 2383 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2384 MachinePointerInfo::getGOT(MF), 2385 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2386 MachineMemOperand::MOInvariant, 2387 8 /*Size*/, Align(8)); 2388 2389 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2390 2391 if (Ty.getSizeInBits() == 32) { 2392 // Truncate if this is a 32-bit constant adrdess. 2393 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2394 B.buildExtract(DstReg, Load, 0); 2395 } else 2396 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2397 2398 MI.eraseFromParent(); 2399 return true; 2400 } 2401 2402 static LLT widenToNextPowerOf2(LLT Ty) { 2403 if (Ty.isVector()) 2404 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2405 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2406 } 2407 2408 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2409 MachineInstr &MI) const { 2410 MachineIRBuilder &B = Helper.MIRBuilder; 2411 MachineRegisterInfo &MRI = *B.getMRI(); 2412 GISelChangeObserver &Observer = Helper.Observer; 2413 2414 Register PtrReg = MI.getOperand(1).getReg(); 2415 LLT PtrTy = MRI.getType(PtrReg); 2416 unsigned AddrSpace = PtrTy.getAddressSpace(); 2417 2418 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2419 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2420 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2421 Observer.changingInstr(MI); 2422 MI.getOperand(1).setReg(Cast.getReg(0)); 2423 Observer.changedInstr(MI); 2424 return true; 2425 } 2426 2427 if (MI.getOpcode() != AMDGPU::G_LOAD) 2428 return false; 2429 2430 Register ValReg = MI.getOperand(0).getReg(); 2431 LLT ValTy = MRI.getType(ValReg); 2432 2433 MachineMemOperand *MMO = *MI.memoperands_begin(); 2434 const unsigned ValSize = ValTy.getSizeInBits(); 2435 const unsigned MemSize = 8 * MMO->getSize(); 2436 const Align MemAlign = MMO->getAlign(); 2437 const unsigned AlignInBits = 8 * MemAlign.value(); 2438 2439 // Widen non-power-of-2 loads to the alignment if needed 2440 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2441 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2442 2443 // This was already the correct extending load result type, so just adjust 2444 // the memory type. 
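  // Illustrative example (not from the original): an s96 load whose memory
  // size is widened to 128 bits either takes the MMO-only adjustment below
  // (when the destination type is already s128), or is rebuilt further down
  // as a wider load followed by G_TRUNC for scalars or G_EXTRACT for vectors
  // (e.g. <3 x s32> loaded as <4 x s32>).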
2445 if (WideMemSize == ValSize) { 2446 MachineFunction &MF = B.getMF(); 2447 2448 MachineMemOperand *WideMMO = 2449 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2450 Observer.changingInstr(MI); 2451 MI.setMemRefs(MF, {WideMMO}); 2452 Observer.changedInstr(MI); 2453 return true; 2454 } 2455 2456 // Don't bother handling edge case that should probably never be produced. 2457 if (ValSize > WideMemSize) 2458 return false; 2459 2460 LLT WideTy = widenToNextPowerOf2(ValTy); 2461 2462 Register WideLoad; 2463 if (!WideTy.isVector()) { 2464 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2465 B.buildTrunc(ValReg, WideLoad).getReg(0); 2466 } else { 2467 // Extract the subvector. 2468 2469 if (isRegisterType(ValTy)) { 2470 // If this a case where G_EXTRACT is legal, use it. 2471 // (e.g. <3 x s32> -> <4 x s32>) 2472 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2473 B.buildExtract(ValReg, WideLoad, 0); 2474 } else { 2475 // For cases where the widened type isn't a nice register value, unmerge 2476 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2477 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2478 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2479 B.setInsertPt(B.getMBB(), MI.getIterator()); 2480 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2481 } 2482 } 2483 2484 MI.eraseFromParent(); 2485 return true; 2486 } 2487 2488 return false; 2489 } 2490 2491 bool AMDGPULegalizerInfo::legalizeFMad( 2492 MachineInstr &MI, MachineRegisterInfo &MRI, 2493 MachineIRBuilder &B) const { 2494 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2495 assert(Ty.isScalar()); 2496 2497 MachineFunction &MF = B.getMF(); 2498 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2499 2500 // TODO: Always legal with future ftz flag. 2501 // FIXME: Do we need just output? 
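  // Illustrative summary (not from the original): G_FMAD is kept only when
  // denormals are flushed for the result type (the checks below); otherwise
  // LegalizerHelper::lowerFMad expands it into separate G_FMUL and G_FADD
  // instructions.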
2502 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2503 return true; 2504 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2505 return true; 2506 2507 MachineIRBuilder HelperBuilder(MI); 2508 GISelObserverWrapper DummyObserver; 2509 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2510 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2511 } 2512 2513 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2514 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2515 Register DstReg = MI.getOperand(0).getReg(); 2516 Register PtrReg = MI.getOperand(1).getReg(); 2517 Register CmpVal = MI.getOperand(2).getReg(); 2518 Register NewVal = MI.getOperand(3).getReg(); 2519 2520 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2521 "this should not have been custom lowered"); 2522 2523 LLT ValTy = MRI.getType(CmpVal); 2524 LLT VecTy = LLT::vector(2, ValTy); 2525 2526 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2527 2528 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2529 .addDef(DstReg) 2530 .addUse(PtrReg) 2531 .addUse(PackedVal) 2532 .setMemRefs(MI.memoperands()); 2533 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 2538 bool AMDGPULegalizerInfo::legalizeFlog( 2539 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2540 Register Dst = MI.getOperand(0).getReg(); 2541 Register Src = MI.getOperand(1).getReg(); 2542 LLT Ty = B.getMRI()->getType(Dst); 2543 unsigned Flags = MI.getFlags(); 2544 2545 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2546 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2547 2548 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2549 MI.eraseFromParent(); 2550 return true; 2551 } 2552 2553 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2554 MachineIRBuilder &B) const { 2555 Register Dst = MI.getOperand(0).getReg(); 2556 Register Src = MI.getOperand(1).getReg(); 2557 unsigned Flags = MI.getFlags(); 2558 LLT Ty = B.getMRI()->getType(Dst); 2559 2560 auto K = B.buildFConstant(Ty, numbers::log2e); 2561 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2562 B.buildFExp2(Dst, Mul, Flags); 2563 MI.eraseFromParent(); 2564 return true; 2565 } 2566 2567 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2568 MachineIRBuilder &B) const { 2569 Register Dst = MI.getOperand(0).getReg(); 2570 Register Src0 = MI.getOperand(1).getReg(); 2571 Register Src1 = MI.getOperand(2).getReg(); 2572 unsigned Flags = MI.getFlags(); 2573 LLT Ty = B.getMRI()->getType(Dst); 2574 const LLT S16 = LLT::scalar(16); 2575 const LLT S32 = LLT::scalar(32); 2576 2577 if (Ty == S32) { 2578 auto Log = B.buildFLog2(S32, Src0, Flags); 2579 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2580 .addUse(Log.getReg(0)) 2581 .addUse(Src1) 2582 .setMIFlags(Flags); 2583 B.buildFExp2(Dst, Mul, Flags); 2584 } else if (Ty == S16) { 2585 // There's no f16 fmul_legacy, so we need to convert for it. 
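    // Illustrative expansion used by both paths (not from the original
    // comments): pow(x, y) == exp2(y * log2(x)). The legacy multiply treats a
    // zero operand as forcing the product to 0, so e.g. pow(x, 0) still
    // evaluates to exp2(0) == 1 even when log2(x) is -inf or nan; for f16 the
    // multiply is done in f32 below and the result truncated back.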
2586 auto Log = B.buildFLog2(S16, Src0, Flags); 2587 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2588 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2589 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2590 .addUse(Ext0.getReg(0)) 2591 .addUse(Ext1.getReg(0)) 2592 .setMIFlags(Flags); 2593 2594 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2595 } else 2596 return false; 2597 2598 MI.eraseFromParent(); 2599 return true; 2600 } 2601 2602 // Find a source register, ignoring any possible source modifiers. 2603 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2604 Register ModSrc = OrigSrc; 2605 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2606 ModSrc = SrcFNeg->getOperand(1).getReg(); 2607 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2608 ModSrc = SrcFAbs->getOperand(1).getReg(); 2609 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2610 ModSrc = SrcFAbs->getOperand(1).getReg(); 2611 return ModSrc; 2612 } 2613 2614 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2615 MachineRegisterInfo &MRI, 2616 MachineIRBuilder &B) const { 2617 2618 const LLT S1 = LLT::scalar(1); 2619 const LLT S64 = LLT::scalar(64); 2620 Register Dst = MI.getOperand(0).getReg(); 2621 Register OrigSrc = MI.getOperand(1).getReg(); 2622 unsigned Flags = MI.getFlags(); 2623 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2624 "this should not have been custom lowered"); 2625 2626 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2627 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2628 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2629 // V_FRACT bug is: 2630 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2631 // 2632 // Convert floor(x) to (x - fract(x)) 2633 2634 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2635 .addUse(OrigSrc) 2636 .setMIFlags(Flags); 2637 2638 // Give source modifier matching some assistance before obscuring a foldable 2639 // pattern. 2640 2641 // TODO: We can avoid the neg on the fract? The input sign to fract 2642 // shouldn't matter? 2643 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2644 2645 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2646 2647 Register Min = MRI.createGenericVirtualRegister(S64); 2648 2649 // We don't need to concern ourselves with the snan handling difference, so 2650 // use the one which will directly select. 2651 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2652 if (MFI->getMode().IEEE) 2653 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2654 else 2655 B.buildFMinNum(Min, Fract, Const, Flags); 2656 2657 Register CorrectedFract = Min; 2658 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2659 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2660 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2661 } 2662 2663 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2664 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2665 2666 MI.eraseFromParent(); 2667 return true; 2668 } 2669 2670 // Turn an illegal packed v2s16 build vector into bit operations. 2671 // TODO: This should probably be a bitcast action in LegalizerHelper. 
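// Illustrative before/after for this lowering (not from the original):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32)       = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)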
2672 bool AMDGPULegalizerInfo::legalizeBuildVector( 2673 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2674 Register Dst = MI.getOperand(0).getReg(); 2675 const LLT S32 = LLT::scalar(32); 2676 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2677 2678 Register Src0 = MI.getOperand(1).getReg(); 2679 Register Src1 = MI.getOperand(2).getReg(); 2680 assert(MRI.getType(Src0) == LLT::scalar(16)); 2681 2682 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2683 B.buildBitcast(Dst, Merge); 2684 2685 MI.eraseFromParent(); 2686 return true; 2687 } 2688 2689 // Check that this is a G_XOR x, -1 2690 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2691 if (MI.getOpcode() != TargetOpcode::G_XOR) 2692 return false; 2693 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2694 return ConstVal && *ConstVal == -1; 2695 } 2696 2697 // Return the use branch instruction, otherwise null if the usage is invalid. 2698 static MachineInstr * 2699 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2700 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2701 Register CondDef = MI.getOperand(0).getReg(); 2702 if (!MRI.hasOneNonDBGUse(CondDef)) 2703 return nullptr; 2704 2705 MachineBasicBlock *Parent = MI.getParent(); 2706 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2707 2708 if (isNot(MRI, *UseMI)) { 2709 Register NegatedCond = UseMI->getOperand(0).getReg(); 2710 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2711 return nullptr; 2712 2713 // We're deleting the def of this value, so we need to remove it. 2714 UseMI->eraseFromParent(); 2715 2716 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2717 Negated = true; 2718 } 2719 2720 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2721 return nullptr; 2722 2723 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2724 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2725 if (Next == Parent->end()) { 2726 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2727 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2728 return nullptr; 2729 UncondBrTarget = &*NextMBB; 2730 } else { 2731 if (Next->getOpcode() != AMDGPU::G_BR) 2732 return nullptr; 2733 Br = &*Next; 2734 UncondBrTarget = Br->getOperand(0).getMBB(); 2735 } 2736 2737 return UseMI; 2738 } 2739 2740 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2741 const ArgDescriptor *Arg, 2742 const TargetRegisterClass *ArgRC, 2743 LLT ArgTy) const { 2744 MCRegister SrcReg = Arg->getRegister(); 2745 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2746 assert(DstReg.isVirtual() && "Virtual register expected"); 2747 2748 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2749 ArgTy); 2750 if (Arg->isMasked()) { 2751 // TODO: Should we try to emit this once in the entry block? 
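    // Illustrative example (assumed layout, not from the original): a
    // workitem ID packed into bits [19:10] of the incoming register has
    // Mask = 0xffc00, so the code below emits (LiveIn >> 10) & 0x3ff.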
2752 const LLT S32 = LLT::scalar(32); 2753 const unsigned Mask = Arg->getMask(); 2754 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2755 2756 Register AndMaskSrc = LiveIn; 2757 2758 if (Shift != 0) { 2759 auto ShiftAmt = B.buildConstant(S32, Shift); 2760 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2761 } 2762 2763 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2764 } else { 2765 B.buildCopy(DstReg, LiveIn); 2766 } 2767 2768 return true; 2769 } 2770 2771 bool AMDGPULegalizerInfo::loadInputValue( 2772 Register DstReg, MachineIRBuilder &B, 2773 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2774 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2775 const ArgDescriptor *Arg; 2776 const TargetRegisterClass *ArgRC; 2777 LLT ArgTy; 2778 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2779 2780 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2781 return false; // TODO: Handle these 2782 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2783 } 2784 2785 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2786 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2787 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2788 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2789 return false; 2790 2791 MI.eraseFromParent(); 2792 return true; 2793 } 2794 2795 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2796 MachineRegisterInfo &MRI, 2797 MachineIRBuilder &B) const { 2798 Register Dst = MI.getOperand(0).getReg(); 2799 LLT DstTy = MRI.getType(Dst); 2800 LLT S16 = LLT::scalar(16); 2801 LLT S32 = LLT::scalar(32); 2802 LLT S64 = LLT::scalar(64); 2803 2804 if (DstTy == S16) 2805 return legalizeFDIV16(MI, MRI, B); 2806 if (DstTy == S32) 2807 return legalizeFDIV32(MI, MRI, B); 2808 if (DstTy == S64) 2809 return legalizeFDIV64(MI, MRI, B); 2810 2811 return false; 2812 } 2813 2814 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2815 Register DstDivReg, 2816 Register DstRemReg, 2817 Register X, 2818 Register Y) const { 2819 const LLT S1 = LLT::scalar(1); 2820 const LLT S32 = LLT::scalar(32); 2821 2822 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2823 // algorithm used here. 2824 2825 // Initial estimate of inv(y). 2826 auto FloatY = B.buildUITOFP(S32, Y); 2827 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2828 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2829 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2830 auto Z = B.buildFPTOUI(S32, ScaledY); 2831 2832 // One round of UNR. 2833 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2834 auto NegYZ = B.buildMul(S32, NegY, Z); 2835 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2836 2837 // Quotient/remainder estimate. 2838 auto Q = B.buildUMulH(S32, X, Z); 2839 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2840 2841 // First quotient/remainder refinement. 2842 auto One = B.buildConstant(S32, 1); 2843 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2844 if (DstDivReg) 2845 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2846 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2847 2848 // Second quotient/remainder refinement. 
2849 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2850 if (DstDivReg) 2851 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2852 2853 if (DstRemReg) 2854 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2855 } 2856 2857 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2858 // 2859 // Return lo, hi of result 2860 // 2861 // %cvt.lo = G_UITOFP Val.lo 2862 // %cvt.hi = G_UITOFP Val.hi 2863 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2864 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2865 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2866 // %mul2 = G_FMUL %mul1, 2**(-32) 2867 // %trunc = G_INTRINSIC_TRUNC %mul2 2868 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2869 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2870 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2871 Register Val) { 2872 const LLT S32 = LLT::scalar(32); 2873 auto Unmerge = B.buildUnmerge(S32, Val); 2874 2875 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2876 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2877 2878 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2879 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2880 2881 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2882 auto Mul1 = 2883 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2884 2885 // 2**(-32) 2886 auto Mul2 = 2887 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2888 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2889 2890 // -(2**32) 2891 auto Mad2 = B.buildFMAD(S32, Trunc, 2892 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2893 2894 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2895 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2896 2897 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2898 } 2899 2900 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2901 Register DstDivReg, 2902 Register DstRemReg, 2903 Register Numer, 2904 Register Denom) const { 2905 const LLT S32 = LLT::scalar(32); 2906 const LLT S64 = LLT::scalar(64); 2907 const LLT S1 = LLT::scalar(1); 2908 Register RcpLo, RcpHi; 2909 2910 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2911 2912 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2913 2914 auto Zero64 = B.buildConstant(S64, 0); 2915 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2916 2917 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2918 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2919 2920 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2921 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2922 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2923 2924 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2925 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2926 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2927 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2928 2929 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2930 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2931 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2932 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2933 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2934 2935 auto Zero32 = B.buildConstant(S32, 0); 2936 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2937 auto Add2_HiC = 2938 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2939 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2940 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2941 2942 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2943 Register NumerLo =
UnmergeNumer.getReg(0); 2944 Register NumerHi = UnmergeNumer.getReg(1); 2945 2946 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2947 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2948 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2949 Register Mul3_Lo = UnmergeMul3.getReg(0); 2950 Register Mul3_Hi = UnmergeMul3.getReg(1); 2951 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2952 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2953 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2954 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2955 2956 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2957 Register DenomLo = UnmergeDenom.getReg(0); 2958 Register DenomHi = UnmergeDenom.getReg(1); 2959 2960 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2961 auto C1 = B.buildSExt(S32, CmpHi); 2962 2963 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2964 auto C2 = B.buildSExt(S32, CmpLo); 2965 2966 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2967 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2968 2969 // TODO: Here and below portions of the code can be enclosed into if/endif. 2970 // Currently control flow is unconditional and we have 4 selects after 2971 // potential endif to substitute PHIs. 2972 2973 // if C3 != 0 ... 2974 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2975 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2976 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2977 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2978 2979 auto One64 = B.buildConstant(S64, 1); 2980 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2981 2982 auto C4 = 2983 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2984 auto C5 = 2985 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2986 auto C6 = B.buildSelect( 2987 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2988 2989 // if (C6 != 0) 2990 auto Add4 = B.buildAdd(S64, Add3, One64); 2991 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2992 2993 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2994 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2995 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2996 2997 // endif C6 2998 // endif C3 2999 3000 if (DstDivReg) { 3001 auto Sel1 = B.buildSelect( 3002 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3003 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3004 Sel1, MulHi3); 3005 } 3006 3007 if (DstRemReg) { 3008 auto Sel2 = B.buildSelect( 3009 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3010 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3011 Sel2, Sub1); 3012 } 3013 } 3014 3015 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3016 MachineRegisterInfo &MRI, 3017 MachineIRBuilder &B) const { 3018 Register DstDivReg, DstRemReg; 3019 switch (MI.getOpcode()) { 3020 default: 3021 llvm_unreachable("Unexpected opcode!"); 3022 case AMDGPU::G_UDIV: { 3023 DstDivReg = MI.getOperand(0).getReg(); 3024 break; 3025 } 3026 case AMDGPU::G_UREM: { 3027 DstRemReg = MI.getOperand(0).getReg(); 3028 break; 3029 } 3030 case AMDGPU::G_UDIVREM: { 3031 DstDivReg = MI.getOperand(0).getReg(); 3032 DstRemReg = MI.getOperand(1).getReg(); 3033 break; 3034 } 3035 } 3036 3037 const LLT S64 = LLT::scalar(64); 3038 const LLT S32 = LLT::scalar(32); 3039 const unsigned 
FirstSrcOpIdx = MI.getNumExplicitDefs(); 3040 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3041 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3042 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3043 3044 if (Ty == S32) 3045 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3046 else if (Ty == S64) 3047 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3048 else 3049 return false; 3050 3051 MI.eraseFromParent(); 3052 return true; 3053 } 3054 3055 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3056 MachineRegisterInfo &MRI, 3057 MachineIRBuilder &B) const { 3058 const LLT S64 = LLT::scalar(64); 3059 const LLT S32 = LLT::scalar(32); 3060 3061 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3062 if (Ty != S32 && Ty != S64) 3063 return false; 3064 3065 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3066 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3067 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3068 3069 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3070 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3071 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3072 3073 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3074 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3075 3076 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3077 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3078 3079 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3080 switch (MI.getOpcode()) { 3081 default: 3082 llvm_unreachable("Unexpected opcode!"); 3083 case AMDGPU::G_SDIV: { 3084 DstDivReg = MI.getOperand(0).getReg(); 3085 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3086 break; 3087 } 3088 case AMDGPU::G_SREM: { 3089 DstRemReg = MI.getOperand(0).getReg(); 3090 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3091 break; 3092 } 3093 case AMDGPU::G_SDIVREM: { 3094 DstDivReg = MI.getOperand(0).getReg(); 3095 DstRemReg = MI.getOperand(1).getReg(); 3096 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3097 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3098 break; 3099 } 3100 } 3101 3102 if (Ty == S32) 3103 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3104 else 3105 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3106 3107 if (DstDivReg) { 3108 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3109 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3110 B.buildSub(DstDivReg, SignXor, Sign); 3111 } 3112 3113 if (DstRemReg) { 3114 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3115 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3116 B.buildSub(DstRemReg, SignXor, Sign); 3117 } 3118 3119 MI.eraseFromParent(); 3120 return true; 3121 } 3122 3123 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3124 MachineRegisterInfo &MRI, 3125 MachineIRBuilder &B) const { 3126 Register Res = MI.getOperand(0).getReg(); 3127 Register LHS = MI.getOperand(1).getReg(); 3128 Register RHS = MI.getOperand(2).getReg(); 3129 uint16_t Flags = MI.getFlags(); 3130 LLT ResTy = MRI.getType(Res); 3131 3132 const MachineFunction &MF = B.getMF(); 3133 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3134 MI.getFlag(MachineInstr::FmAfn); 3135 3136 if (!AllowInaccurateRcp) 3137 return false; 3138 3139 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3140 // 1 / x -> RCP(x) 3141 if (CLHS->isExactlyValue(1.0)) { 3142 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3143 .addUse(RHS) 3144 .setMIFlags(Flags); 3145 3146 
MI.eraseFromParent(); 3147 return true; 3148 } 3149 3150 // -1 / x -> RCP( FNEG(x) ) 3151 if (CLHS->isExactlyValue(-1.0)) { 3152 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3153 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3154 .addUse(FNeg.getReg(0)) 3155 .setMIFlags(Flags); 3156 3157 MI.eraseFromParent(); 3158 return true; 3159 } 3160 } 3161 3162 // x / y -> x * (1.0 / y) 3163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3164 .addUse(RHS) 3165 .setMIFlags(Flags); 3166 B.buildFMul(Res, LHS, RCP, Flags); 3167 3168 MI.eraseFromParent(); 3169 return true; 3170 } 3171 3172 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3173 MachineRegisterInfo &MRI, 3174 MachineIRBuilder &B) const { 3175 Register Res = MI.getOperand(0).getReg(); 3176 Register X = MI.getOperand(1).getReg(); 3177 Register Y = MI.getOperand(2).getReg(); 3178 uint16_t Flags = MI.getFlags(); 3179 LLT ResTy = MRI.getType(Res); 3180 3181 const MachineFunction &MF = B.getMF(); 3182 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3183 MI.getFlag(MachineInstr::FmAfn); 3184 3185 if (!AllowInaccurateRcp) 3186 return false; 3187 3188 auto NegY = B.buildFNeg(ResTy, Y); 3189 auto One = B.buildFConstant(ResTy, 1.0); 3190 3191 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3192 .addUse(Y) 3193 .setMIFlags(Flags); 3194 3195 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3196 R = B.buildFMA(ResTy, Tmp0, R, R); 3197 3198 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3199 R = B.buildFMA(ResTy, Tmp1, R, R); 3200 3201 auto Ret = B.buildFMul(ResTy, X, R); 3202 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3203 3204 B.buildFMA(Res, Tmp2, R, Ret); 3205 MI.eraseFromParent(); 3206 return true; 3207 } 3208 3209 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3210 MachineRegisterInfo &MRI, 3211 MachineIRBuilder &B) const { 3212 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3213 return true; 3214 3215 Register Res = MI.getOperand(0).getReg(); 3216 Register LHS = MI.getOperand(1).getReg(); 3217 Register RHS = MI.getOperand(2).getReg(); 3218 3219 uint16_t Flags = MI.getFlags(); 3220 3221 LLT S16 = LLT::scalar(16); 3222 LLT S32 = LLT::scalar(32); 3223 3224 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3225 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3226 3227 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3228 .addUse(RHSExt.getReg(0)) 3229 .setMIFlags(Flags); 3230 3231 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3232 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3233 3234 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3235 .addUse(RDst.getReg(0)) 3236 .addUse(RHS) 3237 .addUse(LHS) 3238 .setMIFlags(Flags); 3239 3240 MI.eraseFromParent(); 3241 return true; 3242 } 3243 3244 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3245 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3246 static void toggleSPDenormMode(bool Enable, 3247 MachineIRBuilder &B, 3248 const GCNSubtarget &ST, 3249 AMDGPU::SIModeRegisterDefaults Mode) { 3250 // Set SP denorm mode to this value. 3251 unsigned SPDenormMode = 3252 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3253 3254 if (ST.hasDenormModeInst()) { 3255 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
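    // Illustrative layout note (not from the original): the S_DENORM_MODE
    // immediate packs the FP32 field in bits [1:0] and the FP64/FP16 field in
    // bits [3:2], which is what SPDenormMode | (DPDenormModeDefault << 2)
    // below constructs.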
3256 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3257 3258 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3259 B.buildInstr(AMDGPU::S_DENORM_MODE) 3260 .addImm(NewDenormModeValue); 3261 3262 } else { 3263 // Select FP32 bit field in mode register. 3264 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3265 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3266 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3267 3268 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3269 .addImm(SPDenormMode) 3270 .addImm(SPDenormModeBitField); 3271 } 3272 } 3273 3274 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3275 MachineRegisterInfo &MRI, 3276 MachineIRBuilder &B) const { 3277 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3278 return true; 3279 3280 Register Res = MI.getOperand(0).getReg(); 3281 Register LHS = MI.getOperand(1).getReg(); 3282 Register RHS = MI.getOperand(2).getReg(); 3283 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3284 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3285 3286 uint16_t Flags = MI.getFlags(); 3287 3288 LLT S32 = LLT::scalar(32); 3289 LLT S1 = LLT::scalar(1); 3290 3291 auto One = B.buildFConstant(S32, 1.0f); 3292 3293 auto DenominatorScaled = 3294 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3295 .addUse(LHS) 3296 .addUse(RHS) 3297 .addImm(0) 3298 .setMIFlags(Flags); 3299 auto NumeratorScaled = 3300 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3301 .addUse(LHS) 3302 .addUse(RHS) 3303 .addImm(1) 3304 .setMIFlags(Flags); 3305 3306 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3307 .addUse(DenominatorScaled.getReg(0)) 3308 .setMIFlags(Flags); 3309 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3310 3311 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3312 // aren't modeled as reading it. 
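  // Sketch of the remaining sequence (illustrative): with FP32 denormals
  // temporarily enabled, two FMAs refine the reciprocal of the scaled
  // denominator, a further FMA chain refines the scaled quotient, and the
  // div_fmas / div_fixup intrinsics below apply the final scaling and
  // special-case handling.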
3313 if (!Mode.allFP32Denormals()) 3314 toggleSPDenormMode(true, B, ST, Mode); 3315 3316 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3317 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3318 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3319 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3320 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3321 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3322 3323 if (!Mode.allFP32Denormals()) 3324 toggleSPDenormMode(false, B, ST, Mode); 3325 3326 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3327 .addUse(Fma4.getReg(0)) 3328 .addUse(Fma1.getReg(0)) 3329 .addUse(Fma3.getReg(0)) 3330 .addUse(NumeratorScaled.getReg(1)) 3331 .setMIFlags(Flags); 3332 3333 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3334 .addUse(Fmas.getReg(0)) 3335 .addUse(RHS) 3336 .addUse(LHS) 3337 .setMIFlags(Flags); 3338 3339 MI.eraseFromParent(); 3340 return true; 3341 } 3342 3343 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3344 MachineRegisterInfo &MRI, 3345 MachineIRBuilder &B) const { 3346 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3347 return true; 3348 3349 Register Res = MI.getOperand(0).getReg(); 3350 Register LHS = MI.getOperand(1).getReg(); 3351 Register RHS = MI.getOperand(2).getReg(); 3352 3353 uint16_t Flags = MI.getFlags(); 3354 3355 LLT S64 = LLT::scalar(64); 3356 LLT S1 = LLT::scalar(1); 3357 3358 auto One = B.buildFConstant(S64, 1.0); 3359 3360 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3361 .addUse(LHS) 3362 .addUse(RHS) 3363 .addImm(0) 3364 .setMIFlags(Flags); 3365 3366 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3367 3368 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3369 .addUse(DivScale0.getReg(0)) 3370 .setMIFlags(Flags); 3371 3372 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3373 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3374 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3375 3376 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3377 .addUse(LHS) 3378 .addUse(RHS) 3379 .addImm(1) 3380 .setMIFlags(Flags); 3381 3382 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3383 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3384 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3385 3386 Register Scale; 3387 if (!ST.hasUsableDivScaleConditionOutput()) { 3388 // Workaround a hardware bug on SI where the condition output from div_scale 3389 // is not usable. 
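// Reconstruct an equivalent condition by comparing the high halves of the numerator and denominator against the high halves of the div_scale results, then combining the two compares with xor.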
3390 3391 LLT S32 = LLT::scalar(32); 3392 3393 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3394 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3395 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3396 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3397 3398 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3399 Scale1Unmerge.getReg(1)); 3400 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3401 Scale0Unmerge.getReg(1)); 3402 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3403 } else { 3404 Scale = DivScale1.getReg(1); 3405 } 3406 3407 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3408 .addUse(Fma4.getReg(0)) 3409 .addUse(Fma3.getReg(0)) 3410 .addUse(Mul.getReg(0)) 3411 .addUse(Scale) 3412 .setMIFlags(Flags); 3413 3414 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3415 .addUse(Fmas.getReg(0)) 3416 .addUse(RHS) 3417 .addUse(LHS) 3418 .setMIFlags(Flags); 3419 3420 MI.eraseFromParent(); 3421 return true; 3422 } 3423 3424 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3425 MachineRegisterInfo &MRI, 3426 MachineIRBuilder &B) const { 3427 Register Res = MI.getOperand(0).getReg(); 3428 Register LHS = MI.getOperand(2).getReg(); 3429 Register RHS = MI.getOperand(3).getReg(); 3430 uint16_t Flags = MI.getFlags(); 3431 3432 LLT S32 = LLT::scalar(32); 3433 LLT S1 = LLT::scalar(1); 3434 3435 auto Abs = B.buildFAbs(S32, RHS, Flags); 3436 const APFloat C0Val(1.0f); 3437 3438 auto C0 = B.buildConstant(S32, 0x6f800000); 3439 auto C1 = B.buildConstant(S32, 0x2f800000); 3440 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3441 3442 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3443 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3444 3445 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3446 3447 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3448 .addUse(Mul0.getReg(0)) 3449 .setMIFlags(Flags); 3450 3451 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3452 3453 B.buildFMul(Res, Sel, Mul1, Flags); 3454 3455 MI.eraseFromParent(); 3456 return true; 3457 } 3458 3459 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3460 // FIXME: Why do we handle this one but not other removed instructions? 3461 // 3462 // Reciprocal square root. The clamp prevents infinite results, clamping 3463 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3464 // +-max_float. 3465 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3466 MachineRegisterInfo &MRI, 3467 MachineIRBuilder &B) const { 3468 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3469 return true; 3470 3471 Register Dst = MI.getOperand(0).getReg(); 3472 Register Src = MI.getOperand(2).getReg(); 3473 auto Flags = MI.getFlags(); 3474 3475 LLT Ty = MRI.getType(Dst); 3476 3477 const fltSemantics *FltSemantics; 3478 if (Ty == LLT::scalar(32)) 3479 FltSemantics = &APFloat::IEEEsingle(); 3480 else if (Ty == LLT::scalar(64)) 3481 FltSemantics = &APFloat::IEEEdouble(); 3482 else 3483 return false; 3484 3485 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3486 .addUse(Src) 3487 .setMIFlags(Flags); 3488 3489 // We don't need to concern ourselves with the snan handling difference, since 3490 // the rsq quieted (or not) so use the one which will directly select. 
3491 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3492 const bool UseIEEE = MFI->getMode().IEEE; 3493 3494 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3495 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3496 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3497 3498 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3499 3500 if (UseIEEE) 3501 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3502 else 3503 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3504 MI.eraseFromParent(); 3505 return true; 3506 } 3507 3508 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3509 switch (IID) { 3510 case Intrinsic::amdgcn_ds_fadd: 3511 return AMDGPU::G_ATOMICRMW_FADD; 3512 case Intrinsic::amdgcn_ds_fmin: 3513 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3514 case Intrinsic::amdgcn_ds_fmax: 3515 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3516 default: 3517 llvm_unreachable("not a DS FP intrinsic"); 3518 } 3519 } 3520 3521 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3522 MachineInstr &MI, 3523 Intrinsic::ID IID) const { 3524 GISelChangeObserver &Observer = Helper.Observer; 3525 Observer.changingInstr(MI); 3526 3527 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3528 3529 // The remaining operands were used to set fields in the MemOperand on 3530 // construction. 3531 for (int I = 6; I > 3; --I) 3532 MI.RemoveOperand(I); 3533 3534 MI.RemoveOperand(1); // Remove the intrinsic ID. 3535 Observer.changedInstr(MI); 3536 return true; 3537 } 3538 3539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3540 MachineRegisterInfo &MRI, 3541 MachineIRBuilder &B) const { 3542 uint64_t Offset = 3543 ST.getTargetLowering()->getImplicitParameterOffset( 3544 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3545 LLT DstTy = MRI.getType(DstReg); 3546 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3547 3548 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3549 if (!loadInputValue(KernargPtrReg, B, 3550 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3551 return false; 3552 3553 // FIXME: This should be nuw 3554 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3555 return true; 3556 } 3557 3558 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3559 MachineRegisterInfo &MRI, 3560 MachineIRBuilder &B) const { 3561 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3562 if (!MFI->isEntryFunction()) { 3563 return legalizePreloadedArgIntrin(MI, MRI, B, 3564 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3565 } 3566 3567 Register DstReg = MI.getOperand(0).getReg(); 3568 if (!getImplicitArgPtr(DstReg, MRI, B)) 3569 return false; 3570 3571 MI.eraseFromParent(); 3572 return true; 3573 } 3574 3575 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3576 MachineRegisterInfo &MRI, 3577 MachineIRBuilder &B, 3578 unsigned AddrSpace) const { 3579 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3580 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3581 Register Hi32 = Unmerge.getReg(1); 3582 3583 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3584 MI.eraseFromParent(); 3585 return true; 3586 } 3587 3588 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3589 // offset (the offset that is included in bounds checking and swizzling, to be 3590 // split between the instruction's voffset and immoffset fields) 
and soffset 3591 // (the offset that is excluded from bounds checking and swizzling, to go in 3592 // the instruction's soffset field). This function takes the first kind of 3593 // offset and figures out how to split it between voffset and immoffset. 3594 std::tuple<Register, unsigned, unsigned> 3595 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3596 Register OrigOffset) const { 3597 const unsigned MaxImm = 4095; 3598 Register BaseReg; 3599 unsigned TotalConstOffset; 3600 const LLT S32 = LLT::scalar(32); 3601 MachineRegisterInfo &MRI = *B.getMRI(); 3602 3603 std::tie(BaseReg, TotalConstOffset) = 3604 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3605 3606 unsigned ImmOffset = TotalConstOffset; 3607 3608 // If BaseReg is a pointer, convert it to int. 3609 if (MRI.getType(BaseReg).isPointer()) 3610 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3611 3612 // If the immediate value is too big for the immoffset field, put the value 3613 // and -4096 into the immoffset field so that the value that is copied/added 3614 // for the voffset field is a multiple of 4096, and it stands more chance 3615 // of being CSEd with the copy/add for another similar load/store. 3616 // However, do not do that rounding down to a multiple of 4096 if that is a 3617 // negative number, as it appears to be illegal to have a negative offset 3618 // in the vgpr, even if adding the immediate offset makes it positive. 3619 unsigned Overflow = ImmOffset & ~MaxImm; 3620 ImmOffset -= Overflow; 3621 if ((int32_t)Overflow < 0) { 3622 Overflow += ImmOffset; 3623 ImmOffset = 0; 3624 } 3625 3626 if (Overflow != 0) { 3627 if (!BaseReg) { 3628 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3629 } else { 3630 auto OverflowVal = B.buildConstant(S32, Overflow); 3631 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3632 } 3633 } 3634 3635 if (!BaseReg) 3636 BaseReg = B.buildConstant(S32, 0).getReg(0); 3637 3638 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3639 } 3640 3641 /// Handle register layout difference for f16 images for some subtargets. 
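/// On subtargets with unpacked D16 VMEM, each 16-bit element is widened into the low half of its own 32-bit register; \p ImageStore selects the extra repacking needed by subtargets with the D16 image store bug.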
3642 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3643 MachineRegisterInfo &MRI, 3644 Register Reg, 3645 bool ImageStore) const { 3646 const LLT S16 = LLT::scalar(16); 3647 const LLT S32 = LLT::scalar(32); 3648 LLT StoreVT = MRI.getType(Reg); 3649 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3650 3651 if (ST.hasUnpackedD16VMem()) { 3652 auto Unmerge = B.buildUnmerge(S16, Reg); 3653 3654 SmallVector<Register, 4> WideRegs; 3655 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3656 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3657 3658 int NumElts = StoreVT.getNumElements(); 3659 3660 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3661 } 3662 3663 if (ImageStore && ST.hasImageStoreD16Bug()) { 3664 if (StoreVT.getNumElements() == 2) { 3665 SmallVector<Register, 4> PackedRegs; 3666 Reg = B.buildBitcast(S32, Reg).getReg(0); 3667 PackedRegs.push_back(Reg); 3668 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3669 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3670 } 3671 3672 if (StoreVT.getNumElements() == 3) { 3673 SmallVector<Register, 4> PackedRegs; 3674 auto Unmerge = B.buildUnmerge(S16, Reg); 3675 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3676 PackedRegs.push_back(Unmerge.getReg(I)); 3677 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3678 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3679 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3680 } 3681 3682 if (StoreVT.getNumElements() == 4) { 3683 SmallVector<Register, 4> PackedRegs; 3684 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3685 auto Unmerge = B.buildUnmerge(S32, Reg); 3686 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3687 PackedRegs.push_back(Unmerge.getReg(I)); 3688 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3689 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3690 } 3691 3692 llvm_unreachable("invalid data type"); 3693 } 3694 3695 return Reg; 3696 } 3697 3698 Register AMDGPULegalizerInfo::fixStoreSourceType( 3699 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3700 MachineRegisterInfo *MRI = B.getMRI(); 3701 LLT Ty = MRI->getType(VData); 3702 3703 const LLT S16 = LLT::scalar(16); 3704 3705 // Fixup illegal register types for i8 stores. 3706 if (Ty == LLT::scalar(8) || Ty == S16) { 3707 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3708 return AnyExt; 3709 } 3710 3711 if (Ty.isVector()) { 3712 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3713 if (IsFormat) 3714 return handleD16VData(B, *MRI, VData); 3715 } 3716 } 3717 3718 return VData; 3719 } 3720 3721 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3722 MachineRegisterInfo &MRI, 3723 MachineIRBuilder &B, 3724 bool IsTyped, 3725 bool IsFormat) const { 3726 Register VData = MI.getOperand(1).getReg(); 3727 LLT Ty = MRI.getType(VData); 3728 LLT EltTy = Ty.getScalarType(); 3729 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3730 const LLT S32 = LLT::scalar(32); 3731 3732 VData = fixStoreSourceType(B, VData, IsFormat); 3733 Register RSrc = MI.getOperand(2).getReg(); 3734 3735 MachineMemOperand *MMO = *MI.memoperands_begin(); 3736 const int MemSize = MMO->getSize(); 3737 3738 unsigned ImmOffset; 3739 unsigned TotalOffset; 3740 3741 // The typed intrinsics add an immediate after the registers. 3742 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3743 3744 // The struct intrinsic variants add one additional operand over raw. 3745 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3746 Register VIndex; 3747 int OpOffset = 0; 3748 if (HasVIndex) { 3749 VIndex = MI.getOperand(3).getReg(); 3750 OpOffset = 1; 3751 } 3752 3753 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3754 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3755 3756 unsigned Format = 0; 3757 if (IsTyped) { 3758 Format = MI.getOperand(5 + OpOffset).getImm(); 3759 ++OpOffset; 3760 } 3761 3762 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3763 3764 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3765 if (TotalOffset != 0) 3766 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3767 3768 unsigned Opc; 3769 if (IsTyped) { 3770 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3771 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3772 } else if (IsFormat) { 3773 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3774 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3775 } else { 3776 switch (MemSize) { 3777 case 1: 3778 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3779 break; 3780 case 2: 3781 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3782 break; 3783 default: 3784 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3785 break; 3786 } 3787 } 3788 3789 if (!VIndex) 3790 VIndex = B.buildConstant(S32, 0).getReg(0); 3791 3792 auto MIB = B.buildInstr(Opc) 3793 .addUse(VData) // vdata 3794 .addUse(RSrc) // rsrc 3795 .addUse(VIndex) // vindex 3796 .addUse(VOffset) // voffset 3797 .addUse(SOffset) // soffset 3798 .addImm(ImmOffset); // offset(imm) 3799 3800 if (IsTyped) 3801 MIB.addImm(Format); 3802 3803 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3804 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3805 .addMemOperand(MMO); 3806 3807 MI.eraseFromParent(); 3808 return true; 3809 } 3810 3811 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3812 MachineRegisterInfo &MRI, 3813 MachineIRBuilder &B, 3814 bool IsFormat, 3815 bool IsTyped) const { 3816 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3817 MachineMemOperand *MMO = *MI.memoperands_begin(); 3818 const int MemSize = MMO->getSize(); 3819 const LLT S32 = LLT::scalar(32); 3820 3821 Register Dst = MI.getOperand(0).getReg(); 3822 Register RSrc = MI.getOperand(2).getReg(); 3823 3824 // The typed intrinsics add an immediate after the registers. 3825 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3826 3827 // The struct intrinsic variants add one additional operand over raw. 
3828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3829 Register VIndex;
3830 int OpOffset = 0;
3831 if (HasVIndex) {
3832 VIndex = MI.getOperand(3).getReg();
3833 OpOffset = 1;
3834 }
3835
3836 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3837 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3838
3839 unsigned Format = 0;
3840 if (IsTyped) {
3841 Format = MI.getOperand(5 + OpOffset).getImm();
3842 ++OpOffset;
3843 }
3844
3845 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3846 unsigned ImmOffset;
3847 unsigned TotalOffset;
3848
3849 LLT Ty = MRI.getType(Dst);
3850 LLT EltTy = Ty.getScalarType();
3851 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3852 const bool Unpacked = ST.hasUnpackedD16VMem();
3853
3854 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3855 if (TotalOffset != 0)
3856 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3857
3858 unsigned Opc;
3859
3860 if (IsTyped) {
3861 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3862 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3863 } else if (IsFormat) {
3864 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3865 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3866 } else {
3867 switch (MemSize) {
3868 case 1:
3869 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3870 break;
3871 case 2:
3872 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3873 break;
3874 default:
3875 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3876 break;
3877 }
3878 }
3879
3880 Register LoadDstReg;
3881
3882 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3883 LLT UnpackedTy = Ty.changeElementSize(32);
3884
3885 if (IsExtLoad)
3886 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3887 else if (Unpacked && IsD16 && Ty.isVector())
3888 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3889 else
3890 LoadDstReg = Dst;
3891
3892 if (!VIndex)
3893 VIndex = B.buildConstant(S32, 0).getReg(0);
3894
3895 auto MIB = B.buildInstr(Opc)
3896 .addDef(LoadDstReg) // vdata
3897 .addUse(RSrc) // rsrc
3898 .addUse(VIndex) // vindex
3899 .addUse(VOffset) // voffset
3900 .addUse(SOffset) // soffset
3901 .addImm(ImmOffset); // offset(imm)
3902
3903 if (IsTyped)
3904 MIB.addImm(Format);
3905
3906 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3907 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3908 .addMemOperand(MMO);
3909
3910 if (LoadDstReg != Dst) {
3911 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3912
3913 // The result of an extending load was widened to 32 bits; truncate it back to the narrow result type.
3914 if (IsExtLoad)
3915 B.buildTrunc(Dst, LoadDstReg);
3916 else {
3917 // Repack to original 16-bit vector result
3918 // FIXME: G_TRUNC should work, but legalization currently fails
3919 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3920 SmallVector<Register, 4> Repack;
3921 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3922 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3923 B.buildMerge(Dst, Repack);
3924 }
3925 }
3926
3927 MI.eraseFromParent();
3928 return true;
3929 }
3930
3931 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3932 MachineIRBuilder &B,
3933 bool IsInc) const {
3934 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3935 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3936 B.buildInstr(Opc) 3937 .addDef(MI.getOperand(0).getReg()) 3938 .addUse(MI.getOperand(2).getReg()) 3939 .addUse(MI.getOperand(3).getReg()) 3940 .cloneMemRefs(MI); 3941 MI.eraseFromParent(); 3942 return true; 3943 } 3944 3945 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3946 switch (IntrID) { 3947 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3948 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3949 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3950 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3951 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3952 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3953 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3954 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3956 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3957 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3958 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3959 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3960 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3961 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3962 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3963 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3964 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3965 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3966 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3967 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3968 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3969 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3971 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3972 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3973 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3974 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3975 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3976 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3977 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3978 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3979 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3980 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3981 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3982 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3983 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3984 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3985 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3986 case Intrinsic::amdgcn_buffer_atomic_fadd: 3987 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3988 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3990 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3991 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3992 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3993 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3994 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3996 default: 3997 llvm_unreachable("unhandled atomic opcode"); 3998 } 3999 } 4000 4001 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4002 MachineIRBuilder &B, 4003 Intrinsic::ID IID) const { 4004 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4005 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4006 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4007 4008 Register Dst; 4009 4010 int OpOffset = 0; 4011 if (HasReturn) { 4012 // A few FP atomics do not support return values. 
4013 Dst = MI.getOperand(0).getReg(); 4014 } else { 4015 OpOffset = -1; 4016 } 4017 4018 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4019 Register CmpVal; 4020 4021 if (IsCmpSwap) { 4022 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4023 ++OpOffset; 4024 } 4025 4026 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4027 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4028 4029 // The struct intrinsic variants add one additional operand over raw. 4030 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4031 Register VIndex; 4032 if (HasVIndex) { 4033 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4034 ++OpOffset; 4035 } 4036 4037 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4038 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4039 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4040 4041 MachineMemOperand *MMO = *MI.memoperands_begin(); 4042 4043 unsigned ImmOffset; 4044 unsigned TotalOffset; 4045 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4046 if (TotalOffset != 0) 4047 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4048 4049 if (!VIndex) 4050 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4051 4052 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4053 4054 if (HasReturn) 4055 MIB.addDef(Dst); 4056 4057 MIB.addUse(VData); // vdata 4058 4059 if (IsCmpSwap) 4060 MIB.addReg(CmpVal); 4061 4062 MIB.addUse(RSrc) // rsrc 4063 .addUse(VIndex) // vindex 4064 .addUse(VOffset) // voffset 4065 .addUse(SOffset) // soffset 4066 .addImm(ImmOffset) // offset(imm) 4067 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4068 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4069 .addMemOperand(MMO); 4070 4071 MI.eraseFromParent(); 4072 return true; 4073 } 4074 4075 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4076 /// vector with s16 typed elements. 4077 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4078 SmallVectorImpl<Register> &PackedAddrs, 4079 unsigned ArgOffset, 4080 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4081 bool IsA16, bool IsG16) { 4082 const LLT S16 = LLT::scalar(16); 4083 const LLT V2S16 = LLT::vector(2, 16); 4084 auto EndIdx = Intr->VAddrEnd; 4085 4086 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4087 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4088 if (!SrcOp.isReg()) 4089 continue; // _L to _LZ may have eliminated this. 4090 4091 Register AddrReg = SrcOp.getReg(); 4092 4093 if (I < Intr->GradientStart) { 4094 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4095 PackedAddrs.push_back(AddrReg); 4096 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4097 (I >= Intr->CoordStart && !IsA16)) { 4098 // Handle any gradient or coordinate operands that should not be packed 4099 PackedAddrs.push_back(AddrReg); 4100 } else { 4101 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4102 // derivatives dx/dh and dx/dv are packed with undef. 
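// When the following operand can supply the high half, two consecutive 16-bit values are packed into a single V2S16 register and the second operand is skipped.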
4103 if (((I + 1) >= EndIdx) ||
4104 ((Intr->NumGradients / 2) % 2 == 1 &&
4105 (I == static_cast<unsigned>(Intr->GradientStart +
4106 (Intr->NumGradients / 2) - 1) ||
4107 I == static_cast<unsigned>(Intr->GradientStart +
4108 Intr->NumGradients - 1))) ||
4109 // Check for _L to _LZ optimization
4110 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4111 PackedAddrs.push_back(
4112 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4113 .getReg(0));
4114 } else {
4115 PackedAddrs.push_back(
4116 B.buildBuildVector(
4117 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4118 .getReg(0));
4119 ++I;
4120 }
4121 }
4122 }
4123 }
4124
4125 /// Convert from separate vaddr components to a single vector address register,
4126 /// and replace the remaining operands with $noreg.
4127 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4128 int DimIdx, int NumVAddrs) {
4129 const LLT S32 = LLT::scalar(32);
4130
4131 SmallVector<Register, 8> AddrRegs;
4132 for (int I = 0; I != NumVAddrs; ++I) {
4133 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4134 if (SrcOp.isReg()) {
4135 AddrRegs.push_back(SrcOp.getReg());
4136 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4137 }
4138 }
4139
4140 int NumAddrRegs = AddrRegs.size();
4141 if (NumAddrRegs != 1) {
4142 // Round up to 8 elements for v5-v7
4143 // FIXME: Missing intermediate sized register classes and instructions.
4144 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4145 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4146 auto Undef = B.buildUndef(S32);
4147 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4148 NumAddrRegs = RoundedNumRegs;
4149 }
4150
4151 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4152 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4153 }
4154
4155 for (int I = 1; I != NumVAddrs; ++I) {
4156 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4157 if (SrcOp.isReg())
4158 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4159 }
4160 }
4161
4162 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4163 ///
4164 /// Depending on the subtarget, load/store with 16-bit element data need to be
4165 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4166 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4167 /// registers.
4168 ///
4169 /// We don't want to directly select image instructions just yet, but also want
4170 /// to expose all register repacking to the legalizer/combiners. We also don't
4171 /// want a selected instruction entering RegBankSelect. In order to avoid
4172 /// defining a multitude of intermediate image instructions, directly hack on
4173 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4174 /// now unnecessary arguments with $noreg.
4175 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4176 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4177 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4178
4179 const unsigned NumDefs = MI.getNumExplicitDefs();
4180 const unsigned ArgOffset = NumDefs + 1;
4181 bool IsTFE = NumDefs == 2;
4182 // We are only processing the operands of d16 image operations on subtargets
4183 // that use the unpacked register layout, or need to repack the TFE result.
4184
4185 // TODO: Do we need to guard against already legalized intrinsics?
4186 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4187 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4188 4189 MachineRegisterInfo *MRI = B.getMRI(); 4190 const LLT S32 = LLT::scalar(32); 4191 const LLT S16 = LLT::scalar(16); 4192 const LLT V2S16 = LLT::vector(2, 16); 4193 4194 unsigned DMask = 0; 4195 4196 // Check for 16 bit addresses and pack if true. 4197 LLT GradTy = 4198 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4199 LLT AddrTy = 4200 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4201 const bool IsG16 = GradTy == S16; 4202 const bool IsA16 = AddrTy == S16; 4203 4204 int DMaskLanes = 0; 4205 if (!BaseOpcode->Atomic) { 4206 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4207 if (BaseOpcode->Gather4) { 4208 DMaskLanes = 4; 4209 } else if (DMask != 0) { 4210 DMaskLanes = countPopulation(DMask); 4211 } else if (!IsTFE && !BaseOpcode->Store) { 4212 // If dmask is 0, this is a no-op load. This can be eliminated. 4213 B.buildUndef(MI.getOperand(0)); 4214 MI.eraseFromParent(); 4215 return true; 4216 } 4217 } 4218 4219 Observer.changingInstr(MI); 4220 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4221 4222 unsigned NewOpcode = NumDefs == 0 ? 4223 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4224 4225 // Track that we legalized this 4226 MI.setDesc(B.getTII().get(NewOpcode)); 4227 4228 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4229 // dmask to be at least 1 otherwise the instruction will fail 4230 if (IsTFE && DMask == 0) { 4231 DMask = 0x1; 4232 DMaskLanes = 1; 4233 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4234 } 4235 4236 if (BaseOpcode->Atomic) { 4237 Register VData0 = MI.getOperand(2).getReg(); 4238 LLT Ty = MRI->getType(VData0); 4239 4240 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4241 if (Ty.isVector()) 4242 return false; 4243 4244 if (BaseOpcode->AtomicX2) { 4245 Register VData1 = MI.getOperand(3).getReg(); 4246 // The two values are packed in one register. 4247 LLT PackedTy = LLT::vector(2, Ty); 4248 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4249 MI.getOperand(2).setReg(Concat.getReg(0)); 4250 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4251 } 4252 } 4253 4254 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4255 4256 // Optimize _L to _LZ when _L is zero 4257 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4258 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4259 const ConstantFP *ConstantLod; 4260 4261 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4262 m_GFCst(ConstantLod))) { 4263 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4264 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4265 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4266 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4267 Intr->Dim); 4268 4269 // The starting indexes should remain in the same place. 
4270 --CorrectedNumVAddrs;
4271
4272 MI.getOperand(MI.getNumExplicitDefs())
4273 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4274 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4275 Intr = NewImageDimIntr;
4276 }
4277 }
4278 }
4279
4280 // Optimize _mip away when 'lod' is zero
4281 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4282 int64_t ConstantLod;
4283 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4284 m_ICst(ConstantLod))) {
4285 if (ConstantLod == 0) {
4286 // TODO: Change intrinsic opcode and remove operand instead of replacing
4287 // it with 0, as the _L to _LZ handling is done above.
4288 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4289 --CorrectedNumVAddrs;
4290 }
4291 }
4292 }
4293
4294 // Rewrite the addressing register layout before doing anything else.
4295 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4296 // 16 bit gradients are supported, but are tied to the A16 control
4297 // so both gradients and addresses must be 16 bit
4298 return false;
4299 }
4300
4301 if (IsA16 && !ST.hasA16()) {
4302 // A16 not supported
4303 return false;
4304 }
4305
4306 if (IsA16 || IsG16) {
4307 if (Intr->NumVAddrs > 1) {
4308 SmallVector<Register, 4> PackedRegs;
4309
4310 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4311 IsG16);
4312
4313 // See also below in the non-a16 branch
4314 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4315
4316 if (!UseNSA && PackedRegs.size() > 1) {
4317 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4318 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4319 PackedRegs[0] = Concat.getReg(0);
4320 PackedRegs.resize(1);
4321 }
4322
4323 const unsigned NumPacked = PackedRegs.size();
4324 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4325 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4326 if (!SrcOp.isReg()) {
4327 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4328 continue;
4329 }
4330
4331 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4332
4333 if (I - Intr->VAddrStart < NumPacked)
4334 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4335 else
4336 SrcOp.setReg(AMDGPU::NoRegister);
4337 }
4338 }
4339 } else {
4340 // If the register allocator cannot place the address registers contiguously
4341 // without introducing moves, then using the non-sequential address encoding
4342 // is always preferable, since it saves VALU instructions and is usually a
4343 // wash in terms of code size or even better.
4344 //
4345 // However, we currently have no way of hinting to the register allocator
4346 // that MIMG addresses should be placed contiguously when it is possible to
4347 // do so, so force non-NSA for the common 2-address case as a heuristic.
4348 //
4349 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4350 // allocation when possible.
4351 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4352
4353 if (!UseNSA && Intr->NumVAddrs > 1)
4354 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4355 Intr->NumVAddrs);
4356 }
4357
4358 int Flags = 0;
4359 if (IsA16)
4360 Flags |= 1;
4361 if (IsG16)
4362 Flags |= 2;
4363 MI.addOperand(MachineOperand::CreateImm(Flags));
4364
4365 if (BaseOpcode->Store) { // No TFE for stores?
4366 // TODO: Handle dmask trim
4367 Register VData = MI.getOperand(1).getReg();
4368 LLT Ty = MRI->getType(VData);
4369 if (!Ty.isVector() || Ty.getElementType() != S16)
4370 return true;
4371
4372 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4373 if (RepackedReg != VData) {
4374 MI.getOperand(1).setReg(RepackedReg);
4375 }
4376
4377 return true;
4378 }
4379
4380 Register DstReg = MI.getOperand(0).getReg();
4381 LLT Ty = MRI->getType(DstReg);
4382 const LLT EltTy = Ty.getScalarType();
4383 const bool IsD16 = Ty.getScalarType() == S16;
4384 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4385
4386 // Confirm that the return type is large enough for the dmask specified
4387 if (NumElts < DMaskLanes)
4388 return false;
4389
4390 if (NumElts > 4 || DMaskLanes > 4)
4391 return false;
4392
4393 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4394 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4395
4396 // The raw dword aligned data component of the load. The only legal cases
4397 // where this matters should be when using the packed D16 format, for
4398 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4399 LLT RoundedTy;
4400
4401 // S32 vector to cover all data, plus TFE result element.
4402 LLT TFETy;
4403
4404 // Register type to use for each loaded component. Will be S32 or V2S16.
4405 LLT RegTy;
4406
4407 if (IsD16 && ST.hasUnpackedD16VMem()) {
4408 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4409 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4410 RegTy = S32;
4411 } else {
4412 unsigned EltSize = EltTy.getSizeInBits();
4413 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4414 unsigned RoundedSize = 32 * RoundedElts;
4415 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4416 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4417 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4418 }
4419
4420 // The return type does not need adjustment.
4421 // TODO: Should we change s16 case to s32 or <2 x s16>?
4422 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4423 return true;
4424
4425 Register Dst1Reg;
4426
4427 // Insert after the instruction.
4428 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4429
4430 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4431 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4432 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4433 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4434
4435 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4436
4437 MI.getOperand(0).setReg(NewResultReg);
4438
4439 // In the IR, TFE is supposed to be used with a 2 element struct return
4440 // type. The instruction really returns these two values in one contiguous
4441 // register, with one additional dword beyond the loaded data. Rewrite the
4442 // return type to use a single register result.
4443
4444 if (IsTFE) {
4445 Dst1Reg = MI.getOperand(1).getReg();
4446 if (MRI->getType(Dst1Reg) != S32)
4447 return false;
4448
4449 // TODO: Make sure the TFE operand bit is set.
4450 MI.RemoveOperand(1);
4451
4452 // Handle the easy case that requires no repack instructions.
4453 if (Ty == S32) {
4454 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4455 return true;
4456 }
4457 }
4458
4459 // Now figure out how to copy the new result register back into the old
4460 // result.
4461 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4462
4463 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4464 4465 if (ResultNumRegs == 1) { 4466 assert(!IsTFE); 4467 ResultRegs[0] = NewResultReg; 4468 } else { 4469 // We have to repack into a new vector of some kind. 4470 for (int I = 0; I != NumDataRegs; ++I) 4471 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4472 B.buildUnmerge(ResultRegs, NewResultReg); 4473 4474 // Drop the final TFE element to get the data part. The TFE result is 4475 // directly written to the right place already. 4476 if (IsTFE) 4477 ResultRegs.resize(NumDataRegs); 4478 } 4479 4480 // For an s16 scalar result, we form an s32 result with a truncate regardless 4481 // of packed vs. unpacked. 4482 if (IsD16 && !Ty.isVector()) { 4483 B.buildTrunc(DstReg, ResultRegs[0]); 4484 return true; 4485 } 4486 4487 // Avoid a build/concat_vector of 1 entry. 4488 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4489 B.buildBitcast(DstReg, ResultRegs[0]); 4490 return true; 4491 } 4492 4493 assert(Ty.isVector()); 4494 4495 if (IsD16) { 4496 // For packed D16 results with TFE enabled, all the data components are 4497 // S32. Cast back to the expected type. 4498 // 4499 // TODO: We don't really need to use load s32 elements. We would only need one 4500 // cast for the TFE result if a multiple of v2s16 was used. 4501 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4502 for (Register &Reg : ResultRegs) 4503 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4504 } else if (ST.hasUnpackedD16VMem()) { 4505 for (Register &Reg : ResultRegs) 4506 Reg = B.buildTrunc(S16, Reg).getReg(0); 4507 } 4508 } 4509 4510 auto padWithUndef = [&](LLT Ty, int NumElts) { 4511 if (NumElts == 0) 4512 return; 4513 Register Undef = B.buildUndef(Ty).getReg(0); 4514 for (int I = 0; I != NumElts; ++I) 4515 ResultRegs.push_back(Undef); 4516 }; 4517 4518 // Pad out any elements eliminated due to the dmask. 4519 LLT ResTy = MRI->getType(ResultRegs[0]); 4520 if (!ResTy.isVector()) { 4521 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4522 B.buildBuildVector(DstReg, ResultRegs); 4523 return true; 4524 } 4525 4526 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4527 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4528 4529 // Deal with the one annoying legal case. 4530 const LLT V3S16 = LLT::vector(3, 16); 4531 if (Ty == V3S16) { 4532 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4533 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4534 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4535 return true; 4536 } 4537 4538 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4539 B.buildConcatVectors(DstReg, ResultRegs); 4540 return true; 4541 } 4542 4543 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4544 LegalizerHelper &Helper, MachineInstr &MI) const { 4545 MachineIRBuilder &B = Helper.MIRBuilder; 4546 GISelChangeObserver &Observer = Helper.Observer; 4547 4548 Register Dst = MI.getOperand(0).getReg(); 4549 LLT Ty = B.getMRI()->getType(Dst); 4550 unsigned Size = Ty.getSizeInBits(); 4551 MachineFunction &MF = B.getMF(); 4552 4553 Observer.changingInstr(MI); 4554 4555 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4556 Ty = getBitcastRegisterType(Ty); 4557 Helper.bitcastDst(MI, Ty, 0); 4558 Dst = MI.getOperand(0).getReg(); 4559 B.setInsertPt(B.getMBB(), MI); 4560 } 4561 4562 // FIXME: We don't really need this intermediate instruction. The intrinsic 4563 // should be fixed to have a memory operand. Since it's readnone, we're not 4564 // allowed to add one. 
4565 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4566 MI.RemoveOperand(1); // Remove intrinsic ID 4567 4568 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4569 // TODO: Should this use datalayout alignment? 4570 const unsigned MemSize = (Size + 7) / 8; 4571 const Align MemAlign(4); 4572 MachineMemOperand *MMO = MF.getMachineMemOperand( 4573 MachinePointerInfo(), 4574 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4575 MachineMemOperand::MOInvariant, 4576 MemSize, MemAlign); 4577 MI.addMemOperand(MF, MMO); 4578 4579 // There are no 96-bit result scalar loads, but widening to 128-bit should 4580 // always be legal. We may need to restore this to a 96-bit result if it turns 4581 // out this needs to be converted to a vector load during RegBankSelect. 4582 if (!isPowerOf2_32(Size)) { 4583 if (Ty.isVector()) 4584 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4585 else 4586 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4587 } 4588 4589 Observer.changedInstr(MI); 4590 return true; 4591 } 4592 4593 // TODO: Move to selection 4594 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4595 MachineRegisterInfo &MRI, 4596 MachineIRBuilder &B) const { 4597 if (!ST.isTrapHandlerEnabled() || 4598 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4599 return legalizeTrapEndpgm(MI, MRI, B); 4600 4601 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4602 switch (*HsaAbiVer) { 4603 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4604 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4605 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4606 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4607 return ST.supportsGetDoorbellID() ? 4608 legalizeTrapHsa(MI, MRI, B) : 4609 legalizeTrapHsaQueuePtr(MI, MRI, B); 4610 } 4611 } 4612 4613 llvm_unreachable("Unknown trap handler"); 4614 } 4615 4616 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4617 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4618 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4619 MI.eraseFromParent(); 4620 return true; 4621 } 4622 4623 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4624 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4625 // Pass queue pointer to trap handler as input, and insert trap instruction 4626 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4627 Register LiveIn = 4628 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4629 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4630 return false; 4631 4632 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4633 B.buildCopy(SGPR01, LiveIn); 4634 B.buildInstr(AMDGPU::S_TRAP) 4635 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4636 .addReg(SGPR01, RegState::Implicit); 4637 4638 MI.eraseFromParent(); 4639 return true; 4640 } 4641 4642 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4643 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4644 B.buildInstr(AMDGPU::S_TRAP) 4645 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4646 MI.eraseFromParent(); 4647 return true; 4648 } 4649 4650 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4652 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4653 // accordingly 4654 if (!ST.isTrapHandlerEnabled() || 4655 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4656 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4657 "debugtrap handler not supported", 4658 MI.getDebugLoc(), DS_Warning); 4659 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4660 Ctx.diagnose(NoTrap); 4661 } else { 4662 // Insert debug-trap instruction 4663 B.buildInstr(AMDGPU::S_TRAP) 4664 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4665 } 4666 4667 MI.eraseFromParent(); 4668 return true; 4669 } 4670 4671 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4672 MachineIRBuilder &B) const { 4673 MachineRegisterInfo &MRI = *B.getMRI(); 4674 const LLT S16 = LLT::scalar(16); 4675 const LLT S32 = LLT::scalar(32); 4676 4677 Register DstReg = MI.getOperand(0).getReg(); 4678 Register NodePtr = MI.getOperand(2).getReg(); 4679 Register RayExtent = MI.getOperand(3).getReg(); 4680 Register RayOrigin = MI.getOperand(4).getReg(); 4681 Register RayDir = MI.getOperand(5).getReg(); 4682 Register RayInvDir = MI.getOperand(6).getReg(); 4683 Register TDescr = MI.getOperand(7).getReg(); 4684 4685 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4686 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4687 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4688 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4689 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4690 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4691 4692 SmallVector<Register, 12> Ops; 4693 if (Is64) { 4694 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4695 Ops.push_back(Unmerge.getReg(0)); 4696 Ops.push_back(Unmerge.getReg(1)); 4697 } else { 4698 Ops.push_back(NodePtr); 4699 } 4700 Ops.push_back(RayExtent); 4701 4702 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4703 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4704 Ops.push_back(Unmerge.getReg(0)); 4705 Ops.push_back(Unmerge.getReg(1)); 4706 Ops.push_back(Unmerge.getReg(2)); 4707 }; 4708 4709 packLanes(RayOrigin); 4710 if (IsA16) { 4711 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4712 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4713 Register R1 = MRI.createGenericVirtualRegister(S32); 4714 Register R2 = MRI.createGenericVirtualRegister(S32); 4715 Register R3 = MRI.createGenericVirtualRegister(S32); 4716 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4717 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4718 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4719 Ops.push_back(R1); 4720 Ops.push_back(R2); 4721 Ops.push_back(R3); 4722 } else { 4723 packLanes(RayDir); 4724 packLanes(RayInvDir); 4725 } 4726 4727 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4728 .addDef(DstReg) 4729 .addImm(Opcode); 4730 4731 for (Register R : Ops) { 4732 MIB.addUse(R); 4733 } 4734 4735 MIB.addUse(TDescr) 4736 .addImm(IsA16 ? 1 : 0) 4737 .cloneMemRefs(MI); 4738 4739 MI.eraseFromParent(); 4740 return true; 4741 } 4742 4743 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4744 MachineInstr &MI) const { 4745 MachineIRBuilder &B = Helper.MIRBuilder; 4746 MachineRegisterInfo &MRI = *B.getMRI(); 4747 4748 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
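// verifyCFIntrinsic locates the G_BRCOND that uses this intrinsic's result, along with any trailing unconditional G_BR, so the branches can be rewired onto the SI_IF/SI_ELSE/SI_LOOP pseudos below.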
4749 auto IntrID = MI.getIntrinsicID(); 4750 switch (IntrID) { 4751 case Intrinsic::amdgcn_if: 4752 case Intrinsic::amdgcn_else: { 4753 MachineInstr *Br = nullptr; 4754 MachineBasicBlock *UncondBrTarget = nullptr; 4755 bool Negated = false; 4756 if (MachineInstr *BrCond = 4757 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4758 const SIRegisterInfo *TRI 4759 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4760 4761 Register Def = MI.getOperand(1).getReg(); 4762 Register Use = MI.getOperand(3).getReg(); 4763 4764 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4765 4766 if (Negated) 4767 std::swap(CondBrTarget, UncondBrTarget); 4768 4769 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4770 if (IntrID == Intrinsic::amdgcn_if) { 4771 B.buildInstr(AMDGPU::SI_IF) 4772 .addDef(Def) 4773 .addUse(Use) 4774 .addMBB(UncondBrTarget); 4775 } else { 4776 B.buildInstr(AMDGPU::SI_ELSE) 4777 .addDef(Def) 4778 .addUse(Use) 4779 .addMBB(UncondBrTarget); 4780 } 4781 4782 if (Br) { 4783 Br->getOperand(0).setMBB(CondBrTarget); 4784 } else { 4785 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4786 // since we're swapping branch targets it needs to be reinserted. 4787 // FIXME: IRTranslator should probably not do this 4788 B.buildBr(*CondBrTarget); 4789 } 4790 4791 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4792 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4793 MI.eraseFromParent(); 4794 BrCond->eraseFromParent(); 4795 return true; 4796 } 4797 4798 return false; 4799 } 4800 case Intrinsic::amdgcn_loop: { 4801 MachineInstr *Br = nullptr; 4802 MachineBasicBlock *UncondBrTarget = nullptr; 4803 bool Negated = false; 4804 if (MachineInstr *BrCond = 4805 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4806 const SIRegisterInfo *TRI 4807 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4808 4809 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4810 Register Reg = MI.getOperand(2).getReg(); 4811 4812 if (Negated) 4813 std::swap(CondBrTarget, UncondBrTarget); 4814 4815 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4816 B.buildInstr(AMDGPU::SI_LOOP) 4817 .addUse(Reg) 4818 .addMBB(UncondBrTarget); 4819 4820 if (Br) 4821 Br->getOperand(0).setMBB(CondBrTarget); 4822 else 4823 B.buildBr(*CondBrTarget); 4824 4825 MI.eraseFromParent(); 4826 BrCond->eraseFromParent(); 4827 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4828 return true; 4829 } 4830 4831 return false; 4832 } 4833 case Intrinsic::amdgcn_kernarg_segment_ptr: 4834 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4835 // This only makes sense to call in a kernel, so just lower to null. 
4836 B.buildConstant(MI.getOperand(0).getReg(), 0); 4837 MI.eraseFromParent(); 4838 return true; 4839 } 4840 4841 return legalizePreloadedArgIntrin( 4842 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4843 case Intrinsic::amdgcn_implicitarg_ptr: 4844 return legalizeImplicitArgPtr(MI, MRI, B); 4845 case Intrinsic::amdgcn_workitem_id_x: 4846 return legalizePreloadedArgIntrin(MI, MRI, B, 4847 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4848 case Intrinsic::amdgcn_workitem_id_y: 4849 return legalizePreloadedArgIntrin(MI, MRI, B, 4850 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4851 case Intrinsic::amdgcn_workitem_id_z: 4852 return legalizePreloadedArgIntrin(MI, MRI, B, 4853 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4854 case Intrinsic::amdgcn_workgroup_id_x: 4855 return legalizePreloadedArgIntrin(MI, MRI, B, 4856 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4857 case Intrinsic::amdgcn_workgroup_id_y: 4858 return legalizePreloadedArgIntrin(MI, MRI, B, 4859 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4860 case Intrinsic::amdgcn_workgroup_id_z: 4861 return legalizePreloadedArgIntrin(MI, MRI, B, 4862 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4863 case Intrinsic::amdgcn_dispatch_ptr: 4864 return legalizePreloadedArgIntrin(MI, MRI, B, 4865 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4866 case Intrinsic::amdgcn_queue_ptr: 4867 return legalizePreloadedArgIntrin(MI, MRI, B, 4868 AMDGPUFunctionArgInfo::QUEUE_PTR); 4869 case Intrinsic::amdgcn_implicit_buffer_ptr: 4870 return legalizePreloadedArgIntrin( 4871 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4872 case Intrinsic::amdgcn_dispatch_id: 4873 return legalizePreloadedArgIntrin(MI, MRI, B, 4874 AMDGPUFunctionArgInfo::DISPATCH_ID); 4875 case Intrinsic::amdgcn_fdiv_fast: 4876 return legalizeFDIVFastIntrin(MI, MRI, B); 4877 case Intrinsic::amdgcn_is_shared: 4878 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4879 case Intrinsic::amdgcn_is_private: 4880 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4881 case Intrinsic::amdgcn_wavefrontsize: { 4882 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4883 MI.eraseFromParent(); 4884 return true; 4885 } 4886 case Intrinsic::amdgcn_s_buffer_load: 4887 return legalizeSBufferLoad(Helper, MI); 4888 case Intrinsic::amdgcn_raw_buffer_store: 4889 case Intrinsic::amdgcn_struct_buffer_store: 4890 return legalizeBufferStore(MI, MRI, B, false, false); 4891 case Intrinsic::amdgcn_raw_buffer_store_format: 4892 case Intrinsic::amdgcn_struct_buffer_store_format: 4893 return legalizeBufferStore(MI, MRI, B, false, true); 4894 case Intrinsic::amdgcn_raw_tbuffer_store: 4895 case Intrinsic::amdgcn_struct_tbuffer_store: 4896 return legalizeBufferStore(MI, MRI, B, true, true); 4897 case Intrinsic::amdgcn_raw_buffer_load: 4898 case Intrinsic::amdgcn_struct_buffer_load: 4899 return legalizeBufferLoad(MI, MRI, B, false, false); 4900 case Intrinsic::amdgcn_raw_buffer_load_format: 4901 case Intrinsic::amdgcn_struct_buffer_load_format: 4902 return legalizeBufferLoad(MI, MRI, B, true, false); 4903 case Intrinsic::amdgcn_raw_tbuffer_load: 4904 case Intrinsic::amdgcn_struct_tbuffer_load: 4905 return legalizeBufferLoad(MI, MRI, B, true, true); 4906 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4907 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4908 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4909 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4910 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4911 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4912 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4913 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4914 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4915 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4916 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4917 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4918 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4919 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4920 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4921 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4922 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4923 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4924 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4925 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4926 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4927 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4928 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4929 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4930 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4931 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4932 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4933 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4934 case Intrinsic::amdgcn_buffer_atomic_fadd: 4935 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4936 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4937 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4938 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4939 return legalizeBufferAtomic(MI, B, IntrID); 4940 case Intrinsic::amdgcn_atomic_inc: 4941 return legalizeAtomicIncDec(MI, B, true); 4942 case Intrinsic::amdgcn_atomic_dec: 4943 return legalizeAtomicIncDec(MI, B, false); 4944 case Intrinsic::trap: 4945 return legalizeTrapIntrinsic(MI, MRI, B); 4946 case Intrinsic::debugtrap: 4947 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4948 case Intrinsic::amdgcn_rsq_clamp: 4949 return legalizeRsqClampIntrinsic(MI, MRI, B); 4950 case Intrinsic::amdgcn_ds_fadd: 4951 case Intrinsic::amdgcn_ds_fmin: 4952 case Intrinsic::amdgcn_ds_fmax: 4953 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4954 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4955 return legalizeBVHIntrinsic(MI, B); 4956 default: { 4957 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4958 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4959 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4960 return true; 4961 } 4962 } 4963 4964 return true; 4965 } 4966