//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
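/// Illustrative examples based on the predicate below: <3 x s16> (48 bits,
/// odd element count) matches and gets one more element, becoming <4 x s16>;
/// <3 x s32> does not match because its element size is not below 32 bits;
/// <3 x s1> is rejected since s1 vectors are scalarized instead.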
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
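/// For example (illustrative, per the checks in the function below): a 96-bit
/// load known to be 128-bit aligned may be widened to a 128-bit load on
/// subtargets without dwordx3 load/stores, since the bytes up to the
/// alignment are known dereferenceable.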
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
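      // (Illustrative: a 160-bit access is 5 dwords, which is not a power of
      // two, so it is reported as needing a split here.)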
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
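      // (Illustrative: without dwordx3 load/stores, a 96-bit load known to be
      // 128-bit aligned is widened here to a 128-bit load rather than split.)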
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
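                // (e.g. for a <3 x s32> value, PowerOf2Floor(96) = 64, so the
                // first piece becomes <2 x s32> and the rest is handled by
                // further legalization.)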
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
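  // (For instance, a 32-bit shift only reads the low 5 bits of its amount, so
  // an s16 amount would be sufficient in principle.)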
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1694 case TargetOpcode::G_BUILD_VECTOR: 1695 return legalizeBuildVector(MI, MRI, B); 1696 default: 1697 return false; 1698 } 1699 1700 llvm_unreachable("expected switch to return"); 1701 } 1702 1703 Register AMDGPULegalizerInfo::getSegmentAperture( 1704 unsigned AS, 1705 MachineRegisterInfo &MRI, 1706 MachineIRBuilder &B) const { 1707 MachineFunction &MF = B.getMF(); 1708 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1709 const LLT S32 = LLT::scalar(32); 1710 1711 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1712 1713 if (ST.hasApertureRegs()) { 1714 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1715 // getreg. 1716 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1717 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1718 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1719 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1721 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1722 unsigned Encoding = 1723 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1724 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1725 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1726 1727 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1728 1729 B.buildInstr(AMDGPU::S_GETREG_B32) 1730 .addDef(GetReg) 1731 .addImm(Encoding); 1732 MRI.setType(GetReg, S32); 1733 1734 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1735 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1736 } 1737 1738 Register QueuePtr = MRI.createGenericVirtualRegister( 1739 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1740 1741 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1742 return Register(); 1743 1744 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1745 // private_segment_aperture_base_hi. 1746 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1747 1748 // TODO: can we be smarter about machine pointer info? 1749 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1750 MachineMemOperand *MMO = MF.getMachineMemOperand( 1751 PtrInfo, 1752 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1753 MachineMemOperand::MOInvariant, 1754 4, commonAlignment(Align(64), StructOffset)); 1755 1756 Register LoadAddr; 1757 1758 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1759 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1760 } 1761 1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1763 MachineInstr &MI, MachineRegisterInfo &MRI, 1764 MachineIRBuilder &B) const { 1765 MachineFunction &MF = B.getMF(); 1766 1767 const LLT S32 = LLT::scalar(32); 1768 Register Dst = MI.getOperand(0).getReg(); 1769 Register Src = MI.getOperand(1).getReg(); 1770 1771 LLT DstTy = MRI.getType(Dst); 1772 LLT SrcTy = MRI.getType(Src); 1773 unsigned DestAS = DstTy.getAddressSpace(); 1774 unsigned SrcAS = SrcTy.getAddressSpace(); 1775 1776 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1777 // vector element. 1778 assert(!DstTy.isVector()); 1779 1780 const AMDGPUTargetMachine &TM 1781 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1782 1783 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1784 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1785 return true; 1786 } 1787 1788 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1789 // Truncate. 
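    // A sketch of the intended lowering (illustrative), e.g. for a p4 -> p6
    // cast:
    //   %dst:_(p6) = G_EXTRACT %src:_(p4), 0
    // i.e. keep only the low 32 bits of the 64-bit source pointer.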
1790 B.buildExtract(Dst, Src, 0); 1791 MI.eraseFromParent(); 1792 return true; 1793 } 1794 1795 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1796 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1797 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1798 1799 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1800 // another. Merge operands are required to be the same type, but creating an 1801 // extra ptrtoint would be kind of pointless. 1802 auto HighAddr = B.buildConstant( 1803 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1804 B.buildMerge(Dst, {Src, HighAddr}); 1805 MI.eraseFromParent(); 1806 return true; 1807 } 1808 1809 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1810 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1811 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1812 unsigned NullVal = TM.getNullPointerValue(DestAS); 1813 1814 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1815 auto FlatNull = B.buildConstant(SrcTy, 0); 1816 1817 // Extract low 32-bits of the pointer. 1818 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1819 1820 auto CmpRes = 1821 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1822 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1823 1824 MI.eraseFromParent(); 1825 return true; 1826 } 1827 1828 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1829 return false; 1830 1831 if (!ST.hasFlatAddressSpace()) 1832 return false; 1833 1834 auto SegmentNull = 1835 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1836 auto FlatNull = 1837 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1838 1839 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1840 if (!ApertureReg.isValid()) 1841 return false; 1842 1843 auto CmpRes = 1844 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1845 1846 // Coerce the type of the low half of the result so we can use merge_values. 1847 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1848 1849 // TODO: Should we allow mismatched types but matching sizes in merges to 1850 // avoid the ptrtoint? 1851 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1852 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1853 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 bool AMDGPULegalizerInfo::legalizeFrint( 1859 MachineInstr &MI, MachineRegisterInfo &MRI, 1860 MachineIRBuilder &B) const { 1861 Register Src = MI.getOperand(1).getReg(); 1862 LLT Ty = MRI.getType(Src); 1863 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1864 1865 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1866 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1867 1868 auto C1 = B.buildFConstant(Ty, C1Val); 1869 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1870 1871 // TODO: Should this propagate fast-math-flags? 
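  // Why this works (a sketch, not authoritative): C1 = 2^52 is the smallest
  // f64 magnitude whose ULP is 1.0, so for |src| < 2^52 the add/sub pair
  // (src + copysign(C1, src)) - copysign(C1, src) rounds src to an integer in
  // the current rounding mode. C2 = 0x1.fffffffffffffp+51 is the largest f64
  // that can still have a fractional part, so the final select returns src
  // unchanged once |src| > C2.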
1872 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1873 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1874 1875 auto C2 = B.buildFConstant(Ty, C2Val); 1876 auto Fabs = B.buildFAbs(Ty, Src); 1877 1878 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1879 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1880 MI.eraseFromParent(); 1881 return true; 1882 } 1883 1884 bool AMDGPULegalizerInfo::legalizeFceil( 1885 MachineInstr &MI, MachineRegisterInfo &MRI, 1886 MachineIRBuilder &B) const { 1887 1888 const LLT S1 = LLT::scalar(1); 1889 const LLT S64 = LLT::scalar(64); 1890 1891 Register Src = MI.getOperand(1).getReg(); 1892 assert(MRI.getType(Src) == S64); 1893 1894 // result = trunc(src) 1895 // if (src > 0.0 && src != result) 1896 // result += 1.0 1897 1898 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1899 1900 const auto Zero = B.buildFConstant(S64, 0.0); 1901 const auto One = B.buildFConstant(S64, 1.0); 1902 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1903 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1904 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1905 auto Add = B.buildSelect(S64, And, One, Zero); 1906 1907 // TODO: Should this propagate fast-math-flags? 1908 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1909 return true; 1910 } 1911 1912 bool AMDGPULegalizerInfo::legalizeFrem( 1913 MachineInstr &MI, MachineRegisterInfo &MRI, 1914 MachineIRBuilder &B) const { 1915 Register DstReg = MI.getOperand(0).getReg(); 1916 Register Src0Reg = MI.getOperand(1).getReg(); 1917 Register Src1Reg = MI.getOperand(2).getReg(); 1918 auto Flags = MI.getFlags(); 1919 LLT Ty = MRI.getType(DstReg); 1920 1921 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1922 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1923 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1924 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1925 MI.eraseFromParent(); 1926 return true; 1927 } 1928 1929 static MachineInstrBuilder extractF64Exponent(Register Hi, 1930 MachineIRBuilder &B) { 1931 const unsigned FractBits = 52; 1932 const unsigned ExpBits = 11; 1933 LLT S32 = LLT::scalar(32); 1934 1935 auto Const0 = B.buildConstant(S32, FractBits - 32); 1936 auto Const1 = B.buildConstant(S32, ExpBits); 1937 1938 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1939 .addUse(Hi) 1940 .addUse(Const0.getReg(0)) 1941 .addUse(Const1.getReg(0)); 1942 1943 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 const LLT S1 = LLT::scalar(1); 1950 const LLT S32 = LLT::scalar(32); 1951 const LLT S64 = LLT::scalar(64); 1952 1953 Register Src = MI.getOperand(1).getReg(); 1954 assert(MRI.getType(Src) == S64); 1955 1956 // TODO: Should this use extract since the low half is unused? 1957 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1958 Register Hi = Unmerge.getReg(1); 1959 1960 // Extract the upper half, since this is where we will find the sign and 1961 // exponent. 1962 auto Exp = extractF64Exponent(Hi, B); 1963 1964 const unsigned FractBits = 52; 1965 1966 // Extract the sign bit. 1967 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1968 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1969 1970 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1971 1972 const auto Zero32 = B.buildConstant(S32, 0); 1973 1974 // Extend back to 64-bits. 
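  // (Illustrative: G_MERGE_VALUES takes the low half first, so merging
  // {0, SignBit} rebuilds the 64-bit pattern of +/-0.0. It is selected below
  // when Exp < 0, i.e. when |Src| < 1.0 and the truncated value is a signed
  // zero.)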
1975 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1976 1977 auto Shr = B.buildAShr(S64, FractMask, Exp); 1978 auto Not = B.buildNot(S64, Shr); 1979 auto Tmp0 = B.buildAnd(S64, Src, Not); 1980 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1981 1982 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1983 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1984 1985 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1986 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1987 MI.eraseFromParent(); 1988 return true; 1989 } 1990 1991 bool AMDGPULegalizerInfo::legalizeITOFP( 1992 MachineInstr &MI, MachineRegisterInfo &MRI, 1993 MachineIRBuilder &B, bool Signed) const { 1994 1995 Register Dst = MI.getOperand(0).getReg(); 1996 Register Src = MI.getOperand(1).getReg(); 1997 1998 const LLT S64 = LLT::scalar(64); 1999 const LLT S32 = LLT::scalar(32); 2000 2001 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2002 2003 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2004 2005 auto CvtHi = Signed ? 2006 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2007 B.buildUITOFP(S64, Unmerge.getReg(1)); 2008 2009 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2010 2011 auto ThirtyTwo = B.buildConstant(S32, 32); 2012 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2013 .addUse(CvtHi.getReg(0)) 2014 .addUse(ThirtyTwo.getReg(0)); 2015 2016 // TODO: Should this propagate fast-math-flags? 2017 B.buildFAdd(Dst, LdExp, CvtLo); 2018 MI.eraseFromParent(); 2019 return true; 2020 } 2021 2022 // TODO: Copied from DAG implementation. Verify logic and document how this 2023 // actually works. 2024 bool AMDGPULegalizerInfo::legalizeFPTOI( 2025 MachineInstr &MI, MachineRegisterInfo &MRI, 2026 MachineIRBuilder &B, bool Signed) const { 2027 2028 Register Dst = MI.getOperand(0).getReg(); 2029 Register Src = MI.getOperand(1).getReg(); 2030 2031 const LLT S64 = LLT::scalar(64); 2032 const LLT S32 = LLT::scalar(32); 2033 2034 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2035 2036 unsigned Flags = MI.getFlags(); 2037 2038 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2039 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2040 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2041 2042 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2043 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2044 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2045 2046 auto Hi = Signed ? 
2047 B.buildFPTOSI(S32, FloorMul) : 2048 B.buildFPTOUI(S32, FloorMul); 2049 auto Lo = B.buildFPTOUI(S32, Fma); 2050 2051 B.buildMerge(Dst, { Lo, Hi }); 2052 MI.eraseFromParent(); 2053 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2058 MachineInstr &MI) const { 2059 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2060 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2061 2062 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2063 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2064 2065 // With ieee_mode disabled, the instructions have the correct behavior 2066 // already for G_FMINNUM/G_FMAXNUM 2067 if (!MFI->getMode().IEEE) 2068 return !IsIEEEOp; 2069 2070 if (IsIEEEOp) 2071 return true; 2072 2073 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2074 } 2075 2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2077 MachineInstr &MI, MachineRegisterInfo &MRI, 2078 MachineIRBuilder &B) const { 2079 // TODO: Should move some of this into LegalizerHelper. 2080 2081 // TODO: Promote dynamic indexing of s16 to s32 2082 2083 // FIXME: Artifact combiner probably should have replaced the truncated 2084 // constant before this, so we shouldn't need 2085 // getConstantVRegValWithLookThrough. 2086 Optional<ValueAndVReg> MaybeIdxVal = 2087 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2088 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2089 return true; 2090 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2091 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Vec = MI.getOperand(1).getReg(); 2094 2095 LLT VecTy = MRI.getType(Vec); 2096 LLT EltTy = VecTy.getElementType(); 2097 assert(EltTy == MRI.getType(Dst)); 2098 2099 if (IdxVal < VecTy.getNumElements()) 2100 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2101 else 2102 B.buildUndef(Dst); 2103 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2109 MachineInstr &MI, MachineRegisterInfo &MRI, 2110 MachineIRBuilder &B) const { 2111 // TODO: Should move some of this into LegalizerHelper. 2112 2113 // TODO: Promote dynamic indexing of s16 to s32 2114 2115 // FIXME: Artifact combiner probably should have replaced the truncated 2116 // constant before this, so we shouldn't need 2117 // getConstantVRegValWithLookThrough. 2118 Optional<ValueAndVReg> MaybeIdxVal = 2119 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2120 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2121 return true; 2122 2123 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2124 Register Dst = MI.getOperand(0).getReg(); 2125 Register Vec = MI.getOperand(1).getReg(); 2126 Register Ins = MI.getOperand(2).getReg(); 2127 2128 LLT VecTy = MRI.getType(Vec); 2129 LLT EltTy = VecTy.getElementType(); 2130 assert(EltTy == MRI.getType(Ins)); 2131 2132 if (IdxVal < VecTy.getNumElements()) 2133 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2134 else 2135 B.buildUndef(Dst); 2136 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2142 MachineInstr &MI, MachineRegisterInfo &MRI, 2143 MachineIRBuilder &B) const { 2144 const LLT V2S16 = LLT::vector(2, 16); 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src0 = MI.getOperand(1).getReg(); 2148 LLT DstTy = MRI.getType(Dst); 2149 LLT SrcTy = MRI.getType(Src0); 2150 2151 if (SrcTy == V2S16 && DstTy == V2S16 && 2152 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2153 return true; 2154 2155 MachineIRBuilder HelperBuilder(MI); 2156 GISelObserverWrapper DummyObserver; 2157 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2158 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2159 } 2160 2161 bool AMDGPULegalizerInfo::legalizeSinCos( 2162 MachineInstr &MI, MachineRegisterInfo &MRI, 2163 MachineIRBuilder &B) const { 2164 2165 Register DstReg = MI.getOperand(0).getReg(); 2166 Register SrcReg = MI.getOperand(1).getReg(); 2167 LLT Ty = MRI.getType(DstReg); 2168 unsigned Flags = MI.getFlags(); 2169 2170 Register TrigVal; 2171 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2172 if (ST.hasTrigReducedRange()) { 2173 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2174 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2175 .addUse(MulVal.getReg(0)) 2176 .setMIFlags(Flags).getReg(0); 2177 } else 2178 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2179 2180 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2181 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2182 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2183 .addUse(TrigVal) 2184 .setMIFlags(Flags); 2185 MI.eraseFromParent(); 2186 return true; 2187 } 2188 2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2190 MachineIRBuilder &B, 2191 const GlobalValue *GV, 2192 int64_t Offset, 2193 unsigned GAFlags) const { 2194 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2195 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2196 // to the following code sequence: 2197 // 2198 // For constant address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol 2201 // s_addc_u32 s1, s1, 0 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // a fixup or relocation is emitted to replace $symbol with a literal 2205 // constant, which is a pc-relative offset from the encoding of the $symbol 2206 // operand to the global variable. 
2207 // 2208 // For global address space: 2209 // s_getpc_b64 s[0:1] 2210 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2211 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2212 // 2213 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2214 // fixups or relocations are emitted to replace $symbol@*@lo and 2215 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2216 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2217 // operand to the global variable. 2218 // 2219 // What we want here is an offset from the value returned by s_getpc 2220 // (which is the address of the s_add_u32 instruction) to the global 2221 // variable, but since the encoding of $symbol starts 4 bytes after the start 2222 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2223 // small. This requires us to add 4 to the global variable offset in order to 2224 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2225 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2226 // instruction. 2227 2228 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2229 2230 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2232 2233 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2234 .addDef(PCReg); 2235 2236 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2237 if (GAFlags == SIInstrInfo::MO_NONE) 2238 MIB.addImm(0); 2239 else 2240 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2241 2242 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2243 2244 if (PtrTy.getSizeInBits() == 32) 2245 B.buildExtract(DstReg, PCReg, 0); 2246 return true; 2247 } 2248 2249 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2250 MachineInstr &MI, MachineRegisterInfo &MRI, 2251 MachineIRBuilder &B) const { 2252 Register DstReg = MI.getOperand(0).getReg(); 2253 LLT Ty = MRI.getType(DstReg); 2254 unsigned AS = Ty.getAddressSpace(); 2255 2256 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2257 MachineFunction &MF = B.getMF(); 2258 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2259 2260 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2261 if (!MFI->isModuleEntryFunction()) { 2262 const Function &Fn = MF.getFunction(); 2263 DiagnosticInfoUnsupported BadLDSDecl( 2264 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2265 DS_Warning); 2266 Fn.getContext().diagnose(BadLDSDecl); 2267 2268 // We currently don't have a way to correctly allocate LDS objects that 2269 // aren't directly associated with a kernel. We do force inlining of 2270 // functions that use local objects. However, if these dead functions are 2271 // not eliminated, we don't want a compile time error. Just emit a warning 2272 // and a trap, since there should be no callable path here. 2273 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2274 B.buildUndef(DstReg); 2275 MI.eraseFromParent(); 2276 return true; 2277 } 2278 2279 // TODO: We could emit code to handle the initialization somewhere. 
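    // Illustrative summary of the two LDS paths handled below (a sketch):
    //  * an ordinary LDS global becomes a constant pointer whose value is the
    //    offset returned by MFI->allocateLDSGlobal();
    //  * a zero-sized (dynamic, `extern __shared__`) LDS global becomes
    //    llvm.amdgcn.groupstaticsize cast to an LDS pointer, i.e. it starts
    //    right after all statically allocated LDS.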
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
        // zero-sized type in other languages) to declare dynamic shared
        // memory whose size is not known at compile time. It is allocated by
        // the runtime and placed directly after the statically allocated
        // LDS objects. All such declarations share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
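    // Sketch of the 32-bit case (illustrative):
    //   %addr:_(p4) = G_LOAD %gotaddr      ; 64-bit address from the GOT entry
    //   %dst        = G_EXTRACT %addr, 0   ; keep the low 32 bits
    // The 64-bit case below loads straight into the destination instead.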
2347 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2348 B.buildExtract(DstReg, Load, 0); 2349 } else 2350 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2351 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 static LLT widenToNextPowerOf2(LLT Ty) { 2357 if (Ty.isVector()) 2358 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2359 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2360 } 2361 2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2363 MachineInstr &MI) const { 2364 MachineIRBuilder &B = Helper.MIRBuilder; 2365 MachineRegisterInfo &MRI = *B.getMRI(); 2366 GISelChangeObserver &Observer = Helper.Observer; 2367 2368 Register PtrReg = MI.getOperand(1).getReg(); 2369 LLT PtrTy = MRI.getType(PtrReg); 2370 unsigned AddrSpace = PtrTy.getAddressSpace(); 2371 2372 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2373 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2374 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2375 Observer.changingInstr(MI); 2376 MI.getOperand(1).setReg(Cast.getReg(0)); 2377 Observer.changedInstr(MI); 2378 return true; 2379 } 2380 2381 Register ValReg = MI.getOperand(0).getReg(); 2382 LLT ValTy = MRI.getType(ValReg); 2383 2384 MachineMemOperand *MMO = *MI.memoperands_begin(); 2385 const unsigned ValSize = ValTy.getSizeInBits(); 2386 const unsigned MemSize = 8 * MMO->getSize(); 2387 const Align MemAlign = MMO->getAlign(); 2388 const unsigned AlignInBits = 8 * MemAlign.value(); 2389 2390 // Widen non-power-of-2 loads to the alignment if needed 2391 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2392 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2393 2394 // This was already the correct extending load result type, so just adjust 2395 // the memory type. 2396 if (WideMemSize == ValSize) { 2397 MachineFunction &MF = B.getMF(); 2398 2399 MachineMemOperand *WideMMO = 2400 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2401 Observer.changingInstr(MI); 2402 MI.setMemRefs(MF, {WideMMO}); 2403 Observer.changedInstr(MI); 2404 return true; 2405 } 2406 2407 // Don't bother handling edge case that should probably never be produced. 2408 if (ValSize > WideMemSize) 2409 return false; 2410 2411 LLT WideTy = widenToNextPowerOf2(ValTy); 2412 2413 Register WideLoad; 2414 if (!WideTy.isVector()) { 2415 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2416 B.buildTrunc(ValReg, WideLoad).getReg(0); 2417 } else { 2418 // Extract the subvector. 2419 2420 if (isRegisterType(ValTy)) { 2421 // If this a case where G_EXTRACT is legal, use it. 2422 // (e.g. <3 x s32> -> <4 x s32>) 2423 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2424 B.buildExtract(ValReg, WideLoad, 0); 2425 } else { 2426 // For cases where the widened type isn't a nice register value, unmerge 2427 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2428 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2429 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2430 B.setInsertPt(B.getMBB(), MI.getIterator()); 2431 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2432 } 2433 } 2434 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 return false; 2440 } 2441 2442 bool AMDGPULegalizerInfo::legalizeFMad( 2443 MachineInstr &MI, MachineRegisterInfo &MRI, 2444 MachineIRBuilder &B) const { 2445 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2446 assert(Ty.isScalar()); 2447 2448 MachineFunction &MF = B.getMF(); 2449 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2450 2451 // TODO: Always legal with future ftz flag. 2452 // FIXME: Do we need just output? 2453 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2454 return true; 2455 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2456 return true; 2457 2458 MachineIRBuilder HelperBuilder(MI); 2459 GISelObserverWrapper DummyObserver; 2460 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2461 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2462 } 2463 2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2465 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2466 Register DstReg = MI.getOperand(0).getReg(); 2467 Register PtrReg = MI.getOperand(1).getReg(); 2468 Register CmpVal = MI.getOperand(2).getReg(); 2469 Register NewVal = MI.getOperand(3).getReg(); 2470 2471 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2472 "this should not have been custom lowered"); 2473 2474 LLT ValTy = MRI.getType(CmpVal); 2475 LLT VecTy = LLT::vector(2, ValTy); 2476 2477 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2478 2479 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2480 .addDef(DstReg) 2481 .addUse(PtrReg) 2482 .addUse(PackedVal) 2483 .setMemRefs(MI.memoperands()); 2484 2485 MI.eraseFromParent(); 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizeFlog( 2490 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2491 Register Dst = MI.getOperand(0).getReg(); 2492 Register Src = MI.getOperand(1).getReg(); 2493 LLT Ty = B.getMRI()->getType(Dst); 2494 unsigned Flags = MI.getFlags(); 2495 2496 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2497 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2498 2499 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2505 MachineIRBuilder &B) const { 2506 Register Dst = MI.getOperand(0).getReg(); 2507 Register Src = MI.getOperand(1).getReg(); 2508 unsigned Flags = MI.getFlags(); 2509 LLT Ty = B.getMRI()->getType(Dst); 2510 2511 auto K = B.buildFConstant(Ty, numbers::log2e); 2512 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2513 B.buildFExp2(Dst, Mul, Flags); 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2519 MachineIRBuilder &B) const { 2520 Register Dst = MI.getOperand(0).getReg(); 2521 Register Src0 = MI.getOperand(1).getReg(); 2522 Register Src1 = MI.getOperand(2).getReg(); 2523 unsigned Flags = MI.getFlags(); 2524 LLT Ty = B.getMRI()->getType(Dst); 2525 const LLT S16 = LLT::scalar(16); 2526 const LLT S32 = LLT::scalar(32); 2527 2528 if (Ty == S32) { 2529 auto Log = B.buildFLog2(S32, Src0, Flags); 2530 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2531 .addUse(Log.getReg(0)) 2532 .addUse(Src1) 2533 .setMIFlags(Flags); 2534 B.buildFExp2(Dst, Mul, Flags); 2535 } else if (Ty == S16) { 2536 // There's no f16 fmul_legacy, so we need to convert for it. 2537 auto Log = B.buildFLog2(S16, Src0, Flags); 2538 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2539 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2540 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2541 .addUse(Ext0.getReg(0)) 2542 .addUse(Ext1.getReg(0)) 2543 .setMIFlags(Flags); 2544 2545 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2546 } else 2547 return false; 2548 2549 MI.eraseFromParent(); 2550 return true; 2551 } 2552 2553 // Find a source register, ignoring any possible source modifiers. 2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2555 Register ModSrc = OrigSrc; 2556 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2557 ModSrc = SrcFNeg->getOperand(1).getReg(); 2558 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2559 ModSrc = SrcFAbs->getOperand(1).getReg(); 2560 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2561 ModSrc = SrcFAbs->getOperand(1).getReg(); 2562 return ModSrc; 2563 } 2564 2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2566 MachineRegisterInfo &MRI, 2567 MachineIRBuilder &B) const { 2568 2569 const LLT S1 = LLT::scalar(1); 2570 const LLT S64 = LLT::scalar(64); 2571 Register Dst = MI.getOperand(0).getReg(); 2572 Register OrigSrc = MI.getOperand(1).getReg(); 2573 unsigned Flags = MI.getFlags(); 2574 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2575 "this should not have been custom lowered"); 2576 2577 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2578 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2579 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2580 // V_FRACT bug is: 2581 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2582 // 2583 // Convert floor(x) to (x - fract(x)) 2584 2585 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2586 .addUse(OrigSrc) 2587 .setMIFlags(Flags); 2588 2589 // Give source modifier matching some assistance before obscuring a foldable 2590 // pattern. 2591 2592 // TODO: We can avoid the neg on the fract? The input sign to fract 2593 // shouldn't matter? 2594 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2595 2596 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2597 2598 Register Min = MRI.createGenericVirtualRegister(S64); 2599 2600 // We don't need to concern ourselves with the snan handling difference, so 2601 // use the one which will directly select. 
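  // Note on the constant above (illustrative): 0x3fefffffffffffff is
  // 0x1.fffffffffffffp-1, the largest f64 strictly below 1.0 (about
  // 1 - 2^-53), so the min() keeps the result strictly below 1.0 in the cases
  // where the buggy V_FRACT would otherwise return 1.0.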
2602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2603 if (MFI->getMode().IEEE) 2604 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2605 else 2606 B.buildFMinNum(Min, Fract, Const, Flags); 2607 2608 Register CorrectedFract = Min; 2609 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2610 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2611 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2612 } 2613 2614 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2615 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2616 2617 MI.eraseFromParent(); 2618 return true; 2619 } 2620 2621 // Turn an illegal packed v2s16 build vector into bit operations. 2622 // TODO: This should probably be a bitcast action in LegalizerHelper. 2623 bool AMDGPULegalizerInfo::legalizeBuildVector( 2624 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2625 Register Dst = MI.getOperand(0).getReg(); 2626 const LLT S32 = LLT::scalar(32); 2627 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2628 2629 Register Src0 = MI.getOperand(1).getReg(); 2630 Register Src1 = MI.getOperand(2).getReg(); 2631 assert(MRI.getType(Src0) == LLT::scalar(16)); 2632 2633 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2634 B.buildBitcast(Dst, Merge); 2635 2636 MI.eraseFromParent(); 2637 return true; 2638 } 2639 2640 // Check that this is a G_XOR x, -1 2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2642 if (MI.getOpcode() != TargetOpcode::G_XOR) 2643 return false; 2644 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2645 return ConstVal && *ConstVal == -1; 2646 } 2647 2648 // Return the use branch instruction, otherwise null if the usage is invalid. 2649 static MachineInstr * 2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2651 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2652 Register CondDef = MI.getOperand(0).getReg(); 2653 if (!MRI.hasOneNonDBGUse(CondDef)) 2654 return nullptr; 2655 2656 MachineBasicBlock *Parent = MI.getParent(); 2657 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2658 2659 if (isNot(MRI, *UseMI)) { 2660 Register NegatedCond = UseMI->getOperand(0).getReg(); 2661 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2662 return nullptr; 2663 2664 // We're deleting the def of this value, so we need to remove it. 2665 UseMI->eraseFromParent(); 2666 2667 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2668 Negated = true; 2669 } 2670 2671 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2672 return nullptr; 2673 2674 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2675 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2676 if (Next == Parent->end()) { 2677 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2678 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2679 return nullptr; 2680 UncondBrTarget = &*NextMBB; 2681 } else { 2682 if (Next->getOpcode() != AMDGPU::G_BR) 2683 return nullptr; 2684 Br = &*Next; 2685 UncondBrTarget = Br->getOperand(0).getMBB(); 2686 } 2687 2688 return UseMI; 2689 } 2690 2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2692 const ArgDescriptor *Arg, 2693 const TargetRegisterClass *ArgRC, 2694 LLT ArgTy) const { 2695 MCRegister SrcReg = Arg->getRegister(); 2696 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2697 assert(DstReg.isVirtual() && "Virtual register expected"); 2698 2699 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2700 ArgTy); 2701 if (Arg->isMasked()) { 2702 // TODO: Should we try to emit this once in the entry block? 2703 const LLT S32 = LLT::scalar(32); 2704 const unsigned Mask = Arg->getMask(); 2705 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2706 2707 Register AndMaskSrc = LiveIn; 2708 2709 if (Shift != 0) { 2710 auto ShiftAmt = B.buildConstant(S32, Shift); 2711 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2712 } 2713 2714 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2715 } else { 2716 B.buildCopy(DstReg, LiveIn); 2717 } 2718 2719 return true; 2720 } 2721 2722 bool AMDGPULegalizerInfo::loadInputValue( 2723 Register DstReg, MachineIRBuilder &B, 2724 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2725 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2726 const ArgDescriptor *Arg; 2727 const TargetRegisterClass *ArgRC; 2728 LLT ArgTy; 2729 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2730 2731 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2732 return false; // TODO: Handle these 2733 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2734 } 2735 2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2737 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2738 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2739 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2740 return false; 2741 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2747 MachineRegisterInfo &MRI, 2748 MachineIRBuilder &B) const { 2749 Register Dst = MI.getOperand(0).getReg(); 2750 LLT DstTy = MRI.getType(Dst); 2751 LLT S16 = LLT::scalar(16); 2752 LLT S32 = LLT::scalar(32); 2753 LLT S64 = LLT::scalar(64); 2754 2755 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2756 return true; 2757 2758 if (DstTy == S16) 2759 return legalizeFDIV16(MI, MRI, B); 2760 if (DstTy == S32) 2761 return legalizeFDIV32(MI, MRI, B); 2762 if (DstTy == S64) 2763 return legalizeFDIV64(MI, MRI, B); 2764 2765 return false; 2766 } 2767 2768 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2769 Register DstReg, 2770 Register X, 2771 Register Y, 2772 bool IsDiv) const { 2773 const LLT S1 = LLT::scalar(1); 2774 const LLT S32 = LLT::scalar(32); 2775 2776 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2777 // algorithm used here. 2778 2779 // Initial estimate of inv(y). 2780 auto FloatY = B.buildUITOFP(S32, Y); 2781 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2782 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2783 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2784 auto Z = B.buildFPTOUI(S32, ScaledY); 2785 2786 // One round of UNR. 
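  // Sketch of why this works (assuming z ~= 2**32 / y with relative error d):
  //   y * z          = 2**32 * (1 - d)
  //   -y * z         = 2**32 * d        (mod 2**32)
  //   umulh(z, -y*z) ~= z * d
  // so z + umulh(z, -y*z) ~= (2**32 / y) * (1 - d*d), squaring the error.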
2787 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2788 auto NegYZ = B.buildMul(S32, NegY, Z); 2789 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2790 2791 // Quotient/remainder estimate. 2792 auto Q = B.buildUMulH(S32, X, Z); 2793 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2794 2795 // First quotient/remainder refinement. 2796 auto One = B.buildConstant(S32, 1); 2797 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2798 if (IsDiv) 2799 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2800 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2801 2802 // Second quotient/remainder refinement. 2803 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2804 if (IsDiv) 2805 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2806 else 2807 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2808 } 2809 2810 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2811 MachineRegisterInfo &MRI, 2812 MachineIRBuilder &B) const { 2813 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2814 Register DstReg = MI.getOperand(0).getReg(); 2815 Register Num = MI.getOperand(1).getReg(); 2816 Register Den = MI.getOperand(2).getReg(); 2817 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2818 MI.eraseFromParent(); 2819 return true; 2820 } 2821 2822 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2823 // 2824 // Return lo, hi of result 2825 // 2826 // %cvt.lo = G_UITOFP Val.lo 2827 // %cvt.hi = G_UITOFP Val.hi 2828 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2829 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2830 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2831 // %mul2 = G_FMUL %mul1, 2**(-32) 2832 // %trunc = G_INTRINSIC_TRUNC %mul2 2833 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2834 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2835 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2836 Register Val) { 2837 const LLT S32 = LLT::scalar(32); 2838 auto Unmerge = B.buildUnmerge(S32, Val); 2839 2840 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2841 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2842 2843 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2844 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2845 2846 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2847 auto Mul1 = 2848 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2849 2850 // 2**(-32) 2851 auto Mul2 = 2852 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2853 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2854 2855 // -(2**32) 2856 auto Mad2 = B.buildFMAD(S32, Trunc, 2857 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2858 2859 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2860 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2861 2862 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2863 } 2864 2865 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2866 Register DstReg, 2867 Register Numer, 2868 Register Denom, 2869 bool IsDiv) const { 2870 const LLT S32 = LLT::scalar(32); 2871 const LLT S64 = LLT::scalar(64); 2872 const LLT S1 = LLT::scalar(1); 2873 Register RcpLo, RcpHi; 2874 2875 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2876 2877 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2878 2879 auto Zero64 = B.buildConstant(S64, 0); 2880 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2881 2882 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2883 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2884 2885 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2886 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2887 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2888 2889 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2890 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2891 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2892 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2893 2894 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2895 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2896 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2897 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2898 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2899 2900 auto Zero32 = B.buildConstant(S32, 0); 2901 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2902 auto Add2_HiC = 2903 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2904 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2905 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2906 2907 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2908 Register NumerLo = UnmergeNumer.getReg(0); 2909 Register NumerHi = UnmergeNumer.getReg(1); 2910 2911 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2912 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2913 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2914 Register Mul3_Lo = UnmergeMul3.getReg(0); 2915 Register Mul3_Hi = UnmergeMul3.getReg(1); 2916 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2917 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2918 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2919 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2920 2921 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2922 Register DenomLo = UnmergeDenom.getReg(0); 2923 Register DenomHi = UnmergeDenom.getReg(1); 2924 2925 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2926 auto C1 = B.buildSExt(S32, CmpHi); 2927 2928 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2929 auto C2 = B.buildSExt(S32, CmpLo); 2930 2931 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2932 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2933 2934 // TODO: Here and below portions of the code can be enclosed into if/endif. 2935 // Currently control flow is unconditional and we have 4 selects after 2936 // potential endif to substitute PHIs. 2937 2938 // if C3 != 0 ... 
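  // Illustrative summary of the correction below (not authoritative):
  // C3/C6 are all-ones when the (once-corrected) remainder is still >= den,
  // and the selects at the end act as the PHIs of the branchy form:
  //   q = C3 ? (C6 ? q + 2 : q + 1) : q
  //   r = C3 ? (C6 ? r - 2*den : r - den) : r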
2939 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2940 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2941 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2942 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2943 2944 auto One64 = B.buildConstant(S64, 1); 2945 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2946 2947 auto C4 = 2948 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2949 auto C5 = 2950 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2951 auto C6 = B.buildSelect( 2952 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2953 2954 // if (C6 != 0) 2955 auto Add4 = B.buildAdd(S64, Add3, One64); 2956 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2957 2958 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2959 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2960 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2961 2962 // endif C6 2963 // endif C3 2964 2965 if (IsDiv) { 2966 auto Sel1 = B.buildSelect( 2967 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2968 B.buildSelect(DstReg, 2969 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2970 } else { 2971 auto Sel2 = B.buildSelect( 2972 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2973 B.buildSelect(DstReg, 2974 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2975 } 2976 } 2977 2978 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2979 MachineRegisterInfo &MRI, 2980 MachineIRBuilder &B) const { 2981 const LLT S64 = LLT::scalar(64); 2982 const LLT S32 = LLT::scalar(32); 2983 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2984 Register DstReg = MI.getOperand(0).getReg(); 2985 Register Num = MI.getOperand(1).getReg(); 2986 Register Den = MI.getOperand(2).getReg(); 2987 LLT Ty = MRI.getType(DstReg); 2988 2989 if (Ty == S32) 2990 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2991 else if (Ty == S64) 2992 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2993 else 2994 return false; 2995 2996 MI.eraseFromParent(); 2997 return true; 2998 2999 } 3000 3001 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3002 MachineRegisterInfo &MRI, 3003 MachineIRBuilder &B) const { 3004 const LLT S64 = LLT::scalar(64); 3005 const LLT S32 = LLT::scalar(32); 3006 3007 Register DstReg = MI.getOperand(0).getReg(); 3008 const LLT Ty = MRI.getType(DstReg); 3009 if (Ty != S32 && Ty != S64) 3010 return false; 3011 3012 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3013 3014 Register LHS = MI.getOperand(1).getReg(); 3015 Register RHS = MI.getOperand(2).getReg(); 3016 3017 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3018 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3019 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3020 3021 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3022 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3023 3024 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3025 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3026 3027 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3028 if (Ty == S32) 3029 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3030 else 3031 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3032 3033 Register Sign; 3034 if (IsDiv) 3035 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3036 else 3037 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3038 3039 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3040 B.buildSub(DstReg, UDivRem, Sign); 3041 3042 MI.eraseFromParent(); 3043 return true; 3044 } 3045 3046 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3047 MachineRegisterInfo &MRI, 3048 MachineIRBuilder &B) const { 3049 Register Res = MI.getOperand(0).getReg(); 3050 Register LHS = MI.getOperand(1).getReg(); 3051 Register RHS = MI.getOperand(2).getReg(); 3052 uint16_t Flags = MI.getFlags(); 3053 LLT ResTy = MRI.getType(Res); 3054 3055 const MachineFunction &MF = B.getMF(); 3056 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3057 MI.getFlag(MachineInstr::FmAfn); 3058 3059 if (!AllowInaccurateRcp) 3060 return false; 3061 3062 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3063 // 1 / x -> RCP(x) 3064 if (CLHS->isExactlyValue(1.0)) { 3065 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3066 .addUse(RHS) 3067 .setMIFlags(Flags); 3068 3069 MI.eraseFromParent(); 3070 return true; 3071 } 3072 3073 // -1 / x -> RCP( FNEG(x) ) 3074 if (CLHS->isExactlyValue(-1.0)) { 3075 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3076 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3077 .addUse(FNeg.getReg(0)) 3078 .setMIFlags(Flags); 3079 3080 MI.eraseFromParent(); 3081 return true; 3082 } 3083 } 3084 3085 // x / y -> x * (1.0 / y) 3086 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3087 .addUse(RHS) 3088 .setMIFlags(Flags); 3089 B.buildFMul(Res, LHS, RCP, Flags); 3090 3091 MI.eraseFromParent(); 3092 return true; 3093 } 3094 3095 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3096 MachineRegisterInfo &MRI, 3097 MachineIRBuilder &B) const { 3098 Register Res = MI.getOperand(0).getReg(); 3099 Register LHS = MI.getOperand(1).getReg(); 3100 Register RHS = MI.getOperand(2).getReg(); 3101 3102 uint16_t Flags = MI.getFlags(); 3103 3104 LLT S16 = LLT::scalar(16); 3105 LLT S32 = LLT::scalar(32); 3106 3107 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3108 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3109 3110 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3111 .addUse(RHSExt.getReg(0)) 3112 .setMIFlags(Flags); 3113 3114 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3115 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3116 3117 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3118 .addUse(RDst.getReg(0)) 3119 .addUse(RHS) 3120 .addUse(LHS) 3121 .setMIFlags(Flags); 3122 3123 MI.eraseFromParent(); 3124 return true; 3125 } 3126 3127 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3128 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3129 static void toggleSPDenormMode(bool Enable, 3130 MachineIRBuilder &B, 3131 const GCNSubtarget &ST, 3132 AMDGPU::SIModeRegisterDefaults Mode) { 3133 // Set SP denorm mode to this value. 3134 unsigned SPDenormMode = 3135 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3136 3137 if (ST.hasDenormModeInst()) { 3138 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3139 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3140 3141 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3142 B.buildInstr(AMDGPU::S_DENORM_MODE) 3143 .addImm(NewDenormModeValue); 3144 3145 } else { 3146 // Select FP32 bit field in mode register. 
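    // For illustration: OFFSET = 4 and WIDTH_M1 = 1 select the 2-bit field at
    // MODE[5:4], which holds the FP32 denormal controls, so the s_setreg
    // below leaves the FP64/FP16 denormal bits (MODE[7:6]) untouched.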
3147 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3148 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3149 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3150 3151 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3152 .addImm(SPDenormMode) 3153 .addImm(SPDenormModeBitField); 3154 } 3155 } 3156 3157 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3158 MachineRegisterInfo &MRI, 3159 MachineIRBuilder &B) const { 3160 Register Res = MI.getOperand(0).getReg(); 3161 Register LHS = MI.getOperand(1).getReg(); 3162 Register RHS = MI.getOperand(2).getReg(); 3163 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3164 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3165 3166 uint16_t Flags = MI.getFlags(); 3167 3168 LLT S32 = LLT::scalar(32); 3169 LLT S1 = LLT::scalar(1); 3170 3171 auto One = B.buildFConstant(S32, 1.0f); 3172 3173 auto DenominatorScaled = 3174 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3175 .addUse(LHS) 3176 .addUse(RHS) 3177 .addImm(0) 3178 .setMIFlags(Flags); 3179 auto NumeratorScaled = 3180 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3181 .addUse(LHS) 3182 .addUse(RHS) 3183 .addImm(1) 3184 .setMIFlags(Flags); 3185 3186 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3187 .addUse(DenominatorScaled.getReg(0)) 3188 .setMIFlags(Flags); 3189 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3190 3191 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3192 // aren't modeled as reading it. 3193 if (!Mode.allFP32Denormals()) 3194 toggleSPDenormMode(true, B, ST, Mode); 3195 3196 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3197 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3198 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3199 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3200 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3201 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3202 3203 if (!Mode.allFP32Denormals()) 3204 toggleSPDenormMode(false, B, ST, Mode); 3205 3206 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3207 .addUse(Fma4.getReg(0)) 3208 .addUse(Fma1.getReg(0)) 3209 .addUse(Fma3.getReg(0)) 3210 .addUse(NumeratorScaled.getReg(1)) 3211 .setMIFlags(Flags); 3212 3213 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3214 .addUse(Fmas.getReg(0)) 3215 .addUse(RHS) 3216 .addUse(LHS) 3217 .setMIFlags(Flags); 3218 3219 MI.eraseFromParent(); 3220 return true; 3221 } 3222 3223 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3224 MachineRegisterInfo &MRI, 3225 MachineIRBuilder &B) const { 3226 Register Res = MI.getOperand(0).getReg(); 3227 Register LHS = MI.getOperand(1).getReg(); 3228 Register RHS = MI.getOperand(2).getReg(); 3229 3230 uint16_t Flags = MI.getFlags(); 3231 3232 LLT S64 = LLT::scalar(64); 3233 LLT S1 = LLT::scalar(1); 3234 3235 auto One = B.buildFConstant(S64, 1.0); 3236 3237 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3238 .addUse(LHS) 3239 .addUse(RHS) 3240 .addImm(0) 3241 .setMIFlags(Flags); 3242 3243 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3244 3245 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3246 .addUse(DivScale0.getReg(0)) 3247 .setMIFlags(Flags); 3248 3249 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3250 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3251 auto Fma2 = B.buildFMA(S64, 
NegDivScale0, Fma1, One, Flags); 3252 3253 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3254 .addUse(LHS) 3255 .addUse(RHS) 3256 .addImm(1) 3257 .setMIFlags(Flags); 3258 3259 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3260 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3261 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3262 3263 Register Scale; 3264 if (!ST.hasUsableDivScaleConditionOutput()) { 3265 // Workaround a hardware bug on SI where the condition output from div_scale 3266 // is not usable. 3267 3268 LLT S32 = LLT::scalar(32); 3269 3270 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3271 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3272 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3273 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3274 3275 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3276 Scale1Unmerge.getReg(1)); 3277 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3278 Scale0Unmerge.getReg(1)); 3279 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3280 } else { 3281 Scale = DivScale1.getReg(1); 3282 } 3283 3284 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3285 .addUse(Fma4.getReg(0)) 3286 .addUse(Fma3.getReg(0)) 3287 .addUse(Mul.getReg(0)) 3288 .addUse(Scale) 3289 .setMIFlags(Flags); 3290 3291 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3292 .addUse(Fmas.getReg(0)) 3293 .addUse(RHS) 3294 .addUse(LHS) 3295 .setMIFlags(Flags); 3296 3297 MI.eraseFromParent(); 3298 return true; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3302 MachineRegisterInfo &MRI, 3303 MachineIRBuilder &B) const { 3304 Register Res = MI.getOperand(0).getReg(); 3305 Register LHS = MI.getOperand(2).getReg(); 3306 Register RHS = MI.getOperand(3).getReg(); 3307 uint16_t Flags = MI.getFlags(); 3308 3309 LLT S32 = LLT::scalar(32); 3310 LLT S1 = LLT::scalar(1); 3311 3312 auto Abs = B.buildFAbs(S32, RHS, Flags); 3313 const APFloat C0Val(1.0f); 3314 3315 auto C0 = B.buildConstant(S32, 0x6f800000); 3316 auto C1 = B.buildConstant(S32, 0x2f800000); 3317 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3318 3319 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3320 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3321 3322 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3323 3324 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3325 .addUse(Mul0.getReg(0)) 3326 .setMIFlags(Flags); 3327 3328 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3329 3330 B.buildFMul(Res, Sel, Mul1, Flags); 3331 3332 MI.eraseFromParent(); 3333 return true; 3334 } 3335 3336 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3337 // FIXME: Why do we handle this one but not other removed instructions? 3338 // 3339 // Reciprocal square root. The clamp prevents infinite results, clamping 3340 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3341 // +-max_float. 
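// A rough sketch of the expansion used below (illustrative only):
//   %rsq     = llvm.amdgcn.rsq(%src)
//   %clamped = fminnum(%rsq, +max_float)   ; *_IEEE variants in IEEE mode
//   %result  = fmaxnum(%clamped, -max_float)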
3342 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3343 MachineRegisterInfo &MRI, 3344 MachineIRBuilder &B) const { 3345 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3346 return true; 3347 3348 Register Dst = MI.getOperand(0).getReg(); 3349 Register Src = MI.getOperand(2).getReg(); 3350 auto Flags = MI.getFlags(); 3351 3352 LLT Ty = MRI.getType(Dst); 3353 3354 const fltSemantics *FltSemantics; 3355 if (Ty == LLT::scalar(32)) 3356 FltSemantics = &APFloat::IEEEsingle(); 3357 else if (Ty == LLT::scalar(64)) 3358 FltSemantics = &APFloat::IEEEdouble(); 3359 else 3360 return false; 3361 3362 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3363 .addUse(Src) 3364 .setMIFlags(Flags); 3365 3366 // We don't need to concern ourselves with the snan handling difference, since 3367 // the rsq quieted (or not) so use the one which will directly select. 3368 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3369 const bool UseIEEE = MFI->getMode().IEEE; 3370 3371 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3372 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3373 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3374 3375 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3376 3377 if (UseIEEE) 3378 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3379 else 3380 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3381 MI.eraseFromParent(); 3382 return true; 3383 } 3384 3385 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3386 switch (IID) { 3387 case Intrinsic::amdgcn_ds_fadd: 3388 return AMDGPU::G_ATOMICRMW_FADD; 3389 case Intrinsic::amdgcn_ds_fmin: 3390 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3391 case Intrinsic::amdgcn_ds_fmax: 3392 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3393 default: 3394 llvm_unreachable("not a DS FP intrinsic"); 3395 } 3396 } 3397 3398 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3399 MachineInstr &MI, 3400 Intrinsic::ID IID) const { 3401 GISelChangeObserver &Observer = Helper.Observer; 3402 Observer.changingInstr(MI); 3403 3404 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3405 3406 // The remaining operands were used to set fields in the MemOperand on 3407 // construction. 3408 for (int I = 6; I > 3; --I) 3409 MI.RemoveOperand(I); 3410 3411 MI.RemoveOperand(1); // Remove the intrinsic ID. 
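  // Rough sketch of the rewrite performed above (operand layout of the
  // original intrinsic form is an assumption here):
  //   %val = G_INTRINSIC_W_SIDE_EFFECTS @llvm.amdgcn.ds.fadd, %ptr, %data,
  //          <ordering>, <scope>, <isVolatile>
  // becomes
  //   %val = G_ATOMICRMW_FADD %ptr, %data
  // with the trailing immediates already reflected in the MachineMemOperand.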
3412 Observer.changedInstr(MI); 3413 return true; 3414 } 3415 3416 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3417 MachineRegisterInfo &MRI, 3418 MachineIRBuilder &B) const { 3419 uint64_t Offset = 3420 ST.getTargetLowering()->getImplicitParameterOffset( 3421 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3422 LLT DstTy = MRI.getType(DstReg); 3423 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3424 3425 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3426 if (!loadInputValue(KernargPtrReg, B, 3427 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3428 return false; 3429 3430 // FIXME: This should be nuw 3431 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3432 return true; 3433 } 3434 3435 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3436 MachineRegisterInfo &MRI, 3437 MachineIRBuilder &B) const { 3438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3439 if (!MFI->isEntryFunction()) { 3440 return legalizePreloadedArgIntrin(MI, MRI, B, 3441 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3442 } 3443 3444 Register DstReg = MI.getOperand(0).getReg(); 3445 if (!getImplicitArgPtr(DstReg, MRI, B)) 3446 return false; 3447 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3453 MachineRegisterInfo &MRI, 3454 MachineIRBuilder &B, 3455 unsigned AddrSpace) const { 3456 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3457 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3458 Register Hi32 = Unmerge.getReg(1); 3459 3460 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3461 MI.eraseFromParent(); 3462 return true; 3463 } 3464 3465 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3466 // offset (the offset that is included in bounds checking and swizzling, to be 3467 // split between the instruction's voffset and immoffset fields) and soffset 3468 // (the offset that is excluded from bounds checking and swizzling, to go in 3469 // the instruction's soffset field). This function takes the first kind of 3470 // offset and figures out how to split it between voffset and immoffset. 3471 std::tuple<Register, unsigned, unsigned> 3472 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3473 Register OrigOffset) const { 3474 const unsigned MaxImm = 4095; 3475 Register BaseReg; 3476 unsigned TotalConstOffset; 3477 const LLT S32 = LLT::scalar(32); 3478 3479 std::tie(BaseReg, TotalConstOffset) = 3480 AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3481 3482 unsigned ImmOffset = TotalConstOffset; 3483 3484 // If the immediate value is too big for the immoffset field, put the value 3485 // and -4096 into the immoffset field so that the value that is copied/added 3486 // for the voffset field is a multiple of 4096, and it stands more chance 3487 // of being CSEd with the copy/add for another similar load/store. 3488 // However, do not do that rounding down to a multiple of 4096 if that is a 3489 // negative number, as it appears to be illegal to have a negative offset 3490 // in the vgpr, even if adding the immediate offset makes it positive. 
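  // Worked example (illustrative values): a combined constant offset of 8200
  // yields Overflow = 8192 and ImmOffset = 8, so the voffset add built below
  // is the 4096-aligned 8192 while 8 goes in the immoffset field. A constant
  // offset of 20 fits entirely in immoffset and requires no add at all.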
3491 unsigned Overflow = ImmOffset & ~MaxImm; 3492 ImmOffset -= Overflow; 3493 if ((int32_t)Overflow < 0) { 3494 Overflow += ImmOffset; 3495 ImmOffset = 0; 3496 } 3497 3498 if (Overflow != 0) { 3499 if (!BaseReg) { 3500 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3501 } else { 3502 auto OverflowVal = B.buildConstant(S32, Overflow); 3503 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3504 } 3505 } 3506 3507 if (!BaseReg) 3508 BaseReg = B.buildConstant(S32, 0).getReg(0); 3509 3510 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3511 } 3512 3513 /// Handle register layout difference for f16 images for some subtargets. 3514 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3515 MachineRegisterInfo &MRI, 3516 Register Reg, 3517 bool ImageStore) const { 3518 const LLT S16 = LLT::scalar(16); 3519 const LLT S32 = LLT::scalar(32); 3520 LLT StoreVT = MRI.getType(Reg); 3521 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3522 3523 if (ST.hasUnpackedD16VMem()) { 3524 auto Unmerge = B.buildUnmerge(S16, Reg); 3525 3526 SmallVector<Register, 4> WideRegs; 3527 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3528 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3529 3530 int NumElts = StoreVT.getNumElements(); 3531 3532 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3533 } 3534 3535 if (ImageStore && ST.hasImageStoreD16Bug()) { 3536 if (StoreVT.getNumElements() == 2) { 3537 SmallVector<Register, 4> PackedRegs; 3538 Reg = B.buildBitcast(S32, Reg).getReg(0); 3539 PackedRegs.push_back(Reg); 3540 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3541 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3542 } 3543 3544 if (StoreVT.getNumElements() == 3) { 3545 SmallVector<Register, 4> PackedRegs; 3546 auto Unmerge = B.buildUnmerge(S16, Reg); 3547 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3548 PackedRegs.push_back(Unmerge.getReg(I)); 3549 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3550 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3551 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3552 } 3553 3554 if (StoreVT.getNumElements() == 4) { 3555 SmallVector<Register, 4> PackedRegs; 3556 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3557 auto Unmerge = B.buildUnmerge(S32, Reg); 3558 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3559 PackedRegs.push_back(Unmerge.getReg(I)); 3560 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3561 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3562 } 3563 3564 llvm_unreachable("invalid data type"); 3565 } 3566 3567 return Reg; 3568 } 3569 3570 Register AMDGPULegalizerInfo::fixStoreSourceType( 3571 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3572 MachineRegisterInfo *MRI = B.getMRI(); 3573 LLT Ty = MRI->getType(VData); 3574 3575 const LLT S16 = LLT::scalar(16); 3576 3577 // Fixup illegal register types for i8 stores. 
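  // For instance, an s8 or s16 store value is any-extended to s32 here; the
  // byte/short buffer store opcode chosen later from MemSize is assumed to
  // consume only the low bits, so the extended high bits are don't-care.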
3578 if (Ty == LLT::scalar(8) || Ty == S16) { 3579 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3580 return AnyExt; 3581 } 3582 3583 if (Ty.isVector()) { 3584 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3585 if (IsFormat) 3586 return handleD16VData(B, *MRI, VData); 3587 } 3588 } 3589 3590 return VData; 3591 } 3592 3593 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3594 MachineRegisterInfo &MRI, 3595 MachineIRBuilder &B, 3596 bool IsTyped, 3597 bool IsFormat) const { 3598 Register VData = MI.getOperand(1).getReg(); 3599 LLT Ty = MRI.getType(VData); 3600 LLT EltTy = Ty.getScalarType(); 3601 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3602 const LLT S32 = LLT::scalar(32); 3603 3604 VData = fixStoreSourceType(B, VData, IsFormat); 3605 Register RSrc = MI.getOperand(2).getReg(); 3606 3607 MachineMemOperand *MMO = *MI.memoperands_begin(); 3608 const int MemSize = MMO->getSize(); 3609 3610 unsigned ImmOffset; 3611 unsigned TotalOffset; 3612 3613 // The typed intrinsics add an immediate after the registers. 3614 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3615 3616 // The struct intrinsic variants add one additional operand over raw. 3617 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3618 Register VIndex; 3619 int OpOffset = 0; 3620 if (HasVIndex) { 3621 VIndex = MI.getOperand(3).getReg(); 3622 OpOffset = 1; 3623 } 3624 3625 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3626 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3627 3628 unsigned Format = 0; 3629 if (IsTyped) { 3630 Format = MI.getOperand(5 + OpOffset).getImm(); 3631 ++OpOffset; 3632 } 3633 3634 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3635 3636 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3637 if (TotalOffset != 0) 3638 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3639 3640 unsigned Opc; 3641 if (IsTyped) { 3642 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3643 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3644 } else if (IsFormat) { 3645 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3646 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3647 } else { 3648 switch (MemSize) { 3649 case 1: 3650 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3651 break; 3652 case 2: 3653 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3654 break; 3655 default: 3656 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3657 break; 3658 } 3659 } 3660 3661 if (!VIndex) 3662 VIndex = B.buildConstant(S32, 0).getReg(0); 3663 3664 auto MIB = B.buildInstr(Opc) 3665 .addUse(VData) // vdata 3666 .addUse(RSrc) // rsrc 3667 .addUse(VIndex) // vindex 3668 .addUse(VOffset) // voffset 3669 .addUse(SOffset) // soffset 3670 .addImm(ImmOffset); // offset(imm) 3671 3672 if (IsTyped) 3673 MIB.addImm(Format); 3674 3675 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3676 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3677 .addMemOperand(MMO); 3678 3679 MI.eraseFromParent(); 3680 return true; 3681 } 3682 3683 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3684 MachineRegisterInfo &MRI, 3685 MachineIRBuilder &B, 3686 bool IsFormat, 3687 bool IsTyped) const { 3688 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3689 MachineMemOperand *MMO = *MI.memoperands_begin(); 3690 const int MemSize = MMO->getSize(); 3691 const LLT S32 = LLT::scalar(32); 3692 3693 Register Dst = MI.getOperand(0).getReg(); 3694 Register RSrc = MI.getOperand(2).getReg(); 3695 3696 // The typed intrinsics add an immediate after the registers. 3697 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3698 3699 // The struct intrinsic variants add one additional operand over raw. 3700 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3701 Register VIndex; 3702 int OpOffset = 0; 3703 if (HasVIndex) { 3704 VIndex = MI.getOperand(3).getReg(); 3705 OpOffset = 1; 3706 } 3707 3708 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3709 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3710 3711 unsigned Format = 0; 3712 if (IsTyped) { 3713 Format = MI.getOperand(5 + OpOffset).getImm(); 3714 ++OpOffset; 3715 } 3716 3717 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3718 unsigned ImmOffset; 3719 unsigned TotalOffset; 3720 3721 LLT Ty = MRI.getType(Dst); 3722 LLT EltTy = Ty.getScalarType(); 3723 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3724 const bool Unpacked = ST.hasUnpackedD16VMem(); 3725 3726 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3727 if (TotalOffset != 0) 3728 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3729 3730 unsigned Opc; 3731 3732 if (IsTyped) { 3733 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3734 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3735 } else if (IsFormat) { 3736 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3737 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3738 } else { 3739 switch (MemSize) { 3740 case 1: 3741 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3742 break; 3743 case 2: 3744 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3745 break; 3746 default: 3747 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3748 break; 3749 } 3750 } 3751 3752 Register LoadDstReg; 3753 3754 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3755 LLT UnpackedTy = Ty.changeElementSize(32); 3756 3757 if (IsExtLoad) 3758 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3759 else if (Unpacked && IsD16 && Ty.isVector()) 3760 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3761 else 3762 LoadDstReg = Dst; 3763 3764 if (!VIndex) 3765 VIndex = B.buildConstant(S32, 0).getReg(0); 3766 3767 auto MIB = B.buildInstr(Opc) 3768 .addDef(LoadDstReg) // vdata 3769 .addUse(RSrc) // rsrc 3770 .addUse(VIndex) // vindex 3771 .addUse(VOffset) // voffset 3772 .addUse(SOffset) // soffset 3773 .addImm(ImmOffset); // offset(imm) 3774 3775 if (IsTyped) 3776 MIB.addImm(Format); 3777 3778 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3779 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3780 .addMemOperand(MMO); 3781 3782 if (LoadDstReg != Dst) { 3783 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3784 3785 // Widen result for extending loads was widened. 
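    // Illustrative cases for the fixup below: an s8/s16 extending load was
    // widened to s32 and is truncated back here; a d16 vector load on an
    // unpacked subtarget (e.g. <4 x s16> loaded as <4 x s32>) is truncated
    // element by element and re-merged into the original vector type.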
3786 if (IsExtLoad) 3787 B.buildTrunc(Dst, LoadDstReg); 3788 else { 3789 // Repack to original 16-bit vector result 3790 // FIXME: G_TRUNC should work, but legalization currently fails 3791 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3792 SmallVector<Register, 4> Repack; 3793 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3794 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3795 B.buildMerge(Dst, Repack); 3796 } 3797 } 3798 3799 MI.eraseFromParent(); 3800 return true; 3801 } 3802 3803 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3804 MachineIRBuilder &B, 3805 bool IsInc) const { 3806 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3807 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3808 B.buildInstr(Opc) 3809 .addDef(MI.getOperand(0).getReg()) 3810 .addUse(MI.getOperand(2).getReg()) 3811 .addUse(MI.getOperand(3).getReg()) 3812 .cloneMemRefs(MI); 3813 MI.eraseFromParent(); 3814 return true; 3815 } 3816 3817 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3818 switch (IntrID) { 3819 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3820 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3821 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3822 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3823 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3824 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3825 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3826 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3827 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3828 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3829 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3830 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3831 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3832 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3833 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3834 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3835 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3836 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3837 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3838 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3839 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3840 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3841 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3842 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3843 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3844 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3846 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3847 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3848 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3849 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3850 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3851 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3852 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3853 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3854 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3855 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3856 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3857 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3858 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3859 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3861 default: 3862 llvm_unreachable("unhandled atomic opcode"); 3863 } 3864 } 3865 3866 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3867 MachineIRBuilder &B, 3868 Intrinsic::ID IID) const { 3869 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3870 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3871 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3872 3873 Register Dst; 3874 3875 int OpOffset = 0; 3876 if (HasReturn) { 3877 // A few FP atomics do not support return values. 3878 Dst = MI.getOperand(0).getReg(); 3879 } else { 3880 OpOffset = -1; 3881 } 3882 3883 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3884 Register CmpVal; 3885 3886 if (IsCmpSwap) { 3887 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3888 ++OpOffset; 3889 } 3890 3891 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3892 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3893 3894 // The struct intrinsic variants add one additional operand over raw. 3895 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3896 Register VIndex; 3897 if (HasVIndex) { 3898 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3899 ++OpOffset; 3900 } 3901 3902 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3903 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3904 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3905 3906 MachineMemOperand *MMO = *MI.memoperands_begin(); 3907 3908 unsigned ImmOffset; 3909 unsigned TotalOffset; 3910 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3911 if (TotalOffset != 0) 3912 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3913 3914 if (!VIndex) 3915 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3916 3917 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3918 3919 if (HasReturn) 3920 MIB.addDef(Dst); 3921 3922 MIB.addUse(VData); // vdata 3923 3924 if (IsCmpSwap) 3925 MIB.addReg(CmpVal); 3926 3927 MIB.addUse(RSrc) // rsrc 3928 .addUse(VIndex) // vindex 3929 .addUse(VOffset) // voffset 3930 .addUse(SOffset) // soffset 3931 .addImm(ImmOffset) // offset(imm) 3932 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3933 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3934 .addMemOperand(MMO); 3935 3936 MI.eraseFromParent(); 3937 return true; 3938 } 3939 3940 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3941 /// vector with s16 typed elements. 3942 static void packImageA16AddressToDwords( 3943 MachineIRBuilder &B, MachineInstr &MI, 3944 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 3945 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 3946 const LLT S16 = LLT::scalar(16); 3947 const LLT V2S16 = LLT::vector(2, 16); 3948 3949 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 3950 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 3951 if (!SrcOp.isReg()) 3952 continue; // _L to _LZ may have eliminated this. 3953 3954 Register AddrReg = SrcOp.getReg(); 3955 3956 if (I < Intr->GradientStart) { 3957 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3958 PackedAddrs.push_back(AddrReg); 3959 } else { 3960 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3961 // derivatives dx/dh and dx/dv are packed with undef. 
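      // Illustrative packing: for a 2D sample the four gradients
      // (dx/dh, dy/dh, dx/dv, dy/dv) form two v2s16 registers, while in 1D
      // (NumGradients == 2, so NumGradients / 2 is odd) dx/dh and dx/dv each
      // get an undef high half via the first branch below.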
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4045 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4046 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4047 4048 MachineRegisterInfo *MRI = B.getMRI(); 4049 const LLT S32 = LLT::scalar(32); 4050 const LLT S16 = LLT::scalar(16); 4051 const LLT V2S16 = LLT::vector(2, 16); 4052 4053 unsigned DMask = 0; 4054 4055 // Check for 16 bit addresses and pack if true. 4056 LLT GradTy = 4057 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4058 LLT AddrTy = 4059 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4060 const bool IsG16 = GradTy == S16; 4061 const bool IsA16 = AddrTy == S16; 4062 4063 int DMaskLanes = 0; 4064 if (!BaseOpcode->Atomic) { 4065 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4066 if (BaseOpcode->Gather4) { 4067 DMaskLanes = 4; 4068 } else if (DMask != 0) { 4069 DMaskLanes = countPopulation(DMask); 4070 } else if (!IsTFE && !BaseOpcode->Store) { 4071 // If dmask is 0, this is a no-op load. This can be eliminated. 4072 B.buildUndef(MI.getOperand(0)); 4073 MI.eraseFromParent(); 4074 return true; 4075 } 4076 } 4077 4078 Observer.changingInstr(MI); 4079 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4080 4081 unsigned NewOpcode = NumDefs == 0 ? 4082 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4083 4084 // Track that we legalized this 4085 MI.setDesc(B.getTII().get(NewOpcode)); 4086 4087 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4088 // dmask to be at least 1 otherwise the instruction will fail 4089 if (IsTFE && DMask == 0) { 4090 DMask = 0x1; 4091 DMaskLanes = 1; 4092 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4093 } 4094 4095 if (BaseOpcode->Atomic) { 4096 Register VData0 = MI.getOperand(2).getReg(); 4097 LLT Ty = MRI->getType(VData0); 4098 4099 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4100 if (Ty.isVector()) 4101 return false; 4102 4103 if (BaseOpcode->AtomicX2) { 4104 Register VData1 = MI.getOperand(3).getReg(); 4105 // The two values are packed in one register. 4106 LLT PackedTy = LLT::vector(2, Ty); 4107 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4108 MI.getOperand(2).setReg(Concat.getReg(0)); 4109 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4110 } 4111 } 4112 4113 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4114 4115 // Optimize _L to _LZ when _L is zero 4116 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4117 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4118 const ConstantFP *ConstantLod; 4119 4120 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4121 m_GFCst(ConstantLod))) { 4122 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4123 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4124 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4125 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4126 Intr->Dim); 4127 4128 // The starting indexes should remain in the same place. 
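        // Illustrative effect: an image.sample.l with a constant lod of 0.0
        // (or a negative lod) is rewritten to the corresponding
        // image.sample.lz intrinsic and its now-dead lod operand is removed,
        // which is why the vaddr count is decremented below.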
4129 --CorrectedNumVAddrs; 4130 4131 MI.getOperand(MI.getNumExplicitDefs()) 4132 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4133 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4134 Intr = NewImageDimIntr; 4135 } 4136 } 4137 } 4138 4139 // Optimize _mip away, when 'lod' is zero 4140 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4141 int64_t ConstantLod; 4142 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4143 m_ICst(ConstantLod))) { 4144 if (ConstantLod == 0) { 4145 // TODO: Change intrinsic opcode and remove operand instead or replacing 4146 // it with 0, as the _L to _LZ handling is done above. 4147 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4148 --CorrectedNumVAddrs; 4149 } 4150 } 4151 } 4152 4153 // Rewrite the addressing register layout before doing anything else. 4154 if (IsA16 || IsG16) { 4155 if (IsA16) { 4156 // Target must support the feature and gradients need to be 16 bit too 4157 if (!ST.hasA16() || !IsG16) 4158 return false; 4159 } else if (!ST.hasG16()) 4160 return false; 4161 4162 if (Intr->NumVAddrs > 1) { 4163 SmallVector<Register, 4> PackedRegs; 4164 // Don't compress addresses for G16 4165 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4166 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4167 PackEndIdx); 4168 4169 if (!IsA16) { 4170 // Add uncompressed address 4171 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4172 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4173 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4174 PackedRegs.push_back(AddrReg); 4175 } 4176 } 4177 4178 // See also below in the non-a16 branch 4179 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4180 4181 if (!UseNSA && PackedRegs.size() > 1) { 4182 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4183 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4184 PackedRegs[0] = Concat.getReg(0); 4185 PackedRegs.resize(1); 4186 } 4187 4188 const unsigned NumPacked = PackedRegs.size(); 4189 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4190 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4191 if (!SrcOp.isReg()) { 4192 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4193 continue; 4194 } 4195 4196 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4197 4198 if (I - Intr->VAddrStart < NumPacked) 4199 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4200 else 4201 SrcOp.setReg(AMDGPU::NoRegister); 4202 } 4203 } 4204 } else { 4205 // If the register allocator cannot place the address registers contiguously 4206 // without introducing moves, then using the non-sequential address encoding 4207 // is always preferable, since it saves VALU instructions and is usually a 4208 // wash in terms of code size or even better. 4209 // 4210 // However, we currently have no way of hinting to the register allocator 4211 // that MIMG addresses should be placed contiguously when it is possible to 4212 // do so, so force non-NSA for the common 2-address case as a heuristic. 4213 // 4214 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4215 // allocation when possible. 
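    // Illustrative outcome of the heuristic: a basic 2D sample with only two
    // address registers keeps the packed, contiguous vaddr form even on
    // NSA-capable subtargets, while three or more address registers use the
    // NSA encoding when the subtarget supports it.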
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
4315 MI.RemoveOperand(1); 4316 4317 // Handle the easy case that requires no repack instructions. 4318 if (Ty == S32) { 4319 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4320 return true; 4321 } 4322 } 4323 4324 // Now figure out how to copy the new result register back into the old 4325 // result. 4326 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4327 4328 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4329 4330 if (ResultNumRegs == 1) { 4331 assert(!IsTFE); 4332 ResultRegs[0] = NewResultReg; 4333 } else { 4334 // We have to repack into a new vector of some kind. 4335 for (int I = 0; I != NumDataRegs; ++I) 4336 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4337 B.buildUnmerge(ResultRegs, NewResultReg); 4338 4339 // Drop the final TFE element to get the data part. The TFE result is 4340 // directly written to the right place already. 4341 if (IsTFE) 4342 ResultRegs.resize(NumDataRegs); 4343 } 4344 4345 // For an s16 scalar result, we form an s32 result with a truncate regardless 4346 // of packed vs. unpacked. 4347 if (IsD16 && !Ty.isVector()) { 4348 B.buildTrunc(DstReg, ResultRegs[0]); 4349 return true; 4350 } 4351 4352 // Avoid a build/concat_vector of 1 entry. 4353 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4354 B.buildBitcast(DstReg, ResultRegs[0]); 4355 return true; 4356 } 4357 4358 assert(Ty.isVector()); 4359 4360 if (IsD16) { 4361 // For packed D16 results with TFE enabled, all the data components are 4362 // S32. Cast back to the expected type. 4363 // 4364 // TODO: We don't really need to use load s32 elements. We would only need one 4365 // cast for the TFE result if a multiple of v2s16 was used. 4366 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4367 for (Register &Reg : ResultRegs) 4368 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4369 } else if (ST.hasUnpackedD16VMem()) { 4370 for (Register &Reg : ResultRegs) 4371 Reg = B.buildTrunc(S16, Reg).getReg(0); 4372 } 4373 } 4374 4375 auto padWithUndef = [&](LLT Ty, int NumElts) { 4376 if (NumElts == 0) 4377 return; 4378 Register Undef = B.buildUndef(Ty).getReg(0); 4379 for (int I = 0; I != NumElts; ++I) 4380 ResultRegs.push_back(Undef); 4381 }; 4382 4383 // Pad out any elements eliminated due to the dmask. 4384 LLT ResTy = MRI->getType(ResultRegs[0]); 4385 if (!ResTy.isVector()) { 4386 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4387 B.buildBuildVector(DstReg, ResultRegs); 4388 return true; 4389 } 4390 4391 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4392 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4393 4394 // Deal with the one annoying legal case. 
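  // For example, a <3 x s16> result (48 bits) only needs RegsToCover == 2, but
  // is rebuilt here from three v2s16 pieces: pad by one extra register, concat
  // into <6 x s16>, then unmerge and discard the upper <3 x s16> half.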
4395 const LLT V3S16 = LLT::vector(3, 16); 4396 if (Ty == V3S16) { 4397 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4398 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4399 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4400 return true; 4401 } 4402 4403 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4404 B.buildConcatVectors(DstReg, ResultRegs); 4405 return true; 4406 } 4407 4408 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4409 LegalizerHelper &Helper, MachineInstr &MI) const { 4410 MachineIRBuilder &B = Helper.MIRBuilder; 4411 GISelChangeObserver &Observer = Helper.Observer; 4412 4413 Register Dst = MI.getOperand(0).getReg(); 4414 LLT Ty = B.getMRI()->getType(Dst); 4415 unsigned Size = Ty.getSizeInBits(); 4416 MachineFunction &MF = B.getMF(); 4417 4418 Observer.changingInstr(MI); 4419 4420 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4421 Ty = getBitcastRegisterType(Ty); 4422 Helper.bitcastDst(MI, Ty, 0); 4423 Dst = MI.getOperand(0).getReg(); 4424 B.setInsertPt(B.getMBB(), MI); 4425 } 4426 4427 // FIXME: We don't really need this intermediate instruction. The intrinsic 4428 // should be fixed to have a memory operand. Since it's readnone, we're not 4429 // allowed to add one. 4430 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4431 MI.RemoveOperand(1); // Remove intrinsic ID 4432 4433 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4434 // TODO: Should this use datalayout alignment? 4435 const unsigned MemSize = (Size + 7) / 8; 4436 const Align MemAlign(4); 4437 MachineMemOperand *MMO = MF.getMachineMemOperand( 4438 MachinePointerInfo(), 4439 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4440 MachineMemOperand::MOInvariant, 4441 MemSize, MemAlign); 4442 MI.addMemOperand(MF, MMO); 4443 4444 // There are no 96-bit result scalar loads, but widening to 128-bit should 4445 // always be legal. We may need to restore this to a 96-bit result if it turns 4446 // out this needs to be converted to a vector load during RegBankSelect. 4447 if (!isPowerOf2_32(Size)) { 4448 if (Ty.isVector()) 4449 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4450 else 4451 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4452 } 4453 4454 Observer.changedInstr(MI); 4455 return true; 4456 } 4457 4458 // TODO: Move to selection 4459 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4460 MachineRegisterInfo &MRI, 4461 MachineIRBuilder &B) const { 4462 // Is non-HSA path or trap-handler disabled? 
// If so, insert an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    MachineRegisterInfo &MRI = *B.getMRI();

    Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;

    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is the non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
                          : Is64 ?
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4526 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4527 4528 SmallVector<Register, 12> Ops; 4529 if (Is64) { 4530 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4531 Ops.push_back(Unmerge.getReg(0)); 4532 Ops.push_back(Unmerge.getReg(1)); 4533 } else { 4534 Ops.push_back(NodePtr); 4535 } 4536 Ops.push_back(RayExtent); 4537 4538 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4539 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4540 Ops.push_back(Unmerge.getReg(0)); 4541 Ops.push_back(Unmerge.getReg(1)); 4542 Ops.push_back(Unmerge.getReg(2)); 4543 }; 4544 4545 packLanes(RayOrigin); 4546 if (IsA16) { 4547 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4548 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4549 Register R1 = MRI.createGenericVirtualRegister(S32); 4550 Register R2 = MRI.createGenericVirtualRegister(S32); 4551 Register R3 = MRI.createGenericVirtualRegister(S32); 4552 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4553 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4554 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4555 Ops.push_back(R1); 4556 Ops.push_back(R2); 4557 Ops.push_back(R3); 4558 } else { 4559 packLanes(RayDir); 4560 packLanes(RayInvDir); 4561 } 4562 4563 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4564 .addDef(DstReg) 4565 .addImm(Opcode); 4566 4567 for (Register R : Ops) { 4568 MIB.addUse(R); 4569 } 4570 4571 MIB.addUse(TDescr) 4572 .addImm(IsA16 ? 1 : 0) 4573 .cloneMemRefs(MI); 4574 4575 MI.eraseFromParent(); 4576 return true; 4577 } 4578 4579 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4580 MachineInstr &MI) const { 4581 MachineIRBuilder &B = Helper.MIRBuilder; 4582 MachineRegisterInfo &MRI = *B.getMRI(); 4583 4584 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4585 auto IntrID = MI.getIntrinsicID(); 4586 switch (IntrID) { 4587 case Intrinsic::amdgcn_if: 4588 case Intrinsic::amdgcn_else: { 4589 MachineInstr *Br = nullptr; 4590 MachineBasicBlock *UncondBrTarget = nullptr; 4591 bool Negated = false; 4592 if (MachineInstr *BrCond = 4593 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4594 const SIRegisterInfo *TRI 4595 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4596 4597 Register Def = MI.getOperand(1).getReg(); 4598 Register Use = MI.getOperand(3).getReg(); 4599 4600 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4601 4602 if (Negated) 4603 std::swap(CondBrTarget, UncondBrTarget); 4604 4605 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4606 if (IntrID == Intrinsic::amdgcn_if) { 4607 B.buildInstr(AMDGPU::SI_IF) 4608 .addDef(Def) 4609 .addUse(Use) 4610 .addMBB(UncondBrTarget); 4611 } else { 4612 B.buildInstr(AMDGPU::SI_ELSE) 4613 .addDef(Def) 4614 .addUse(Use) 4615 .addMBB(UncondBrTarget); 4616 } 4617 4618 if (Br) { 4619 Br->getOperand(0).setMBB(CondBrTarget); 4620 } else { 4621 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4622 // since we're swapping branch targets it needs to be reinserted. 
4623 // FIXME: IRTranslator should probably not do this 4624 B.buildBr(*CondBrTarget); 4625 } 4626 4627 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4628 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4629 MI.eraseFromParent(); 4630 BrCond->eraseFromParent(); 4631 return true; 4632 } 4633 4634 return false; 4635 } 4636 case Intrinsic::amdgcn_loop: { 4637 MachineInstr *Br = nullptr; 4638 MachineBasicBlock *UncondBrTarget = nullptr; 4639 bool Negated = false; 4640 if (MachineInstr *BrCond = 4641 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4642 const SIRegisterInfo *TRI 4643 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4644 4645 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4646 Register Reg = MI.getOperand(2).getReg(); 4647 4648 if (Negated) 4649 std::swap(CondBrTarget, UncondBrTarget); 4650 4651 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4652 B.buildInstr(AMDGPU::SI_LOOP) 4653 .addUse(Reg) 4654 .addMBB(UncondBrTarget); 4655 4656 if (Br) 4657 Br->getOperand(0).setMBB(CondBrTarget); 4658 else 4659 B.buildBr(*CondBrTarget); 4660 4661 MI.eraseFromParent(); 4662 BrCond->eraseFromParent(); 4663 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4664 return true; 4665 } 4666 4667 return false; 4668 } 4669 case Intrinsic::amdgcn_kernarg_segment_ptr: 4670 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4671 // This only makes sense to call in a kernel, so just lower to null. 4672 B.buildConstant(MI.getOperand(0).getReg(), 0); 4673 MI.eraseFromParent(); 4674 return true; 4675 } 4676 4677 return legalizePreloadedArgIntrin( 4678 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4679 case Intrinsic::amdgcn_implicitarg_ptr: 4680 return legalizeImplicitArgPtr(MI, MRI, B); 4681 case Intrinsic::amdgcn_workitem_id_x: 4682 return legalizePreloadedArgIntrin(MI, MRI, B, 4683 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4684 case Intrinsic::amdgcn_workitem_id_y: 4685 return legalizePreloadedArgIntrin(MI, MRI, B, 4686 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4687 case Intrinsic::amdgcn_workitem_id_z: 4688 return legalizePreloadedArgIntrin(MI, MRI, B, 4689 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4690 case Intrinsic::amdgcn_workgroup_id_x: 4691 return legalizePreloadedArgIntrin(MI, MRI, B, 4692 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4693 case Intrinsic::amdgcn_workgroup_id_y: 4694 return legalizePreloadedArgIntrin(MI, MRI, B, 4695 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4696 case Intrinsic::amdgcn_workgroup_id_z: 4697 return legalizePreloadedArgIntrin(MI, MRI, B, 4698 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4699 case Intrinsic::amdgcn_dispatch_ptr: 4700 return legalizePreloadedArgIntrin(MI, MRI, B, 4701 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4702 case Intrinsic::amdgcn_queue_ptr: 4703 return legalizePreloadedArgIntrin(MI, MRI, B, 4704 AMDGPUFunctionArgInfo::QUEUE_PTR); 4705 case Intrinsic::amdgcn_implicit_buffer_ptr: 4706 return legalizePreloadedArgIntrin( 4707 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4708 case Intrinsic::amdgcn_dispatch_id: 4709 return legalizePreloadedArgIntrin(MI, MRI, B, 4710 AMDGPUFunctionArgInfo::DISPATCH_ID); 4711 case Intrinsic::amdgcn_fdiv_fast: 4712 return legalizeFDIVFastIntrin(MI, MRI, B); 4713 case Intrinsic::amdgcn_is_shared: 4714 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4715 case Intrinsic::amdgcn_is_private: 4716 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4717 case Intrinsic::amdgcn_wavefrontsize: { 4718 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4719 MI.eraseFromParent(); 4720 return true; 4721 } 4722 case Intrinsic::amdgcn_s_buffer_load: 4723 return legalizeSBufferLoad(Helper, MI); 4724 case Intrinsic::amdgcn_raw_buffer_store: 4725 case Intrinsic::amdgcn_struct_buffer_store: 4726 return legalizeBufferStore(MI, MRI, B, false, false); 4727 case Intrinsic::amdgcn_raw_buffer_store_format: 4728 case Intrinsic::amdgcn_struct_buffer_store_format: 4729 return legalizeBufferStore(MI, MRI, B, false, true); 4730 case Intrinsic::amdgcn_raw_tbuffer_store: 4731 case Intrinsic::amdgcn_struct_tbuffer_store: 4732 return legalizeBufferStore(MI, MRI, B, true, true); 4733 case Intrinsic::amdgcn_raw_buffer_load: 4734 case Intrinsic::amdgcn_struct_buffer_load: 4735 return legalizeBufferLoad(MI, MRI, B, false, false); 4736 case Intrinsic::amdgcn_raw_buffer_load_format: 4737 case Intrinsic::amdgcn_struct_buffer_load_format: 4738 return legalizeBufferLoad(MI, MRI, B, true, false); 4739 case Intrinsic::amdgcn_raw_tbuffer_load: 4740 case Intrinsic::amdgcn_struct_tbuffer_load: 4741 return legalizeBufferLoad(MI, MRI, B, true, true); 4742 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4743 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4744 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4745 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4746 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4747 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4748 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4749 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4750 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4751 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4752 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4753 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4754 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4755 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4756 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4757 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4758 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4759 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4760 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4761 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4762 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4763 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4764 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4765 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4766 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4767 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4768 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4769 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4770 return legalizeBufferAtomic(MI, B, IntrID); 4771 case Intrinsic::amdgcn_atomic_inc: 4772 return legalizeAtomicIncDec(MI, B, true); 4773 case Intrinsic::amdgcn_atomic_dec: 4774 return legalizeAtomicIncDec(MI, B, false); 4775 case Intrinsic::trap: 4776 return legalizeTrapIntrinsic(MI, MRI, B); 4777 case Intrinsic::debugtrap: 4778 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4779 case Intrinsic::amdgcn_rsq_clamp: 4780 return legalizeRsqClampIntrinsic(MI, MRI, B); 4781 case Intrinsic::amdgcn_ds_fadd: 4782 case Intrinsic::amdgcn_ds_fmin: 4783 case Intrinsic::amdgcn_ds_fmax: 4784 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4785 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4786 return legalizeBVHIntrinsic(MI, B); 4787 default: { 4788 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4789 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4790 return legalizeImageIntrinsic(MI, B, Helper.Observer, 
ImageDimIntr); 4791 return true; 4792 } 4793 } 4794 4795 return true; 4796 } 4797