1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUTargetMachine.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/ADT/ScopeExit.h" 21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24 #include "llvm/CodeGen/TargetOpcodes.h" 25 #include "llvm/CodeGen/ValueTypes.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Debug.h" 30 31 #define DEBUG_TYPE "amdgpu-legalinfo" 32 33 using namespace llvm; 34 using namespace LegalizeActions; 35 using namespace LegalizeMutations; 36 using namespace LegalityPredicates; 37 using namespace MIPatternMatch; 38 39 // Hack until load/store selection patterns support any tuple of legal types. 40 static cl::opt<bool> EnableNewLegality( 41 "amdgpu-global-isel-new-legality", 42 cl::desc("Use GlobalISel desired legality, rather than try to use" 43 "rules compatible with selection patterns"), 44 cl::init(false), 45 cl::ReallyHidden); 46 47 static constexpr unsigned MaxRegisterSize = 1024; 48 49 // Round the number of elements to the next power of two elements 50 static LLT getPow2VectorType(LLT Ty) { 51 unsigned NElts = Ty.getNumElements(); 52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53 return Ty.changeNumElements(Pow2NElts); 54 } 55 56 // Round the number of bits to the next power of two bits 57 static LLT getPow2ScalarType(LLT Ty) { 58 unsigned Bits = Ty.getSizeInBits(); 59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 60 return LLT::scalar(Pow2Bits); 61 } 62 63 /// \returs true if this is an odd sized vector which should widen by adding an 64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 65 /// excludes s1 vectors, which should always be scalarized. 66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 172 }; 173 } 174 175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 176 return [=](const LegalityQuery &Query) { 177 const LLT QueryTy = Query.Types[TypeIdx]; 178 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 179 }; 180 } 181 182 static bool isRegisterSize(unsigned Size) { 183 return Size % 32 == 0 && Size <= MaxRegisterSize; 184 } 185 186 static bool isRegisterVectorElementType(LLT EltTy) { 187 const int EltSize = EltTy.getSizeInBits(); 188 return EltSize == 16 || EltSize % 32 == 0; 189 } 190 191 static bool isRegisterVectorType(LLT Ty) { 192 const int EltSize = Ty.getElementType().getSizeInBits(); 193 return EltSize == 32 || EltSize == 64 || 194 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 195 EltSize == 128 || EltSize == 256; 196 } 197 198 static bool isRegisterType(LLT Ty) { 199 if (!isRegisterSize(Ty.getSizeInBits())) 200 return false; 201 202 if (Ty.isVector()) 203 return isRegisterVectorType(Ty); 204 205 return true; 206 } 207 208 // Any combination of 32 or 64-bit elements up the maximum register size, and 209 // multiples of v2s16. 210 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 211 return [=](const LegalityQuery &Query) { 212 return isRegisterType(Query.Types[TypeIdx]); 213 }; 214 } 215 216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 217 return [=](const LegalityQuery &Query) { 218 const LLT QueryTy = Query.Types[TypeIdx]; 219 if (!QueryTy.isVector()) 220 return false; 221 const LLT EltTy = QueryTy.getElementType(); 222 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 223 }; 224 } 225 226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 227 return [=](const LegalityQuery &Query) { 228 const LLT Ty = Query.Types[TypeIdx]; 229 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 230 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 231 }; 232 } 233 234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 235 // handle some operations by just promoting the register during 236 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 238 bool IsLoad) { 239 switch (AS) { 240 case AMDGPUAS::PRIVATE_ADDRESS: 241 // FIXME: Private element size. 242 return 32; 243 case AMDGPUAS::LOCAL_ADDRESS: 244 return ST.useDS128() ? 128 : 64; 245 case AMDGPUAS::GLOBAL_ADDRESS: 246 case AMDGPUAS::CONSTANT_ADDRESS: 247 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 248 // Treat constant and global as identical. SMRD loads are sometimes usable for 249 // global loads (ideally constant address space should be eliminated) 250 // depending on the context. Legality cannot be context dependent, but 251 // RegBankSelect can split the load as necessary depending on the pointer 252 // register bank/uniformity and if the memory is invariant or not written in a 253 // kernel. 254 return IsLoad ? 512 : 128; 255 default: 256 // Flat addresses may contextually need to be split to 32-bit parts if they 257 // may alias scratch depending on the subtarget. 258 return 128; 259 } 260 } 261 262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 263 const LegalityQuery &Query, 264 unsigned Opcode) { 265 const LLT Ty = Query.Types[0]; 266 267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 268 const bool IsLoad = Opcode != AMDGPU::G_STORE; 269 270 unsigned RegSize = Ty.getSizeInBits(); 271 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 273 unsigned AS = Query.Types[1].getAddressSpace(); 274 275 // All of these need to be custom lowered to cast the pointer operand. 276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 277 return false; 278 279 // TODO: We should be able to widen loads if the alignment is high enough, but 280 // we also need to modify the memory access size. 281 #if 0 282 // Accept widening loads based on alignment. 283 if (IsLoad && MemSize < Size) 284 MemSize = std::max(MemSize, Align); 285 #endif 286 287 // Only 1-byte and 2-byte to 32-bit extloads are valid. 288 if (MemSize != RegSize && RegSize != 32) 289 return false; 290 291 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 292 return false; 293 294 switch (MemSize) { 295 case 8: 296 case 16: 297 case 32: 298 case 64: 299 case 128: 300 break; 301 case 96: 302 if (!ST.hasDwordx3LoadStores()) 303 return false; 304 break; 305 case 256: 306 case 512: 307 // These may contextually need to be broken down. 308 break; 309 default: 310 return false; 311 } 312 313 assert(RegSize >= MemSize); 314 315 if (AlignBits < MemSize) { 316 const SITargetLowering *TLI = ST.getTargetLowering(); 317 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 318 Align(AlignBits / 8))) 319 return false; 320 } 321 322 return true; 323 } 324 325 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 326 // workaround this. Eventually it should ignore the type for loads and only care 327 // about the size. Return true in cases where we will workaround this for now by 328 // bitcasting. 329 static bool loadStoreBitcastWorkaround(const LLT Ty) { 330 if (EnableNewLegality) 331 return false; 332 333 const unsigned Size = Ty.getSizeInBits(); 334 if (Size <= 64) 335 return false; 336 if (!Ty.isVector()) 337 return true; 338 unsigned EltSize = Ty.getElementType().getSizeInBits(); 339 return EltSize != 32 && EltSize != 64; 340 } 341 342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 343 unsigned Opcode) { 344 const LLT Ty = Query.Types[0]; 345 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 346 !loadStoreBitcastWorkaround(Ty); 347 } 348 349 /// Return true if a load or store of the type should be lowered with a bitcast 350 /// to a different type. 351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 352 const unsigned MemSizeInBits) { 353 const unsigned Size = Ty.getSizeInBits(); 354 if (Size != MemSizeInBits) 355 return Size <= 32 && Ty.isVector(); 356 357 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 358 return true; 359 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 360 !isRegisterVectorElementType(Ty.getElementType()); 361 } 362 363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 364 const GCNTargetMachine &TM) 365 : ST(ST_) { 366 using namespace TargetOpcode; 367 368 auto GetAddrSpacePtr = [&TM](unsigned AS) { 369 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 370 }; 371 372 const LLT S1 = LLT::scalar(1); 373 const LLT S16 = LLT::scalar(16); 374 const LLT S32 = LLT::scalar(32); 375 const LLT S64 = LLT::scalar(64); 376 const LLT S128 = LLT::scalar(128); 377 const LLT S256 = LLT::scalar(256); 378 const LLT S512 = LLT::scalar(512); 379 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 380 381 const LLT V2S16 = LLT::vector(2, 16); 382 const LLT V4S16 = LLT::vector(4, 16); 383 384 const LLT V2S32 = LLT::vector(2, 32); 385 const LLT V3S32 = LLT::vector(3, 32); 386 const LLT V4S32 = LLT::vector(4, 32); 387 const LLT V5S32 = LLT::vector(5, 32); 388 const LLT V6S32 = LLT::vector(6, 32); 389 const LLT V7S32 = LLT::vector(7, 32); 390 const LLT V8S32 = LLT::vector(8, 32); 391 const LLT V9S32 = LLT::vector(9, 32); 392 const LLT V10S32 = LLT::vector(10, 32); 393 const LLT V11S32 = LLT::vector(11, 32); 394 const LLT V12S32 = LLT::vector(12, 32); 395 const LLT V13S32 = LLT::vector(13, 32); 396 const LLT V14S32 = LLT::vector(14, 32); 397 const LLT V15S32 = LLT::vector(15, 32); 398 const LLT V16S32 = LLT::vector(16, 32); 399 const LLT V32S32 = LLT::vector(32, 32); 400 401 const LLT V2S64 = LLT::vector(2, 64); 402 const LLT V3S64 = LLT::vector(3, 64); 403 const LLT V4S64 = LLT::vector(4, 64); 404 const LLT V5S64 = LLT::vector(5, 64); 405 const LLT V6S64 = LLT::vector(6, 64); 406 const LLT V7S64 = LLT::vector(7, 64); 407 const LLT V8S64 = LLT::vector(8, 64); 408 const LLT V16S64 = LLT::vector(16, 64); 409 410 std::initializer_list<LLT> AllS32Vectors = 411 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 412 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 413 std::initializer_list<LLT> AllS64Vectors = 414 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 415 416 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 417 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 418 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 419 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 420 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 421 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 422 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 423 424 const LLT CodePtr = FlatPtr; 425 426 const std::initializer_list<LLT> AddrSpaces64 = { 427 GlobalPtr, ConstantPtr, FlatPtr 428 }; 429 430 const std::initializer_list<LLT> AddrSpaces32 = { 431 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 432 }; 433 434 const std::initializer_list<LLT> FPTypesBase = { 435 S32, S64 436 }; 437 438 const std::initializer_list<LLT> FPTypes16 = { 439 S32, S64, S16 440 }; 441 442 const std::initializer_list<LLT> FPTypesPK16 = { 443 S32, S64, S16, V2S16 444 }; 445 446 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 447 448 setAction({G_BRCOND, S1}, Legal); // VCC branches 449 setAction({G_BRCOND, S32}, Legal); // SCC branches 450 451 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 452 // elements for v3s16 453 getActionDefinitionsBuilder(G_PHI) 454 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 455 .legalFor(AllS32Vectors) 456 .legalFor(AllS64Vectors) 457 .legalFor(AddrSpaces64) 458 .legalFor(AddrSpaces32) 459 .legalIf(isPointer(0)) 460 .clampScalar(0, S16, S256) 461 .widenScalarToNextPow2(0, 32) 462 .clampMaxNumElements(0, S32, 16) 463 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 464 .scalarize(0); 465 466 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 467 // Full set of gfx9 features. 468 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 469 .legalFor({S32, S16, V2S16}) 470 .clampScalar(0, S16, S32) 471 .clampMaxNumElements(0, S16, 2) 472 .scalarize(0) 473 .widenScalarToNextPow2(0, 32); 474 475 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 476 .legalFor({S32, S16, V2S16}) // Clamp modifier 477 .minScalarOrElt(0, S16) 478 .clampMaxNumElements(0, S16, 2) 479 .scalarize(0) 480 .widenScalarToNextPow2(0, 32) 481 .lower(); 482 } else if (ST.has16BitInsts()) { 483 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 484 .legalFor({S32, S16}) 485 .clampScalar(0, S16, S32) 486 .scalarize(0) 487 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 488 489 // Technically the saturating operations require clamp bit support, but this 490 // was introduced at the same time as 16-bit operations. 491 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 492 .legalFor({S32, S16}) // Clamp modifier 493 .minScalar(0, S16) 494 .scalarize(0) 495 .widenScalarToNextPow2(0, 16) 496 .lower(); 497 498 // We're just lowering this, but it helps get a better result to try to 499 // coerce to the desired type first. 500 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 501 .minScalar(0, S16) 502 .scalarize(0) 503 .lower(); 504 } else { 505 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 506 .legalFor({S32}) 507 .clampScalar(0, S32, S32) 508 .scalarize(0); 509 510 if (ST.hasIntClamp()) { 511 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 512 .legalFor({S32}) // Clamp modifier. 513 .scalarize(0) 514 .minScalarOrElt(0, S32) 515 .lower(); 516 } else { 517 // Clamp bit support was added in VI, along with 16-bit operations. 518 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 519 .minScalar(0, S32) 520 .scalarize(0) 521 .lower(); 522 } 523 524 // FIXME: DAG expansion gets better results. The widening uses the smaller 525 // range values and goes for the min/max lowering directly. 526 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 527 .minScalar(0, S32) 528 .scalarize(0) 529 .lower(); 530 } 531 532 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 533 .customFor({S32, S64}) 534 .clampScalar(0, S32, S64) 535 .widenScalarToNextPow2(0, 32) 536 .scalarize(0); 537 538 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 539 .legalFor({S32}) 540 .clampScalar(0, S32, S32) 541 .scalarize(0); 542 543 // Report legal for any types we can handle anywhere. For the cases only legal 544 // on the SALU, RegBankSelect will be able to re-legalize. 545 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 546 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 547 .clampScalar(0, S32, S64) 548 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 549 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 550 .widenScalarToNextPow2(0) 551 .scalarize(0); 552 553 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 554 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 555 .legalFor({{S32, S1}, {S32, S32}}) 556 .minScalar(0, S32) 557 // TODO: .scalarize(0) 558 .lower(); 559 560 getActionDefinitionsBuilder(G_BITCAST) 561 // Don't worry about the size constraint. 562 .legalIf(all(isRegisterType(0), isRegisterType(1))) 563 .lower(); 564 565 566 getActionDefinitionsBuilder(G_CONSTANT) 567 .legalFor({S1, S32, S64, S16, GlobalPtr, 568 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 569 .legalIf(isPointer(0)) 570 .clampScalar(0, S32, S64) 571 .widenScalarToNextPow2(0); 572 573 getActionDefinitionsBuilder(G_FCONSTANT) 574 .legalFor({S32, S64, S16}) 575 .clampScalar(0, S16, S64); 576 577 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 578 .legalIf(isRegisterType(0)) 579 // s1 and s16 are special cases because they have legal operations on 580 // them, but don't really occupy registers in the normal way. 581 .legalFor({S1, S16}) 582 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 583 .clampScalarOrElt(0, S32, MaxScalar) 584 .widenScalarToNextPow2(0, 32) 585 .clampMaxNumElements(0, S32, 16); 586 587 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 588 589 // If the amount is divergent, we have to do a wave reduction to get the 590 // maximum value, so this is expanded during RegBankSelect. 591 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 592 .legalFor({{PrivatePtr, S32}}); 593 594 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 595 .customIf(typeIsNot(0, PrivatePtr)); 596 597 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 598 599 auto &FPOpActions = getActionDefinitionsBuilder( 600 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 601 .legalFor({S32, S64}); 602 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 603 .customFor({S32, S64}); 604 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 605 .customFor({S32, S64}); 606 607 if (ST.has16BitInsts()) { 608 if (ST.hasVOP3PInsts()) 609 FPOpActions.legalFor({S16, V2S16}); 610 else 611 FPOpActions.legalFor({S16}); 612 613 TrigActions.customFor({S16}); 614 FDIVActions.customFor({S16}); 615 } 616 617 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 618 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 619 620 if (ST.hasVOP3PInsts()) { 621 MinNumMaxNum.customFor(FPTypesPK16) 622 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 623 .clampMaxNumElements(0, S16, 2) 624 .clampScalar(0, S16, S64) 625 .scalarize(0); 626 } else if (ST.has16BitInsts()) { 627 MinNumMaxNum.customFor(FPTypes16) 628 .clampScalar(0, S16, S64) 629 .scalarize(0); 630 } else { 631 MinNumMaxNum.customFor(FPTypesBase) 632 .clampScalar(0, S32, S64) 633 .scalarize(0); 634 } 635 636 if (ST.hasVOP3PInsts()) 637 FPOpActions.clampMaxNumElements(0, S16, 2); 638 639 FPOpActions 640 .scalarize(0) 641 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 642 643 TrigActions 644 .scalarize(0) 645 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 646 647 FDIVActions 648 .scalarize(0) 649 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 650 651 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 652 .legalFor(FPTypesPK16) 653 .clampMaxNumElements(0, S16, 2) 654 .scalarize(0) 655 .clampScalar(0, S16, S64); 656 657 if (ST.has16BitInsts()) { 658 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 659 .legalFor({S32, S64, S16}) 660 .scalarize(0) 661 .clampScalar(0, S16, S64); 662 } else { 663 getActionDefinitionsBuilder(G_FSQRT) 664 .legalFor({S32, S64}) 665 .scalarize(0) 666 .clampScalar(0, S32, S64); 667 668 if (ST.hasFractBug()) { 669 getActionDefinitionsBuilder(G_FFLOOR) 670 .customFor({S64}) 671 .legalFor({S32, S64}) 672 .scalarize(0) 673 .clampScalar(0, S32, S64); 674 } else { 675 getActionDefinitionsBuilder(G_FFLOOR) 676 .legalFor({S32, S64}) 677 .scalarize(0) 678 .clampScalar(0, S32, S64); 679 } 680 } 681 682 getActionDefinitionsBuilder(G_FPTRUNC) 683 .legalFor({{S32, S64}, {S16, S32}}) 684 .scalarize(0) 685 .lower(); 686 687 getActionDefinitionsBuilder(G_FPEXT) 688 .legalFor({{S64, S32}, {S32, S16}}) 689 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 690 .scalarize(0); 691 692 getActionDefinitionsBuilder(G_FSUB) 693 // Use actual fsub instruction 694 .legalFor({S32}) 695 // Must use fadd + fneg 696 .lowerFor({S64, S16, V2S16}) 697 .scalarize(0) 698 .clampScalar(0, S32, S64); 699 700 // Whether this is legal depends on the floating point mode for the function. 701 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 702 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 703 FMad.customFor({S32, S16}); 704 else if (ST.hasMadMacF32Insts()) 705 FMad.customFor({S32}); 706 else if (ST.hasMadF16()) 707 FMad.customFor({S16}); 708 FMad.scalarize(0) 709 .lower(); 710 711 auto &FRem = getActionDefinitionsBuilder(G_FREM); 712 if (ST.has16BitInsts()) { 713 FRem.customFor({S16, S32, S64}); 714 } else { 715 FRem.minScalar(0, S32) 716 .customFor({S32, S64}); 717 } 718 FRem.scalarize(0); 719 720 // TODO: Do we need to clamp maximum bitwidth? 721 getActionDefinitionsBuilder(G_TRUNC) 722 .legalIf(isScalar(0)) 723 .legalFor({{V2S16, V2S32}}) 724 .clampMaxNumElements(0, S16, 2) 725 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 726 // situations (like an invalid implicit use), we don't want to infinite loop 727 // in the legalizer. 728 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 729 .alwaysLegal(); 730 731 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 732 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 733 {S32, S1}, {S64, S1}, {S16, S1}}) 734 .scalarize(0) 735 .clampScalar(0, S32, S64) 736 .widenScalarToNextPow2(1, 32); 737 738 // TODO: Split s1->s64 during regbankselect for VALU. 739 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 740 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 741 .lowerFor({{S32, S64}}) 742 .lowerIf(typeIs(1, S1)) 743 .customFor({{S64, S64}}); 744 if (ST.has16BitInsts()) 745 IToFP.legalFor({{S16, S16}}); 746 IToFP.clampScalar(1, S32, S64) 747 .minScalar(0, S32) 748 .scalarize(0) 749 .widenScalarToNextPow2(1); 750 751 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 752 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 753 .customFor({{S64, S64}}) 754 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 755 if (ST.has16BitInsts()) 756 FPToI.legalFor({{S16, S16}}); 757 else 758 FPToI.minScalar(1, S32); 759 760 FPToI.minScalar(0, S32) 761 .scalarize(0) 762 .lower(); 763 764 // Lower roundeven into G_FRINT 765 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 766 .scalarize(0) 767 .lower(); 768 769 if (ST.has16BitInsts()) { 770 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 771 .legalFor({S16, S32, S64}) 772 .clampScalar(0, S16, S64) 773 .scalarize(0); 774 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 775 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 776 .legalFor({S32, S64}) 777 .clampScalar(0, S32, S64) 778 .scalarize(0); 779 } else { 780 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 781 .legalFor({S32}) 782 .customFor({S64}) 783 .clampScalar(0, S32, S64) 784 .scalarize(0); 785 } 786 787 getActionDefinitionsBuilder(G_PTR_ADD) 788 .legalIf(all(isPointer(0), sameSize(0, 1))) 789 .scalarize(0) 790 .scalarSameSizeAs(1, 0); 791 792 getActionDefinitionsBuilder(G_PTRMASK) 793 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 794 .scalarSameSizeAs(1, 0) 795 .scalarize(0); 796 797 auto &CmpBuilder = 798 getActionDefinitionsBuilder(G_ICMP) 799 // The compare output type differs based on the register bank of the output, 800 // so make both s1 and s32 legal. 801 // 802 // Scalar compares producing output in scc will be promoted to s32, as that 803 // is the allocatable register type that will be needed for the copy from 804 // scc. This will be promoted during RegBankSelect, and we assume something 805 // before that won't try to use s32 result types. 806 // 807 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 808 // bank. 809 .legalForCartesianProduct( 810 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 811 .legalForCartesianProduct( 812 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 813 if (ST.has16BitInsts()) { 814 CmpBuilder.legalFor({{S1, S16}}); 815 } 816 817 CmpBuilder 818 .widenScalarToNextPow2(1) 819 .clampScalar(1, S32, S64) 820 .scalarize(0) 821 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 822 823 getActionDefinitionsBuilder(G_FCMP) 824 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 825 .widenScalarToNextPow2(1) 826 .clampScalar(1, S32, S64) 827 .scalarize(0); 828 829 // FIXME: fpow has a selection pattern that should move to custom lowering. 830 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 831 if (ST.has16BitInsts()) 832 Exp2Ops.legalFor({S32, S16}); 833 else 834 Exp2Ops.legalFor({S32}); 835 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 836 Exp2Ops.scalarize(0); 837 838 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 839 if (ST.has16BitInsts()) 840 ExpOps.customFor({{S32}, {S16}}); 841 else 842 ExpOps.customFor({S32}); 843 ExpOps.clampScalar(0, MinScalarFPTy, S32) 844 .scalarize(0); 845 846 getActionDefinitionsBuilder(G_FPOWI) 847 .clampScalar(0, MinScalarFPTy, S32) 848 .lower(); 849 850 // The 64-bit versions produce 32-bit results, but only on the SALU. 851 getActionDefinitionsBuilder(G_CTPOP) 852 .legalFor({{S32, S32}, {S32, S64}}) 853 .clampScalar(0, S32, S32) 854 .clampScalar(1, S32, S64) 855 .scalarize(0) 856 .widenScalarToNextPow2(0, 32) 857 .widenScalarToNextPow2(1, 32); 858 859 // The hardware instructions return a different result on 0 than the generic 860 // instructions expect. The hardware produces -1, but these produce the 861 // bitwidth. 862 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 863 .scalarize(0) 864 .clampScalar(0, S32, S32) 865 .clampScalar(1, S32, S64) 866 .widenScalarToNextPow2(0, 32) 867 .widenScalarToNextPow2(1, 32) 868 .lower(); 869 870 // The 64-bit versions produce 32-bit results, but only on the SALU. 871 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 872 .legalFor({{S32, S32}, {S32, S64}}) 873 .clampScalar(0, S32, S32) 874 .clampScalar(1, S32, S64) 875 .scalarize(0) 876 .widenScalarToNextPow2(0, 32) 877 .widenScalarToNextPow2(1, 32); 878 879 getActionDefinitionsBuilder(G_BITREVERSE) 880 .legalFor({S32}) 881 .clampScalar(0, S32, S32) 882 .scalarize(0); 883 884 if (ST.has16BitInsts()) { 885 getActionDefinitionsBuilder(G_BSWAP) 886 .legalFor({S16, S32, V2S16}) 887 .clampMaxNumElements(0, S16, 2) 888 // FIXME: Fixing non-power-of-2 before clamp is workaround for 889 // narrowScalar limitation. 890 .widenScalarToNextPow2(0) 891 .clampScalar(0, S16, S32) 892 .scalarize(0); 893 894 if (ST.hasVOP3PInsts()) { 895 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 896 .legalFor({S32, S16, V2S16}) 897 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 898 .clampMaxNumElements(0, S16, 2) 899 .minScalar(0, S16) 900 .widenScalarToNextPow2(0) 901 .scalarize(0) 902 .lower(); 903 } else { 904 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 905 .legalFor({S32, S16}) 906 .widenScalarToNextPow2(0) 907 .minScalar(0, S16) 908 .scalarize(0) 909 .lower(); 910 } 911 } else { 912 // TODO: Should have same legality without v_perm_b32 913 getActionDefinitionsBuilder(G_BSWAP) 914 .legalFor({S32}) 915 .lowerIf(scalarNarrowerThan(0, 32)) 916 // FIXME: Fixing non-power-of-2 before clamp is workaround for 917 // narrowScalar limitation. 918 .widenScalarToNextPow2(0) 919 .maxScalar(0, S32) 920 .scalarize(0) 921 .lower(); 922 923 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 924 .legalFor({S32}) 925 .minScalar(0, S32) 926 .widenScalarToNextPow2(0) 927 .scalarize(0) 928 .lower(); 929 } 930 931 getActionDefinitionsBuilder(G_INTTOPTR) 932 // List the common cases 933 .legalForCartesianProduct(AddrSpaces64, {S64}) 934 .legalForCartesianProduct(AddrSpaces32, {S32}) 935 .scalarize(0) 936 // Accept any address space as long as the size matches 937 .legalIf(sameSize(0, 1)) 938 .widenScalarIf(smallerThan(1, 0), 939 [](const LegalityQuery &Query) { 940 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 941 }) 942 .narrowScalarIf(largerThan(1, 0), 943 [](const LegalityQuery &Query) { 944 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 945 }); 946 947 getActionDefinitionsBuilder(G_PTRTOINT) 948 // List the common cases 949 .legalForCartesianProduct(AddrSpaces64, {S64}) 950 .legalForCartesianProduct(AddrSpaces32, {S32}) 951 .scalarize(0) 952 // Accept any address space as long as the size matches 953 .legalIf(sameSize(0, 1)) 954 .widenScalarIf(smallerThan(0, 1), 955 [](const LegalityQuery &Query) { 956 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 957 }) 958 .narrowScalarIf( 959 largerThan(0, 1), 960 [](const LegalityQuery &Query) { 961 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 962 }); 963 964 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 965 .scalarize(0) 966 .custom(); 967 968 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 969 bool IsLoad) -> bool { 970 const LLT DstTy = Query.Types[0]; 971 972 // Split vector extloads. 973 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 974 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 975 976 if (MemSize < DstTy.getSizeInBits()) 977 MemSize = std::max(MemSize, AlignBits); 978 979 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 980 return true; 981 982 const LLT PtrTy = Query.Types[1]; 983 unsigned AS = PtrTy.getAddressSpace(); 984 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 985 return true; 986 987 // Catch weird sized loads that don't evenly divide into the access sizes 988 // TODO: May be able to widen depending on alignment etc. 989 unsigned NumRegs = (MemSize + 31) / 32; 990 if (NumRegs == 3) { 991 if (!ST.hasDwordx3LoadStores()) 992 return true; 993 } else { 994 // If the alignment allows, these should have been widened. 995 if (!isPowerOf2_32(NumRegs)) 996 return true; 997 } 998 999 if (AlignBits < MemSize) { 1000 const SITargetLowering *TLI = ST.getTargetLowering(); 1001 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1002 Align(AlignBits / 8)); 1003 } 1004 1005 return false; 1006 }; 1007 1008 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 1009 unsigned Opc) -> bool { 1010 unsigned Size = Query.Types[0].getSizeInBits(); 1011 if (isPowerOf2_32(Size)) 1012 return false; 1013 1014 if (Size == 96 && ST.hasDwordx3LoadStores()) 1015 return false; 1016 1017 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 1018 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 1019 return false; 1020 1021 unsigned Align = Query.MMODescrs[0].AlignInBits; 1022 unsigned RoundedSize = NextPowerOf2(Size); 1023 return (Align >= RoundedSize); 1024 }; 1025 1026 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 1027 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 1028 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 1029 1030 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1031 // LDS 1032 // TODO: Unsupported flat for SI. 1033 1034 for (unsigned Op : {G_LOAD, G_STORE}) { 1035 const bool IsStore = Op == G_STORE; 1036 1037 auto &Actions = getActionDefinitionsBuilder(Op); 1038 // Explicitly list some common cases. 1039 // TODO: Does this help compile time at all? 1040 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1041 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1042 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1043 {S64, GlobalPtr, 64, GlobalAlign32}, 1044 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1045 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1046 {S32, GlobalPtr, 8, GlobalAlign8}, 1047 {S32, GlobalPtr, 16, GlobalAlign16}, 1048 1049 {S32, LocalPtr, 32, 32}, 1050 {S64, LocalPtr, 64, 32}, 1051 {V2S32, LocalPtr, 64, 32}, 1052 {S32, LocalPtr, 8, 8}, 1053 {S32, LocalPtr, 16, 16}, 1054 {V2S16, LocalPtr, 32, 32}, 1055 1056 {S32, PrivatePtr, 32, 32}, 1057 {S32, PrivatePtr, 8, 8}, 1058 {S32, PrivatePtr, 16, 16}, 1059 {V2S16, PrivatePtr, 32, 32}, 1060 1061 {S32, ConstantPtr, 32, GlobalAlign32}, 1062 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1063 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1064 {S64, ConstantPtr, 64, GlobalAlign32}, 1065 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1066 Actions.legalIf( 1067 [=](const LegalityQuery &Query) -> bool { 1068 return isLoadStoreLegal(ST, Query, Op); 1069 }); 1070 1071 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1072 // 64-bits. 1073 // 1074 // TODO: Should generalize bitcast action into coerce, which will also cover 1075 // inserting addrspacecasts. 1076 Actions.customIf(typeIs(1, Constant32Ptr)); 1077 1078 // Turn any illegal element vectors into something easier to deal 1079 // with. These will ultimately produce 32-bit scalar shifts to extract the 1080 // parts anyway. 1081 // 1082 // For odd 16-bit element vectors, prefer to split those into pieces with 1083 // 16-bit vector parts. 1084 Actions.bitcastIf( 1085 [=](const LegalityQuery &Query) -> bool { 1086 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1087 Query.MMODescrs[0].SizeInBits); 1088 }, bitcastToRegisterType(0)); 1089 1090 Actions 1091 .customIf(typeIs(1, Constant32Ptr)) 1092 // Widen suitably aligned loads by loading extra elements. 1093 .moreElementsIf([=](const LegalityQuery &Query) { 1094 const LLT Ty = Query.Types[0]; 1095 return Op == G_LOAD && Ty.isVector() && 1096 shouldWidenLoadResult(Query, Op); 1097 }, moreElementsToNextPow2(0)) 1098 .widenScalarIf([=](const LegalityQuery &Query) { 1099 const LLT Ty = Query.Types[0]; 1100 return Op == G_LOAD && !Ty.isVector() && 1101 shouldWidenLoadResult(Query, Op); 1102 }, widenScalarOrEltToNextPow2(0)) 1103 .narrowScalarIf( 1104 [=](const LegalityQuery &Query) -> bool { 1105 return !Query.Types[0].isVector() && 1106 needToSplitMemOp(Query, Op == G_LOAD); 1107 }, 1108 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1109 const LLT DstTy = Query.Types[0]; 1110 const LLT PtrTy = Query.Types[1]; 1111 1112 const unsigned DstSize = DstTy.getSizeInBits(); 1113 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1114 1115 // Split extloads. 1116 if (DstSize > MemSize) 1117 return std::make_pair(0, LLT::scalar(MemSize)); 1118 1119 if (!isPowerOf2_32(DstSize)) { 1120 // We're probably decomposing an odd sized store. Try to split 1121 // to the widest type. TODO: Account for alignment. As-is it 1122 // should be OK, since the new parts will be further legalized. 1123 unsigned FloorSize = PowerOf2Floor(DstSize); 1124 return std::make_pair(0, LLT::scalar(FloorSize)); 1125 } 1126 1127 if (DstSize > 32 && (DstSize % 32 != 0)) { 1128 // FIXME: Need a way to specify non-extload of larger size if 1129 // suitably aligned. 1130 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1131 } 1132 1133 unsigned MaxSize = maxSizeForAddrSpace(ST, 1134 PtrTy.getAddressSpace(), 1135 Op == G_LOAD); 1136 if (MemSize > MaxSize) 1137 return std::make_pair(0, LLT::scalar(MaxSize)); 1138 1139 unsigned Align = Query.MMODescrs[0].AlignInBits; 1140 return std::make_pair(0, LLT::scalar(Align)); 1141 }) 1142 .fewerElementsIf( 1143 [=](const LegalityQuery &Query) -> bool { 1144 return Query.Types[0].isVector() && 1145 needToSplitMemOp(Query, Op == G_LOAD); 1146 }, 1147 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1148 const LLT DstTy = Query.Types[0]; 1149 const LLT PtrTy = Query.Types[1]; 1150 1151 LLT EltTy = DstTy.getElementType(); 1152 unsigned MaxSize = maxSizeForAddrSpace(ST, 1153 PtrTy.getAddressSpace(), 1154 Op == G_LOAD); 1155 1156 // FIXME: Handle widened to power of 2 results better. This ends 1157 // up scalarizing. 1158 // FIXME: 3 element stores scalarized on SI 1159 1160 // Split if it's too large for the address space. 1161 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1162 unsigned NumElts = DstTy.getNumElements(); 1163 unsigned EltSize = EltTy.getSizeInBits(); 1164 1165 if (MaxSize % EltSize == 0) { 1166 return std::make_pair( 1167 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1168 } 1169 1170 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1171 1172 // FIXME: Refine when odd breakdowns handled 1173 // The scalars will need to be re-legalized. 1174 if (NumPieces == 1 || NumPieces >= NumElts || 1175 NumElts % NumPieces != 0) 1176 return std::make_pair(0, EltTy); 1177 1178 return std::make_pair(0, 1179 LLT::vector(NumElts / NumPieces, EltTy)); 1180 } 1181 1182 // FIXME: We could probably handle weird extending loads better. 1183 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1184 if (DstTy.getSizeInBits() > MemSize) 1185 return std::make_pair(0, EltTy); 1186 1187 unsigned EltSize = EltTy.getSizeInBits(); 1188 unsigned DstSize = DstTy.getSizeInBits(); 1189 if (!isPowerOf2_32(DstSize)) { 1190 // We're probably decomposing an odd sized store. Try to split 1191 // to the widest type. TODO: Account for alignment. As-is it 1192 // should be OK, since the new parts will be further legalized. 1193 unsigned FloorSize = PowerOf2Floor(DstSize); 1194 return std::make_pair( 1195 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1196 } 1197 1198 // Need to split because of alignment. 1199 unsigned Align = Query.MMODescrs[0].AlignInBits; 1200 if (EltSize > Align && 1201 (EltSize / Align < DstTy.getNumElements())) { 1202 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1203 } 1204 1205 // May need relegalization for the scalars. 1206 return std::make_pair(0, EltTy); 1207 }) 1208 .minScalar(0, S32); 1209 1210 if (IsStore) 1211 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1212 1213 // TODO: Need a bitcast lower option? 1214 Actions 1215 .widenScalarToNextPow2(0) 1216 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1217 } 1218 1219 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1220 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1221 {S32, GlobalPtr, 16, 2 * 8}, 1222 {S32, LocalPtr, 8, 8}, 1223 {S32, LocalPtr, 16, 16}, 1224 {S32, PrivatePtr, 8, 8}, 1225 {S32, PrivatePtr, 16, 16}, 1226 {S32, ConstantPtr, 8, 8}, 1227 {S32, ConstantPtr, 16, 2 * 8}}); 1228 if (ST.hasFlatAddressSpace()) { 1229 ExtLoads.legalForTypesWithMemDesc( 1230 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1231 } 1232 1233 ExtLoads.clampScalar(0, S32, S32) 1234 .widenScalarToNextPow2(0) 1235 .unsupportedIfMemSizeNotPow2() 1236 .lower(); 1237 1238 auto &Atomics = getActionDefinitionsBuilder( 1239 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1240 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1241 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1242 G_ATOMICRMW_UMIN}) 1243 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1244 {S64, GlobalPtr}, {S64, LocalPtr}, 1245 {S32, RegionPtr}, {S64, RegionPtr}}); 1246 if (ST.hasFlatAddressSpace()) { 1247 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1248 } 1249 1250 if (ST.hasLDSFPAtomics()) { 1251 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1252 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1253 } 1254 1255 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1256 // demarshalling 1257 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1258 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1259 {S32, FlatPtr}, {S64, FlatPtr}}) 1260 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1261 {S32, RegionPtr}, {S64, RegionPtr}}); 1262 // TODO: Pointer types, any 32-bit or 64-bit vector 1263 1264 // Condition should be s32 for scalar, s1 for vector. 1265 getActionDefinitionsBuilder(G_SELECT) 1266 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1267 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1268 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1269 .clampScalar(0, S16, S64) 1270 .scalarize(1) 1271 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1272 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1273 .clampMaxNumElements(0, S32, 2) 1274 .clampMaxNumElements(0, LocalPtr, 2) 1275 .clampMaxNumElements(0, PrivatePtr, 2) 1276 .scalarize(0) 1277 .widenScalarToNextPow2(0) 1278 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1279 1280 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1281 // be more flexible with the shift amount type. 1282 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1283 .legalFor({{S32, S32}, {S64, S32}}); 1284 if (ST.has16BitInsts()) { 1285 if (ST.hasVOP3PInsts()) { 1286 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1287 .clampMaxNumElements(0, S16, 2); 1288 } else 1289 Shifts.legalFor({{S16, S16}}); 1290 1291 // TODO: Support 16-bit shift amounts for all types 1292 Shifts.widenScalarIf( 1293 [=](const LegalityQuery &Query) { 1294 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1295 // 32-bit amount. 1296 const LLT ValTy = Query.Types[0]; 1297 const LLT AmountTy = Query.Types[1]; 1298 return ValTy.getSizeInBits() <= 16 && 1299 AmountTy.getSizeInBits() < 16; 1300 }, changeTo(1, S16)); 1301 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1302 Shifts.clampScalar(1, S32, S32); 1303 Shifts.clampScalar(0, S16, S64); 1304 Shifts.widenScalarToNextPow2(0, 16); 1305 1306 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1307 .minScalar(0, S16) 1308 .scalarize(0) 1309 .lower(); 1310 } else { 1311 // Make sure we legalize the shift amount type first, as the general 1312 // expansion for the shifted type will produce much worse code if it hasn't 1313 // been truncated already. 1314 Shifts.clampScalar(1, S32, S32); 1315 Shifts.clampScalar(0, S32, S64); 1316 Shifts.widenScalarToNextPow2(0, 32); 1317 1318 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1319 .minScalar(0, S32) 1320 .scalarize(0) 1321 .lower(); 1322 } 1323 Shifts.scalarize(0); 1324 1325 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1326 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1327 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1328 unsigned IdxTypeIdx = 2; 1329 1330 getActionDefinitionsBuilder(Op) 1331 .customIf([=](const LegalityQuery &Query) { 1332 const LLT EltTy = Query.Types[EltTypeIdx]; 1333 const LLT VecTy = Query.Types[VecTypeIdx]; 1334 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1335 const unsigned EltSize = EltTy.getSizeInBits(); 1336 return (EltSize == 32 || EltSize == 64) && 1337 VecTy.getSizeInBits() % 32 == 0 && 1338 VecTy.getSizeInBits() <= MaxRegisterSize && 1339 IdxTy.getSizeInBits() == 32; 1340 }) 1341 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1342 bitcastToVectorElement32(VecTypeIdx)) 1343 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1344 .bitcastIf( 1345 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1346 [=](const LegalityQuery &Query) { 1347 // For > 64-bit element types, try to turn this into a 64-bit 1348 // element vector since we may be able to do better indexing 1349 // if this is scalar. If not, fall back to 32. 1350 const LLT EltTy = Query.Types[EltTypeIdx]; 1351 const LLT VecTy = Query.Types[VecTypeIdx]; 1352 const unsigned DstEltSize = EltTy.getSizeInBits(); 1353 const unsigned VecSize = VecTy.getSizeInBits(); 1354 1355 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1356 return std::make_pair( 1357 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1358 }) 1359 .clampScalar(EltTypeIdx, S32, S64) 1360 .clampScalar(VecTypeIdx, S32, S64) 1361 .clampScalar(IdxTypeIdx, S32, S32) 1362 .clampMaxNumElements(VecTypeIdx, S32, 32) 1363 // TODO: Clamp elements for 64-bit vectors? 1364 // It should only be necessary with variable indexes. 1365 // As a last resort, lower to the stack 1366 .lower(); 1367 } 1368 1369 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1370 .unsupportedIf([=](const LegalityQuery &Query) { 1371 const LLT &EltTy = Query.Types[1].getElementType(); 1372 return Query.Types[0] != EltTy; 1373 }); 1374 1375 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1376 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1377 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1378 1379 // FIXME: Doesn't handle extract of illegal sizes. 1380 getActionDefinitionsBuilder(Op) 1381 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1382 // FIXME: Multiples of 16 should not be legal. 1383 .legalIf([=](const LegalityQuery &Query) { 1384 const LLT BigTy = Query.Types[BigTyIdx]; 1385 const LLT LitTy = Query.Types[LitTyIdx]; 1386 return (BigTy.getSizeInBits() % 32 == 0) && 1387 (LitTy.getSizeInBits() % 16 == 0); 1388 }) 1389 .widenScalarIf( 1390 [=](const LegalityQuery &Query) { 1391 const LLT BigTy = Query.Types[BigTyIdx]; 1392 return (BigTy.getScalarSizeInBits() < 16); 1393 }, 1394 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1395 .widenScalarIf( 1396 [=](const LegalityQuery &Query) { 1397 const LLT LitTy = Query.Types[LitTyIdx]; 1398 return (LitTy.getScalarSizeInBits() < 16); 1399 }, 1400 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1401 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1402 .widenScalarToNextPow2(BigTyIdx, 32); 1403 1404 } 1405 1406 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1407 .legalForCartesianProduct(AllS32Vectors, {S32}) 1408 .legalForCartesianProduct(AllS64Vectors, {S64}) 1409 .clampNumElements(0, V16S32, V32S32) 1410 .clampNumElements(0, V2S64, V16S64) 1411 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1412 1413 if (ST.hasScalarPackInsts()) { 1414 BuildVector 1415 // FIXME: Should probably widen s1 vectors straight to s32 1416 .minScalarOrElt(0, S16) 1417 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1418 .minScalar(1, S32); 1419 1420 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1421 .legalFor({V2S16, S32}) 1422 .lower(); 1423 BuildVector.minScalarOrElt(0, S32); 1424 } else { 1425 BuildVector.customFor({V2S16, S16}); 1426 BuildVector.minScalarOrElt(0, S32); 1427 1428 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1429 .customFor({V2S16, S32}) 1430 .lower(); 1431 } 1432 1433 BuildVector.legalIf(isRegisterType(0)); 1434 1435 // FIXME: Clamp maximum size 1436 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1437 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1438 .clampMaxNumElements(0, S32, 32) 1439 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1440 .clampMaxNumElements(0, S16, 64); 1441 1442 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1443 // pre-legalize. 1444 if (ST.hasVOP3PInsts()) { 1445 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1446 .customFor({V2S16, V2S16}) 1447 .lower(); 1448 } else 1449 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1450 1451 // Merge/Unmerge 1452 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1453 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1454 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1455 1456 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1457 const LLT Ty = Query.Types[TypeIdx]; 1458 if (Ty.isVector()) { 1459 const LLT &EltTy = Ty.getElementType(); 1460 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1461 return true; 1462 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1463 return true; 1464 } 1465 return false; 1466 }; 1467 1468 auto &Builder = getActionDefinitionsBuilder(Op) 1469 .lowerFor({{S16, V2S16}}) 1470 .lowerIf([=](const LegalityQuery &Query) { 1471 const LLT BigTy = Query.Types[BigTyIdx]; 1472 return BigTy.getSizeInBits() == 32; 1473 }) 1474 // Try to widen to s16 first for small types. 1475 // TODO: Only do this on targets with legal s16 shifts 1476 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1477 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1478 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1479 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1480 elementTypeIs(1, S16)), 1481 changeTo(1, V2S16)) 1482 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1483 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1484 // valid. 1485 .clampScalar(LitTyIdx, S32, S512) 1486 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1487 // Break up vectors with weird elements into scalars 1488 .fewerElementsIf( 1489 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1490 scalarize(0)) 1491 .fewerElementsIf( 1492 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1493 scalarize(1)) 1494 .clampScalar(BigTyIdx, S32, MaxScalar); 1495 1496 if (Op == G_MERGE_VALUES) { 1497 Builder.widenScalarIf( 1498 // TODO: Use 16-bit shifts if legal for 8-bit values? 1499 [=](const LegalityQuery &Query) { 1500 const LLT Ty = Query.Types[LitTyIdx]; 1501 return Ty.getSizeInBits() < 32; 1502 }, 1503 changeTo(LitTyIdx, S32)); 1504 } 1505 1506 Builder.widenScalarIf( 1507 [=](const LegalityQuery &Query) { 1508 const LLT Ty = Query.Types[BigTyIdx]; 1509 return !isPowerOf2_32(Ty.getSizeInBits()) && 1510 Ty.getSizeInBits() % 16 != 0; 1511 }, 1512 [=](const LegalityQuery &Query) { 1513 // Pick the next power of 2, or a multiple of 64 over 128. 1514 // Whichever is smaller. 1515 const LLT &Ty = Query.Types[BigTyIdx]; 1516 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1517 if (NewSizeInBits >= 256) { 1518 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1519 if (RoundedTo < NewSizeInBits) 1520 NewSizeInBits = RoundedTo; 1521 } 1522 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1523 }) 1524 .legalIf([=](const LegalityQuery &Query) { 1525 const LLT &BigTy = Query.Types[BigTyIdx]; 1526 const LLT &LitTy = Query.Types[LitTyIdx]; 1527 1528 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1529 return false; 1530 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1531 return false; 1532 1533 return BigTy.getSizeInBits() % 16 == 0 && 1534 LitTy.getSizeInBits() % 16 == 0 && 1535 BigTy.getSizeInBits() <= MaxRegisterSize; 1536 }) 1537 // Any vectors left are the wrong size. Scalarize them. 1538 .scalarize(0) 1539 .scalarize(1); 1540 } 1541 1542 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1543 // RegBankSelect. 1544 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1545 .legalFor({{S32}, {S64}}); 1546 1547 if (ST.hasVOP3PInsts()) { 1548 SextInReg.lowerFor({{V2S16}}) 1549 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1550 // get more vector shift opportunities, since we'll get those when 1551 // expanded. 1552 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1553 } else if (ST.has16BitInsts()) { 1554 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1555 } else { 1556 // Prefer to promote to s32 before lowering if we don't have 16-bit 1557 // shifts. This avoid a lot of intermediate truncate and extend operations. 1558 SextInReg.lowerFor({{S32}, {S64}}); 1559 } 1560 1561 SextInReg 1562 .scalarize(0) 1563 .clampScalar(0, S32, S64) 1564 .lower(); 1565 1566 getActionDefinitionsBuilder(G_FSHR) 1567 .legalFor({{S32, S32}}) 1568 .scalarize(0) 1569 .lower(); 1570 1571 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1572 .legalFor({S64}); 1573 1574 getActionDefinitionsBuilder(G_FENCE) 1575 .alwaysLegal(); 1576 1577 getActionDefinitionsBuilder({ 1578 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1579 G_FCOPYSIGN, 1580 1581 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1582 G_ATOMICRMW_NAND, 1583 G_ATOMICRMW_FSUB, 1584 G_READ_REGISTER, 1585 G_WRITE_REGISTER, 1586 1587 G_SADDO, G_SSUBO, 1588 1589 // TODO: Implement 1590 G_FMINIMUM, G_FMAXIMUM, 1591 G_FSHL 1592 }).lower(); 1593 1594 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1595 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1596 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1597 .unsupported(); 1598 1599 computeTables(); 1600 verify(*ST.getInstrInfo()); 1601 } 1602 1603 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1604 MachineInstr &MI) const { 1605 MachineIRBuilder &B = Helper.MIRBuilder; 1606 MachineRegisterInfo &MRI = *B.getMRI(); 1607 1608 switch (MI.getOpcode()) { 1609 case TargetOpcode::G_ADDRSPACE_CAST: 1610 return legalizeAddrSpaceCast(MI, MRI, B); 1611 case TargetOpcode::G_FRINT: 1612 return legalizeFrint(MI, MRI, B); 1613 case TargetOpcode::G_FCEIL: 1614 return legalizeFceil(MI, MRI, B); 1615 case TargetOpcode::G_FREM: 1616 return legalizeFrem(MI, MRI, B); 1617 case TargetOpcode::G_INTRINSIC_TRUNC: 1618 return legalizeIntrinsicTrunc(MI, MRI, B); 1619 case TargetOpcode::G_SITOFP: 1620 return legalizeITOFP(MI, MRI, B, true); 1621 case TargetOpcode::G_UITOFP: 1622 return legalizeITOFP(MI, MRI, B, false); 1623 case TargetOpcode::G_FPTOSI: 1624 return legalizeFPTOI(MI, MRI, B, true); 1625 case TargetOpcode::G_FPTOUI: 1626 return legalizeFPTOI(MI, MRI, B, false); 1627 case TargetOpcode::G_FMINNUM: 1628 case TargetOpcode::G_FMAXNUM: 1629 case TargetOpcode::G_FMINNUM_IEEE: 1630 case TargetOpcode::G_FMAXNUM_IEEE: 1631 return legalizeMinNumMaxNum(Helper, MI); 1632 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1633 return legalizeExtractVectorElt(MI, MRI, B); 1634 case TargetOpcode::G_INSERT_VECTOR_ELT: 1635 return legalizeInsertVectorElt(MI, MRI, B); 1636 case TargetOpcode::G_SHUFFLE_VECTOR: 1637 return legalizeShuffleVector(MI, MRI, B); 1638 case TargetOpcode::G_FSIN: 1639 case TargetOpcode::G_FCOS: 1640 return legalizeSinCos(MI, MRI, B); 1641 case TargetOpcode::G_GLOBAL_VALUE: 1642 return legalizeGlobalValue(MI, MRI, B); 1643 case TargetOpcode::G_LOAD: 1644 return legalizeLoad(Helper, MI); 1645 case TargetOpcode::G_FMAD: 1646 return legalizeFMad(MI, MRI, B); 1647 case TargetOpcode::G_FDIV: 1648 return legalizeFDIV(MI, MRI, B); 1649 case TargetOpcode::G_UDIV: 1650 case TargetOpcode::G_UREM: 1651 return legalizeUDIV_UREM(MI, MRI, B); 1652 case TargetOpcode::G_SDIV: 1653 case TargetOpcode::G_SREM: 1654 return legalizeSDIV_SREM(MI, MRI, B); 1655 case TargetOpcode::G_ATOMIC_CMPXCHG: 1656 return legalizeAtomicCmpXChg(MI, MRI, B); 1657 case TargetOpcode::G_FLOG: 1658 return legalizeFlog(MI, B, numbers::ln2f); 1659 case TargetOpcode::G_FLOG10: 1660 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1661 case TargetOpcode::G_FEXP: 1662 return legalizeFExp(MI, B); 1663 case TargetOpcode::G_FPOW: 1664 return legalizeFPow(MI, B); 1665 case TargetOpcode::G_FFLOOR: 1666 return legalizeFFloor(MI, MRI, B); 1667 case TargetOpcode::G_BUILD_VECTOR: 1668 return legalizeBuildVector(MI, MRI, B); 1669 default: 1670 return false; 1671 } 1672 1673 llvm_unreachable("expected switch to return"); 1674 } 1675 1676 Register AMDGPULegalizerInfo::getSegmentAperture( 1677 unsigned AS, 1678 MachineRegisterInfo &MRI, 1679 MachineIRBuilder &B) const { 1680 MachineFunction &MF = B.getMF(); 1681 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1682 const LLT S32 = LLT::scalar(32); 1683 1684 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1685 1686 if (ST.hasApertureRegs()) { 1687 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1688 // getreg. 1689 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1690 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1691 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1692 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1693 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1694 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1695 unsigned Encoding = 1696 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1697 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1698 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1699 1700 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1701 1702 B.buildInstr(AMDGPU::S_GETREG_B32) 1703 .addDef(GetReg) 1704 .addImm(Encoding); 1705 MRI.setType(GetReg, S32); 1706 1707 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1708 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1709 } 1710 1711 Register QueuePtr = MRI.createGenericVirtualRegister( 1712 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1713 1714 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1715 return Register(); 1716 1717 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1718 // private_segment_aperture_base_hi. 1719 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1720 1721 // TODO: can we be smarter about machine pointer info? 1722 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1723 MachineMemOperand *MMO = MF.getMachineMemOperand( 1724 PtrInfo, 1725 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1726 MachineMemOperand::MOInvariant, 1727 4, commonAlignment(Align(64), StructOffset)); 1728 1729 Register LoadAddr; 1730 1731 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1732 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1733 } 1734 1735 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1736 MachineInstr &MI, MachineRegisterInfo &MRI, 1737 MachineIRBuilder &B) const { 1738 MachineFunction &MF = B.getMF(); 1739 1740 const LLT S32 = LLT::scalar(32); 1741 Register Dst = MI.getOperand(0).getReg(); 1742 Register Src = MI.getOperand(1).getReg(); 1743 1744 LLT DstTy = MRI.getType(Dst); 1745 LLT SrcTy = MRI.getType(Src); 1746 unsigned DestAS = DstTy.getAddressSpace(); 1747 unsigned SrcAS = SrcTy.getAddressSpace(); 1748 1749 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1750 // vector element. 1751 assert(!DstTy.isVector()); 1752 1753 const AMDGPUTargetMachine &TM 1754 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1755 1756 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1757 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1758 return true; 1759 } 1760 1761 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1762 // Truncate. 1763 B.buildExtract(Dst, Src, 0); 1764 MI.eraseFromParent(); 1765 return true; 1766 } 1767 1768 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1769 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1770 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1771 1772 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1773 // another. Merge operands are required to be the same type, but creating an 1774 // extra ptrtoint would be kind of pointless. 1775 auto HighAddr = B.buildConstant( 1776 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1777 B.buildMerge(Dst, {Src, HighAddr}); 1778 MI.eraseFromParent(); 1779 return true; 1780 } 1781 1782 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1783 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1784 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1785 unsigned NullVal = TM.getNullPointerValue(DestAS); 1786 1787 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1788 auto FlatNull = B.buildConstant(SrcTy, 0); 1789 1790 // Extract low 32-bits of the pointer. 1791 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1792 1793 auto CmpRes = 1794 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1795 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1796 1797 MI.eraseFromParent(); 1798 return true; 1799 } 1800 1801 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1802 return false; 1803 1804 if (!ST.hasFlatAddressSpace()) 1805 return false; 1806 1807 auto SegmentNull = 1808 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1809 auto FlatNull = 1810 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1811 1812 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1813 if (!ApertureReg.isValid()) 1814 return false; 1815 1816 auto CmpRes = 1817 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1818 1819 // Coerce the type of the low half of the result so we can use merge_values. 1820 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1821 1822 // TODO: Should we allow mismatched types but matching sizes in merges to 1823 // avoid the ptrtoint? 1824 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1825 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1826 1827 MI.eraseFromParent(); 1828 return true; 1829 } 1830 1831 bool AMDGPULegalizerInfo::legalizeFrint( 1832 MachineInstr &MI, MachineRegisterInfo &MRI, 1833 MachineIRBuilder &B) const { 1834 Register Src = MI.getOperand(1).getReg(); 1835 LLT Ty = MRI.getType(Src); 1836 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1837 1838 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1839 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1840 1841 auto C1 = B.buildFConstant(Ty, C1Val); 1842 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1843 1844 // TODO: Should this propagate fast-math-flags? 1845 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1846 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1847 1848 auto C2 = B.buildFConstant(Ty, C2Val); 1849 auto Fabs = B.buildFAbs(Ty, Src); 1850 1851 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1852 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1853 MI.eraseFromParent(); 1854 return true; 1855 } 1856 1857 bool AMDGPULegalizerInfo::legalizeFceil( 1858 MachineInstr &MI, MachineRegisterInfo &MRI, 1859 MachineIRBuilder &B) const { 1860 1861 const LLT S1 = LLT::scalar(1); 1862 const LLT S64 = LLT::scalar(64); 1863 1864 Register Src = MI.getOperand(1).getReg(); 1865 assert(MRI.getType(Src) == S64); 1866 1867 // result = trunc(src) 1868 // if (src > 0.0 && src != result) 1869 // result += 1.0 1870 1871 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1872 1873 const auto Zero = B.buildFConstant(S64, 0.0); 1874 const auto One = B.buildFConstant(S64, 1.0); 1875 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1876 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1877 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1878 auto Add = B.buildSelect(S64, And, One, Zero); 1879 1880 // TODO: Should this propagate fast-math-flags? 1881 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1882 return true; 1883 } 1884 1885 bool AMDGPULegalizerInfo::legalizeFrem( 1886 MachineInstr &MI, MachineRegisterInfo &MRI, 1887 MachineIRBuilder &B) const { 1888 Register DstReg = MI.getOperand(0).getReg(); 1889 Register Src0Reg = MI.getOperand(1).getReg(); 1890 Register Src1Reg = MI.getOperand(2).getReg(); 1891 auto Flags = MI.getFlags(); 1892 LLT Ty = MRI.getType(DstReg); 1893 1894 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1895 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1896 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1897 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1898 MI.eraseFromParent(); 1899 return true; 1900 } 1901 1902 static MachineInstrBuilder extractF64Exponent(Register Hi, 1903 MachineIRBuilder &B) { 1904 const unsigned FractBits = 52; 1905 const unsigned ExpBits = 11; 1906 LLT S32 = LLT::scalar(32); 1907 1908 auto Const0 = B.buildConstant(S32, FractBits - 32); 1909 auto Const1 = B.buildConstant(S32, ExpBits); 1910 1911 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1912 .addUse(Hi) 1913 .addUse(Const0.getReg(0)) 1914 .addUse(Const1.getReg(0)); 1915 1916 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1917 } 1918 1919 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1920 MachineInstr &MI, MachineRegisterInfo &MRI, 1921 MachineIRBuilder &B) const { 1922 const LLT S1 = LLT::scalar(1); 1923 const LLT S32 = LLT::scalar(32); 1924 const LLT S64 = LLT::scalar(64); 1925 1926 Register Src = MI.getOperand(1).getReg(); 1927 assert(MRI.getType(Src) == S64); 1928 1929 // TODO: Should this use extract since the low half is unused? 1930 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1931 Register Hi = Unmerge.getReg(1); 1932 1933 // Extract the upper half, since this is where we will find the sign and 1934 // exponent. 1935 auto Exp = extractF64Exponent(Hi, B); 1936 1937 const unsigned FractBits = 52; 1938 1939 // Extract the sign bit. 1940 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1941 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1942 1943 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1944 1945 const auto Zero32 = B.buildConstant(S32, 0); 1946 1947 // Extend back to 64-bits. 1948 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1949 1950 auto Shr = B.buildAShr(S64, FractMask, Exp); 1951 auto Not = B.buildNot(S64, Shr); 1952 auto Tmp0 = B.buildAnd(S64, Src, Not); 1953 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1954 1955 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1956 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1957 1958 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1959 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1960 MI.eraseFromParent(); 1961 return true; 1962 } 1963 1964 bool AMDGPULegalizerInfo::legalizeITOFP( 1965 MachineInstr &MI, MachineRegisterInfo &MRI, 1966 MachineIRBuilder &B, bool Signed) const { 1967 1968 Register Dst = MI.getOperand(0).getReg(); 1969 Register Src = MI.getOperand(1).getReg(); 1970 1971 const LLT S64 = LLT::scalar(64); 1972 const LLT S32 = LLT::scalar(32); 1973 1974 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1975 1976 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1977 1978 auto CvtHi = Signed ? 1979 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1980 B.buildUITOFP(S64, Unmerge.getReg(1)); 1981 1982 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1983 1984 auto ThirtyTwo = B.buildConstant(S32, 32); 1985 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1986 .addUse(CvtHi.getReg(0)) 1987 .addUse(ThirtyTwo.getReg(0)); 1988 1989 // TODO: Should this propagate fast-math-flags? 1990 B.buildFAdd(Dst, LdExp, CvtLo); 1991 MI.eraseFromParent(); 1992 return true; 1993 } 1994 1995 // TODO: Copied from DAG implementation. Verify logic and document how this 1996 // actually works. 1997 bool AMDGPULegalizerInfo::legalizeFPTOI( 1998 MachineInstr &MI, MachineRegisterInfo &MRI, 1999 MachineIRBuilder &B, bool Signed) const { 2000 2001 Register Dst = MI.getOperand(0).getReg(); 2002 Register Src = MI.getOperand(1).getReg(); 2003 2004 const LLT S64 = LLT::scalar(64); 2005 const LLT S32 = LLT::scalar(32); 2006 2007 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2008 2009 unsigned Flags = MI.getFlags(); 2010 2011 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2012 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2013 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2014 2015 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2016 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2017 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2018 2019 auto Hi = Signed ? 2020 B.buildFPTOSI(S32, FloorMul) : 2021 B.buildFPTOUI(S32, FloorMul); 2022 auto Lo = B.buildFPTOUI(S32, Fma); 2023 2024 B.buildMerge(Dst, { Lo, Hi }); 2025 MI.eraseFromParent(); 2026 2027 return true; 2028 } 2029 2030 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2031 MachineInstr &MI) const { 2032 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2033 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2034 2035 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2036 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2037 2038 // With ieee_mode disabled, the instructions have the correct behavior 2039 // already for G_FMINNUM/G_FMAXNUM 2040 if (!MFI->getMode().IEEE) 2041 return !IsIEEEOp; 2042 2043 if (IsIEEEOp) 2044 return true; 2045 2046 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2050 MachineInstr &MI, MachineRegisterInfo &MRI, 2051 MachineIRBuilder &B) const { 2052 // TODO: Should move some of this into LegalizerHelper. 2053 2054 // TODO: Promote dynamic indexing of s16 to s32 2055 2056 // FIXME: Artifact combiner probably should have replaced the truncated 2057 // constant before this, so we shouldn't need 2058 // getConstantVRegValWithLookThrough. 2059 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2060 MI.getOperand(2).getReg(), MRI); 2061 if (!IdxVal) // Dynamic case will be selected to register indexing. 2062 return true; 2063 2064 Register Dst = MI.getOperand(0).getReg(); 2065 Register Vec = MI.getOperand(1).getReg(); 2066 2067 LLT VecTy = MRI.getType(Vec); 2068 LLT EltTy = VecTy.getElementType(); 2069 assert(EltTy == MRI.getType(Dst)); 2070 2071 if (IdxVal->Value < VecTy.getNumElements()) 2072 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2073 else 2074 B.buildUndef(Dst); 2075 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2081 MachineInstr &MI, MachineRegisterInfo &MRI, 2082 MachineIRBuilder &B) const { 2083 // TODO: Should move some of this into LegalizerHelper. 2084 2085 // TODO: Promote dynamic indexing of s16 to s32 2086 2087 // FIXME: Artifact combiner probably should have replaced the truncated 2088 // constant before this, so we shouldn't need 2089 // getConstantVRegValWithLookThrough. 2090 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2091 MI.getOperand(3).getReg(), MRI); 2092 if (!IdxVal) // Dynamic case will be selected to register indexing. 2093 return true; 2094 2095 Register Dst = MI.getOperand(0).getReg(); 2096 Register Vec = MI.getOperand(1).getReg(); 2097 Register Ins = MI.getOperand(2).getReg(); 2098 2099 LLT VecTy = MRI.getType(Vec); 2100 LLT EltTy = VecTy.getElementType(); 2101 assert(EltTy == MRI.getType(Ins)); 2102 2103 if (IdxVal->Value < VecTy.getNumElements()) 2104 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2105 else 2106 B.buildUndef(Dst); 2107 2108 MI.eraseFromParent(); 2109 return true; 2110 } 2111 2112 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2113 MachineInstr &MI, MachineRegisterInfo &MRI, 2114 MachineIRBuilder &B) const { 2115 const LLT V2S16 = LLT::vector(2, 16); 2116 2117 Register Dst = MI.getOperand(0).getReg(); 2118 Register Src0 = MI.getOperand(1).getReg(); 2119 LLT DstTy = MRI.getType(Dst); 2120 LLT SrcTy = MRI.getType(Src0); 2121 2122 if (SrcTy == V2S16 && DstTy == V2S16 && 2123 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2124 return true; 2125 2126 MachineIRBuilder HelperBuilder(MI); 2127 GISelObserverWrapper DummyObserver; 2128 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2129 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2130 } 2131 2132 bool AMDGPULegalizerInfo::legalizeSinCos( 2133 MachineInstr &MI, MachineRegisterInfo &MRI, 2134 MachineIRBuilder &B) const { 2135 2136 Register DstReg = MI.getOperand(0).getReg(); 2137 Register SrcReg = MI.getOperand(1).getReg(); 2138 LLT Ty = MRI.getType(DstReg); 2139 unsigned Flags = MI.getFlags(); 2140 2141 Register TrigVal; 2142 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2143 if (ST.hasTrigReducedRange()) { 2144 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2145 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2146 .addUse(MulVal.getReg(0)) 2147 .setMIFlags(Flags).getReg(0); 2148 } else 2149 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2150 2151 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2152 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2153 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2154 .addUse(TrigVal) 2155 .setMIFlags(Flags); 2156 MI.eraseFromParent(); 2157 return true; 2158 } 2159 2160 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2161 MachineIRBuilder &B, 2162 const GlobalValue *GV, 2163 int64_t Offset, 2164 unsigned GAFlags) const { 2165 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2166 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2167 // to the following code sequence: 2168 // 2169 // For constant address space: 2170 // s_getpc_b64 s[0:1] 2171 // s_add_u32 s0, s0, $symbol 2172 // s_addc_u32 s1, s1, 0 2173 // 2174 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2175 // a fixup or relocation is emitted to replace $symbol with a literal 2176 // constant, which is a pc-relative offset from the encoding of the $symbol 2177 // operand to the global variable. 2178 // 2179 // For global address space: 2180 // s_getpc_b64 s[0:1] 2181 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2182 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2183 // 2184 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2185 // fixups or relocations are emitted to replace $symbol@*@lo and 2186 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2187 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2188 // operand to the global variable. 2189 // 2190 // What we want here is an offset from the value returned by s_getpc 2191 // (which is the address of the s_add_u32 instruction) to the global 2192 // variable, but since the encoding of $symbol starts 4 bytes after the start 2193 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2194 // small. This requires us to add 4 to the global variable offset in order to 2195 // compute the correct address. 2196 2197 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2198 2199 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2200 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2201 2202 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2203 .addDef(PCReg); 2204 2205 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2206 if (GAFlags == SIInstrInfo::MO_NONE) 2207 MIB.addImm(0); 2208 else 2209 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2210 2211 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2212 2213 if (PtrTy.getSizeInBits() == 32) 2214 B.buildExtract(DstReg, PCReg, 0); 2215 return true; 2216 } 2217 2218 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2219 MachineInstr &MI, MachineRegisterInfo &MRI, 2220 MachineIRBuilder &B) const { 2221 Register DstReg = MI.getOperand(0).getReg(); 2222 LLT Ty = MRI.getType(DstReg); 2223 unsigned AS = Ty.getAddressSpace(); 2224 2225 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2226 MachineFunction &MF = B.getMF(); 2227 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2228 2229 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2230 if (!MFI->isEntryFunction()) { 2231 const Function &Fn = MF.getFunction(); 2232 DiagnosticInfoUnsupported BadLDSDecl( 2233 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2234 DS_Warning); 2235 Fn.getContext().diagnose(BadLDSDecl); 2236 2237 // We currently don't have a way to correctly allocate LDS objects that 2238 // aren't directly associated with a kernel. We do force inlining of 2239 // functions that use local objects. However, if these dead functions are 2240 // not eliminated, we don't want a compile time error. Just emit a warning 2241 // and a trap, since there should be no callable path here. 2242 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2243 B.buildUndef(DstReg); 2244 MI.eraseFromParent(); 2245 return true; 2246 } 2247 2248 // TODO: We could emit code to handle the initialization somewhere. 2249 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2250 const SITargetLowering *TLI = ST.getTargetLowering(); 2251 if (!TLI->shouldUseLDSConstAddress(GV)) { 2252 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2253 return true; // Leave in place; 2254 } 2255 2256 B.buildConstant( 2257 DstReg, 2258 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2259 MI.eraseFromParent(); 2260 return true; 2261 } 2262 2263 const Function &Fn = MF.getFunction(); 2264 DiagnosticInfoUnsupported BadInit( 2265 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2266 Fn.getContext().diagnose(BadInit); 2267 return true; 2268 } 2269 2270 const SITargetLowering *TLI = ST.getTargetLowering(); 2271 2272 if (TLI->shouldEmitFixup(GV)) { 2273 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2274 MI.eraseFromParent(); 2275 return true; 2276 } 2277 2278 if (TLI->shouldEmitPCReloc(GV)) { 2279 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2280 MI.eraseFromParent(); 2281 return true; 2282 } 2283 2284 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2285 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2286 2287 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2288 MachinePointerInfo::getGOT(MF), 2289 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2290 MachineMemOperand::MOInvariant, 2291 8 /*Size*/, Align(8)); 2292 2293 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2294 2295 if (Ty.getSizeInBits() == 32) { 2296 // Truncate if this is a 32-bit constant adrdess. 2297 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2298 B.buildExtract(DstReg, Load, 0); 2299 } else 2300 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2301 2302 MI.eraseFromParent(); 2303 return true; 2304 } 2305 2306 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2307 MachineInstr &MI) const { 2308 MachineIRBuilder &B = Helper.MIRBuilder; 2309 MachineRegisterInfo &MRI = *B.getMRI(); 2310 GISelChangeObserver &Observer = Helper.Observer; 2311 2312 Register PtrReg = MI.getOperand(1).getReg(); 2313 LLT PtrTy = MRI.getType(PtrReg); 2314 unsigned AddrSpace = PtrTy.getAddressSpace(); 2315 2316 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2317 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2318 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2319 Observer.changingInstr(MI); 2320 MI.getOperand(1).setReg(Cast.getReg(0)); 2321 Observer.changedInstr(MI); 2322 return true; 2323 } 2324 2325 return false; 2326 } 2327 2328 bool AMDGPULegalizerInfo::legalizeFMad( 2329 MachineInstr &MI, MachineRegisterInfo &MRI, 2330 MachineIRBuilder &B) const { 2331 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2332 assert(Ty.isScalar()); 2333 2334 MachineFunction &MF = B.getMF(); 2335 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2336 2337 // TODO: Always legal with future ftz flag. 2338 // FIXME: Do we need just output? 2339 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2340 return true; 2341 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2342 return true; 2343 2344 MachineIRBuilder HelperBuilder(MI); 2345 GISelObserverWrapper DummyObserver; 2346 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2347 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2351 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2352 Register DstReg = MI.getOperand(0).getReg(); 2353 Register PtrReg = MI.getOperand(1).getReg(); 2354 Register CmpVal = MI.getOperand(2).getReg(); 2355 Register NewVal = MI.getOperand(3).getReg(); 2356 2357 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2358 "this should not have been custom lowered"); 2359 2360 LLT ValTy = MRI.getType(CmpVal); 2361 LLT VecTy = LLT::vector(2, ValTy); 2362 2363 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2364 2365 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2366 .addDef(DstReg) 2367 .addUse(PtrReg) 2368 .addUse(PackedVal) 2369 .setMemRefs(MI.memoperands()); 2370 2371 MI.eraseFromParent(); 2372 return true; 2373 } 2374 2375 bool AMDGPULegalizerInfo::legalizeFlog( 2376 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2377 Register Dst = MI.getOperand(0).getReg(); 2378 Register Src = MI.getOperand(1).getReg(); 2379 LLT Ty = B.getMRI()->getType(Dst); 2380 unsigned Flags = MI.getFlags(); 2381 2382 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2383 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2384 2385 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2386 MI.eraseFromParent(); 2387 return true; 2388 } 2389 2390 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2391 MachineIRBuilder &B) const { 2392 Register Dst = MI.getOperand(0).getReg(); 2393 Register Src = MI.getOperand(1).getReg(); 2394 unsigned Flags = MI.getFlags(); 2395 LLT Ty = B.getMRI()->getType(Dst); 2396 2397 auto K = B.buildFConstant(Ty, numbers::log2e); 2398 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2399 B.buildFExp2(Dst, Mul, Flags); 2400 MI.eraseFromParent(); 2401 return true; 2402 } 2403 2404 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2405 MachineIRBuilder &B) const { 2406 Register Dst = MI.getOperand(0).getReg(); 2407 Register Src0 = MI.getOperand(1).getReg(); 2408 Register Src1 = MI.getOperand(2).getReg(); 2409 unsigned Flags = MI.getFlags(); 2410 LLT Ty = B.getMRI()->getType(Dst); 2411 const LLT S16 = LLT::scalar(16); 2412 const LLT S32 = LLT::scalar(32); 2413 2414 if (Ty == S32) { 2415 auto Log = B.buildFLog2(S32, Src0, Flags); 2416 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2417 .addUse(Log.getReg(0)) 2418 .addUse(Src1) 2419 .setMIFlags(Flags); 2420 B.buildFExp2(Dst, Mul, Flags); 2421 } else if (Ty == S16) { 2422 // There's no f16 fmul_legacy, so we need to convert for it. 2423 auto Log = B.buildFLog2(S16, Src0, Flags); 2424 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2425 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2426 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2427 .addUse(Ext0.getReg(0)) 2428 .addUse(Ext1.getReg(0)) 2429 .setMIFlags(Flags); 2430 2431 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2432 } else 2433 return false; 2434 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 // Find a source register, ignoring any possible source modifiers. 2440 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2441 Register ModSrc = OrigSrc; 2442 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2443 ModSrc = SrcFNeg->getOperand(1).getReg(); 2444 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2445 ModSrc = SrcFAbs->getOperand(1).getReg(); 2446 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2447 ModSrc = SrcFAbs->getOperand(1).getReg(); 2448 return ModSrc; 2449 } 2450 2451 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2452 MachineRegisterInfo &MRI, 2453 MachineIRBuilder &B) const { 2454 2455 const LLT S1 = LLT::scalar(1); 2456 const LLT S64 = LLT::scalar(64); 2457 Register Dst = MI.getOperand(0).getReg(); 2458 Register OrigSrc = MI.getOperand(1).getReg(); 2459 unsigned Flags = MI.getFlags(); 2460 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2461 "this should not have been custom lowered"); 2462 2463 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2464 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2465 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2466 // V_FRACT bug is: 2467 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2468 // 2469 // Convert floor(x) to (x - fract(x)) 2470 2471 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2472 .addUse(OrigSrc) 2473 .setMIFlags(Flags); 2474 2475 // Give source modifier matching some assistance before obscuring a foldable 2476 // pattern. 2477 2478 // TODO: We can avoid the neg on the fract? The input sign to fract 2479 // shouldn't matter? 2480 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2481 2482 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2483 2484 Register Min = MRI.createGenericVirtualRegister(S64); 2485 2486 // We don't need to concern ourselves with the snan handling difference, so 2487 // use the one which will directly select. 2488 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2489 if (MFI->getMode().IEEE) 2490 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2491 else 2492 B.buildFMinNum(Min, Fract, Const, Flags); 2493 2494 Register CorrectedFract = Min; 2495 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2496 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2497 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2498 } 2499 2500 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2501 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2502 2503 MI.eraseFromParent(); 2504 return true; 2505 } 2506 2507 // Turn an illegal packed v2s16 build vector into bit operations. 2508 // TODO: This should probably be a bitcast action in LegalizerHelper. 2509 bool AMDGPULegalizerInfo::legalizeBuildVector( 2510 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2511 Register Dst = MI.getOperand(0).getReg(); 2512 const LLT S32 = LLT::scalar(32); 2513 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2514 2515 Register Src0 = MI.getOperand(1).getReg(); 2516 Register Src1 = MI.getOperand(2).getReg(); 2517 assert(MRI.getType(Src0) == LLT::scalar(16)); 2518 2519 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2520 B.buildBitcast(Dst, Merge); 2521 2522 MI.eraseFromParent(); 2523 return true; 2524 } 2525 2526 // Return the use branch instruction, otherwise null if the usage is invalid. 2527 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2528 MachineRegisterInfo &MRI, 2529 MachineInstr *&Br, 2530 MachineBasicBlock *&UncondBrTarget) { 2531 Register CondDef = MI.getOperand(0).getReg(); 2532 if (!MRI.hasOneNonDBGUse(CondDef)) 2533 return nullptr; 2534 2535 MachineBasicBlock *Parent = MI.getParent(); 2536 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2537 if (UseMI.getParent() != Parent || 2538 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2539 return nullptr; 2540 2541 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2542 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2543 if (Next == Parent->end()) { 2544 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2545 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2546 return nullptr; 2547 UncondBrTarget = &*NextMBB; 2548 } else { 2549 if (Next->getOpcode() != AMDGPU::G_BR) 2550 return nullptr; 2551 Br = &*Next; 2552 UncondBrTarget = Br->getOperand(0).getMBB(); 2553 } 2554 2555 return &UseMI; 2556 } 2557 2558 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2559 const ArgDescriptor *Arg, 2560 const TargetRegisterClass *ArgRC, 2561 LLT ArgTy) const { 2562 MCRegister SrcReg = Arg->getRegister(); 2563 assert(SrcReg.isPhysical() && "Physical register expected"); 2564 assert(DstReg.isVirtual() && "Virtual register expected"); 2565 2566 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2567 ArgTy); 2568 if (Arg->isMasked()) { 2569 // TODO: Should we try to emit this once in the entry block? 2570 const LLT S32 = LLT::scalar(32); 2571 const unsigned Mask = Arg->getMask(); 2572 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2573 2574 Register AndMaskSrc = LiveIn; 2575 2576 if (Shift != 0) { 2577 auto ShiftAmt = B.buildConstant(S32, Shift); 2578 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2579 } 2580 2581 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2582 } else { 2583 B.buildCopy(DstReg, LiveIn); 2584 } 2585 2586 return true; 2587 } 2588 2589 bool AMDGPULegalizerInfo::loadInputValue( 2590 Register DstReg, MachineIRBuilder &B, 2591 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2592 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2593 const ArgDescriptor *Arg; 2594 const TargetRegisterClass *ArgRC; 2595 LLT ArgTy; 2596 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2597 2598 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2599 return false; // TODO: Handle these 2600 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2601 } 2602 2603 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2604 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2605 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2606 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2607 return false; 2608 2609 MI.eraseFromParent(); 2610 return true; 2611 } 2612 2613 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2614 MachineRegisterInfo &MRI, 2615 MachineIRBuilder &B) const { 2616 Register Dst = MI.getOperand(0).getReg(); 2617 LLT DstTy = MRI.getType(Dst); 2618 LLT S16 = LLT::scalar(16); 2619 LLT S32 = LLT::scalar(32); 2620 LLT S64 = LLT::scalar(64); 2621 2622 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2623 return true; 2624 2625 if (DstTy == S16) 2626 return legalizeFDIV16(MI, MRI, B); 2627 if (DstTy == S32) 2628 return legalizeFDIV32(MI, MRI, B); 2629 if (DstTy == S64) 2630 return legalizeFDIV64(MI, MRI, B); 2631 2632 return false; 2633 } 2634 2635 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2636 Register DstReg, 2637 Register X, 2638 Register Y, 2639 bool IsDiv) const { 2640 const LLT S1 = LLT::scalar(1); 2641 const LLT S32 = LLT::scalar(32); 2642 2643 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2644 // algorithm used here. 2645 2646 // Initial estimate of inv(y). 2647 auto FloatY = B.buildUITOFP(S32, Y); 2648 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2649 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2650 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2651 auto Z = B.buildFPTOUI(S32, ScaledY); 2652 2653 // One round of UNR. 2654 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2655 auto NegYZ = B.buildMul(S32, NegY, Z); 2656 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2657 2658 // Quotient/remainder estimate. 2659 auto Q = B.buildUMulH(S32, X, Z); 2660 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2661 2662 // First quotient/remainder refinement. 2663 auto One = B.buildConstant(S32, 1); 2664 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2665 if (IsDiv) 2666 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2667 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2668 2669 // Second quotient/remainder refinement. 2670 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2671 if (IsDiv) 2672 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2673 else 2674 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2675 } 2676 2677 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2678 MachineRegisterInfo &MRI, 2679 MachineIRBuilder &B) const { 2680 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2681 Register DstReg = MI.getOperand(0).getReg(); 2682 Register Num = MI.getOperand(1).getReg(); 2683 Register Den = MI.getOperand(2).getReg(); 2684 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2685 MI.eraseFromParent(); 2686 return true; 2687 } 2688 2689 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2690 // 2691 // Return lo, hi of result 2692 // 2693 // %cvt.lo = G_UITOFP Val.lo 2694 // %cvt.hi = G_UITOFP Val.hi 2695 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2696 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2697 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2698 // %mul2 = G_FMUL %mul1, 2**(-32) 2699 // %trunc = G_INTRINSIC_TRUNC %mul2 2700 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2701 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2702 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2703 Register Val) { 2704 const LLT S32 = LLT::scalar(32); 2705 auto Unmerge = B.buildUnmerge(S32, Val); 2706 2707 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2708 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2709 2710 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2711 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2712 2713 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2714 auto Mul1 = 2715 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2716 2717 // 2**(-32) 2718 auto Mul2 = 2719 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2720 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2721 2722 // -(2**32) 2723 auto Mad2 = B.buildFMAD(S32, Trunc, 2724 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2725 2726 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2727 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2728 2729 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2730 } 2731 2732 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2733 Register DstReg, 2734 Register Numer, 2735 Register Denom, 2736 bool IsDiv) const { 2737 const LLT S32 = LLT::scalar(32); 2738 const LLT S64 = LLT::scalar(64); 2739 const LLT S1 = LLT::scalar(1); 2740 Register RcpLo, RcpHi; 2741 2742 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2743 2744 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2745 2746 auto Zero64 = B.buildConstant(S64, 0); 2747 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2748 2749 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2750 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2751 2752 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2753 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2754 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2755 2756 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2757 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2758 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2759 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2760 2761 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2762 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2763 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2764 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2765 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2766 2767 auto Zero32 = B.buildConstant(S32, 0); 2768 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2769 auto Add2_HiC = 2770 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2771 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2772 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2773 2774 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2775 Register NumerLo = UnmergeNumer.getReg(0); 2776 Register NumerHi = UnmergeNumer.getReg(1); 2777 2778 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2779 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2780 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2781 Register Mul3_Lo = UnmergeMul3.getReg(0); 2782 Register Mul3_Hi = UnmergeMul3.getReg(1); 2783 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2784 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2785 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2786 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2787 2788 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2789 Register DenomLo = UnmergeDenom.getReg(0); 2790 Register DenomHi = UnmergeDenom.getReg(1); 2791 2792 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2793 auto C1 = B.buildSExt(S32, CmpHi); 2794 2795 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2796 auto C2 = B.buildSExt(S32, CmpLo); 2797 2798 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2799 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2800 2801 // TODO: Here and below portions of the code can be enclosed into if/endif. 2802 // Currently control flow is unconditional and we have 4 selects after 2803 // potential endif to substitute PHIs. 2804 2805 // if C3 != 0 ... 2806 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2807 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2808 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2809 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2810 2811 auto One64 = B.buildConstant(S64, 1); 2812 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2813 2814 auto C4 = 2815 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2816 auto C5 = 2817 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2818 auto C6 = B.buildSelect( 2819 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2820 2821 // if (C6 != 0) 2822 auto Add4 = B.buildAdd(S64, Add3, One64); 2823 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2824 2825 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2826 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2827 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2828 2829 // endif C6 2830 // endif C3 2831 2832 if (IsDiv) { 2833 auto Sel1 = B.buildSelect( 2834 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2835 B.buildSelect(DstReg, 2836 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2837 } else { 2838 auto Sel2 = B.buildSelect( 2839 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2840 B.buildSelect(DstReg, 2841 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2842 } 2843 } 2844 2845 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2846 MachineRegisterInfo &MRI, 2847 MachineIRBuilder &B) const { 2848 const LLT S64 = LLT::scalar(64); 2849 const LLT S32 = LLT::scalar(32); 2850 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2851 Register DstReg = MI.getOperand(0).getReg(); 2852 Register Num = MI.getOperand(1).getReg(); 2853 Register Den = MI.getOperand(2).getReg(); 2854 LLT Ty = MRI.getType(DstReg); 2855 2856 if (Ty == S32) 2857 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2858 else if (Ty == S64) 2859 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2860 else 2861 return false; 2862 2863 MI.eraseFromParent(); 2864 return true; 2865 2866 } 2867 2868 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2869 MachineRegisterInfo &MRI, 2870 MachineIRBuilder &B) const { 2871 const LLT S64 = LLT::scalar(64); 2872 const LLT S32 = LLT::scalar(32); 2873 2874 Register DstReg = MI.getOperand(0).getReg(); 2875 const LLT Ty = MRI.getType(DstReg); 2876 if (Ty != S32 && Ty != S64) 2877 return false; 2878 2879 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2880 2881 Register LHS = MI.getOperand(1).getReg(); 2882 Register RHS = MI.getOperand(2).getReg(); 2883 2884 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2885 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2886 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2887 2888 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2889 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2890 2891 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2892 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2893 2894 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2895 if (Ty == S32) 2896 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2897 else 2898 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2899 2900 Register Sign; 2901 if (IsDiv) 2902 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2903 else 2904 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2905 2906 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2907 B.buildSub(DstReg, UDivRem, Sign); 2908 2909 MI.eraseFromParent(); 2910 return true; 2911 } 2912 2913 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2914 MachineRegisterInfo &MRI, 2915 MachineIRBuilder &B) const { 2916 Register Res = MI.getOperand(0).getReg(); 2917 Register LHS = MI.getOperand(1).getReg(); 2918 Register RHS = MI.getOperand(2).getReg(); 2919 2920 uint16_t Flags = MI.getFlags(); 2921 2922 LLT ResTy = MRI.getType(Res); 2923 LLT S32 = LLT::scalar(32); 2924 LLT S64 = LLT::scalar(64); 2925 2926 const MachineFunction &MF = B.getMF(); 2927 bool Unsafe = 2928 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2929 2930 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2931 return false; 2932 2933 if (!Unsafe && ResTy == S32 && 2934 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2935 return false; 2936 2937 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2938 // 1 / x -> RCP(x) 2939 if (CLHS->isExactlyValue(1.0)) { 2940 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2941 .addUse(RHS) 2942 .setMIFlags(Flags); 2943 2944 MI.eraseFromParent(); 2945 return true; 2946 } 2947 2948 // -1 / x -> RCP( FNEG(x) ) 2949 if (CLHS->isExactlyValue(-1.0)) { 2950 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2951 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2952 .addUse(FNeg.getReg(0)) 2953 .setMIFlags(Flags); 2954 2955 MI.eraseFromParent(); 2956 return true; 2957 } 2958 } 2959 2960 // x / y -> x * (1.0 / y) 2961 if (Unsafe) { 2962 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2963 .addUse(RHS) 2964 .setMIFlags(Flags); 2965 B.buildFMul(Res, LHS, RCP, Flags); 2966 2967 MI.eraseFromParent(); 2968 return true; 2969 } 2970 2971 return false; 2972 } 2973 2974 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2975 MachineRegisterInfo &MRI, 2976 MachineIRBuilder &B) const { 2977 Register Res = MI.getOperand(0).getReg(); 2978 Register LHS = MI.getOperand(1).getReg(); 2979 Register RHS = MI.getOperand(2).getReg(); 2980 2981 uint16_t Flags = MI.getFlags(); 2982 2983 LLT S16 = LLT::scalar(16); 2984 LLT S32 = LLT::scalar(32); 2985 2986 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2987 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2988 2989 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2990 .addUse(RHSExt.getReg(0)) 2991 .setMIFlags(Flags); 2992 2993 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2994 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2995 2996 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2997 .addUse(RDst.getReg(0)) 2998 .addUse(RHS) 2999 .addUse(LHS) 3000 .setMIFlags(Flags); 3001 3002 MI.eraseFromParent(); 3003 return true; 3004 } 3005 3006 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3007 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3008 static void toggleSPDenormMode(bool Enable, 3009 MachineIRBuilder &B, 3010 const GCNSubtarget &ST, 3011 AMDGPU::SIModeRegisterDefaults Mode) { 3012 // Set SP denorm mode to this value. 3013 unsigned SPDenormMode = 3014 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3015 3016 if (ST.hasDenormModeInst()) { 3017 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3018 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3019 3020 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3021 B.buildInstr(AMDGPU::S_DENORM_MODE) 3022 .addImm(NewDenormModeValue); 3023 3024 } else { 3025 // Select FP32 bit field in mode register. 3026 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3027 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3028 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3029 3030 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3031 .addImm(SPDenormMode) 3032 .addImm(SPDenormModeBitField); 3033 } 3034 } 3035 3036 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3037 MachineRegisterInfo &MRI, 3038 MachineIRBuilder &B) const { 3039 Register Res = MI.getOperand(0).getReg(); 3040 Register LHS = MI.getOperand(1).getReg(); 3041 Register RHS = MI.getOperand(2).getReg(); 3042 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3043 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3044 3045 uint16_t Flags = MI.getFlags(); 3046 3047 LLT S32 = LLT::scalar(32); 3048 LLT S1 = LLT::scalar(1); 3049 3050 auto One = B.buildFConstant(S32, 1.0f); 3051 3052 auto DenominatorScaled = 3053 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3054 .addUse(LHS) 3055 .addUse(RHS) 3056 .addImm(0) 3057 .setMIFlags(Flags); 3058 auto NumeratorScaled = 3059 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3060 .addUse(LHS) 3061 .addUse(RHS) 3062 .addImm(1) 3063 .setMIFlags(Flags); 3064 3065 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3066 .addUse(DenominatorScaled.getReg(0)) 3067 .setMIFlags(Flags); 3068 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3069 3070 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3071 // aren't modeled as reading it. 3072 if (!Mode.allFP32Denormals()) 3073 toggleSPDenormMode(true, B, ST, Mode); 3074 3075 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3076 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3077 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3078 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3079 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3080 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3081 3082 if (!Mode.allFP32Denormals()) 3083 toggleSPDenormMode(false, B, ST, Mode); 3084 3085 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3086 .addUse(Fma4.getReg(0)) 3087 .addUse(Fma1.getReg(0)) 3088 .addUse(Fma3.getReg(0)) 3089 .addUse(NumeratorScaled.getReg(1)) 3090 .setMIFlags(Flags); 3091 3092 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3093 .addUse(Fmas.getReg(0)) 3094 .addUse(RHS) 3095 .addUse(LHS) 3096 .setMIFlags(Flags); 3097 3098 MI.eraseFromParent(); 3099 return true; 3100 } 3101 3102 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3103 MachineRegisterInfo &MRI, 3104 MachineIRBuilder &B) const { 3105 Register Res = MI.getOperand(0).getReg(); 3106 Register LHS = MI.getOperand(1).getReg(); 3107 Register RHS = MI.getOperand(2).getReg(); 3108 3109 uint16_t Flags = MI.getFlags(); 3110 3111 LLT S64 = LLT::scalar(64); 3112 LLT S1 = LLT::scalar(1); 3113 3114 auto One = B.buildFConstant(S64, 1.0); 3115 3116 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3117 .addUse(LHS) 3118 .addUse(RHS) 3119 .addImm(0) 3120 .setMIFlags(Flags); 3121 3122 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3123 3124 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3125 .addUse(DivScale0.getReg(0)) 3126 .setMIFlags(Flags); 3127 3128 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3129 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3130 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3131 3132 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3133 .addUse(LHS) 3134 .addUse(RHS) 3135 .addImm(1) 3136 .setMIFlags(Flags); 3137 3138 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3139 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3140 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3141 3142 Register Scale; 3143 if (!ST.hasUsableDivScaleConditionOutput()) { 3144 // Workaround a hardware bug on SI where the condition output from div_scale 3145 // is not usable. 3146 3147 LLT S32 = LLT::scalar(32); 3148 3149 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3150 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3151 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3152 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3153 3154 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3155 Scale1Unmerge.getReg(1)); 3156 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3157 Scale0Unmerge.getReg(1)); 3158 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3159 } else { 3160 Scale = DivScale1.getReg(1); 3161 } 3162 3163 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3164 .addUse(Fma4.getReg(0)) 3165 .addUse(Fma3.getReg(0)) 3166 .addUse(Mul.getReg(0)) 3167 .addUse(Scale) 3168 .setMIFlags(Flags); 3169 3170 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3171 .addUse(Fmas.getReg(0)) 3172 .addUse(RHS) 3173 .addUse(LHS) 3174 .setMIFlags(Flags); 3175 3176 MI.eraseFromParent(); 3177 return true; 3178 } 3179 3180 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3181 MachineRegisterInfo &MRI, 3182 MachineIRBuilder &B) const { 3183 Register Res = MI.getOperand(0).getReg(); 3184 Register LHS = MI.getOperand(2).getReg(); 3185 Register RHS = MI.getOperand(3).getReg(); 3186 uint16_t Flags = MI.getFlags(); 3187 3188 LLT S32 = LLT::scalar(32); 3189 LLT S1 = LLT::scalar(1); 3190 3191 auto Abs = B.buildFAbs(S32, RHS, Flags); 3192 const APFloat C0Val(1.0f); 3193 3194 auto C0 = B.buildConstant(S32, 0x6f800000); 3195 auto C1 = B.buildConstant(S32, 0x2f800000); 3196 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3197 3198 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3199 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3200 3201 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3202 3203 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3204 .addUse(Mul0.getReg(0)) 3205 .setMIFlags(Flags); 3206 3207 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3208 3209 B.buildFMul(Res, Sel, Mul1, Flags); 3210 3211 MI.eraseFromParent(); 3212 return true; 3213 } 3214 3215 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3216 // FIXME: Why do we handle this one but not other removed instructions? 3217 // 3218 // Reciprocal square root. The clamp prevents infinite results, clamping 3219 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3220 // +-max_float. 3221 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3222 MachineRegisterInfo &MRI, 3223 MachineIRBuilder &B) const { 3224 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3225 return true; 3226 3227 Register Dst = MI.getOperand(0).getReg(); 3228 Register Src = MI.getOperand(2).getReg(); 3229 auto Flags = MI.getFlags(); 3230 3231 LLT Ty = MRI.getType(Dst); 3232 3233 const fltSemantics *FltSemantics; 3234 if (Ty == LLT::scalar(32)) 3235 FltSemantics = &APFloat::IEEEsingle(); 3236 else if (Ty == LLT::scalar(64)) 3237 FltSemantics = &APFloat::IEEEdouble(); 3238 else 3239 return false; 3240 3241 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3242 .addUse(Src) 3243 .setMIFlags(Flags); 3244 3245 // We don't need to concern ourselves with the snan handling difference, since 3246 // the rsq quieted (or not) so use the one which will directly select. 3247 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3248 const bool UseIEEE = MFI->getMode().IEEE; 3249 3250 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3251 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3252 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3253 3254 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3255 3256 if (UseIEEE) 3257 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3258 else 3259 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3260 MI.eraseFromParent(); 3261 return true; 3262 } 3263 3264 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3265 switch (IID) { 3266 case Intrinsic::amdgcn_ds_fadd: 3267 return AMDGPU::G_ATOMICRMW_FADD; 3268 case Intrinsic::amdgcn_ds_fmin: 3269 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3270 case Intrinsic::amdgcn_ds_fmax: 3271 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3272 default: 3273 llvm_unreachable("not a DS FP intrinsic"); 3274 } 3275 } 3276 3277 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3278 MachineInstr &MI, 3279 Intrinsic::ID IID) const { 3280 GISelChangeObserver &Observer = Helper.Observer; 3281 Observer.changingInstr(MI); 3282 3283 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3284 3285 // The remaining operands were used to set fields in the MemOperand on 3286 // construction. 3287 for (int I = 6; I > 3; --I) 3288 MI.RemoveOperand(I); 3289 3290 MI.RemoveOperand(1); // Remove the intrinsic ID. 3291 Observer.changedInstr(MI); 3292 return true; 3293 } 3294 3295 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3296 MachineRegisterInfo &MRI, 3297 MachineIRBuilder &B) const { 3298 uint64_t Offset = 3299 ST.getTargetLowering()->getImplicitParameterOffset( 3300 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3301 LLT DstTy = MRI.getType(DstReg); 3302 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3303 3304 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3305 if (!loadInputValue(KernargPtrReg, B, 3306 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3307 return false; 3308 3309 // FIXME: This should be nuw 3310 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3311 return true; 3312 } 3313 3314 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3315 MachineRegisterInfo &MRI, 3316 MachineIRBuilder &B) const { 3317 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3318 if (!MFI->isEntryFunction()) { 3319 return legalizePreloadedArgIntrin(MI, MRI, B, 3320 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3321 } 3322 3323 Register DstReg = MI.getOperand(0).getReg(); 3324 if (!getImplicitArgPtr(DstReg, MRI, B)) 3325 return false; 3326 3327 MI.eraseFromParent(); 3328 return true; 3329 } 3330 3331 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3332 MachineRegisterInfo &MRI, 3333 MachineIRBuilder &B, 3334 unsigned AddrSpace) const { 3335 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3336 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3337 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3338 MI.eraseFromParent(); 3339 return true; 3340 } 3341 3342 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3343 // offset (the offset that is included in bounds checking and swizzling, to be 3344 // split between the instruction's voffset and immoffset fields) and soffset 3345 // (the offset that is excluded from bounds checking and swizzling, to go in 3346 // the instruction's soffset field). This function takes the first kind of 3347 // offset and figures out how to split it between voffset and immoffset. 3348 std::tuple<Register, unsigned, unsigned> 3349 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3350 Register OrigOffset) const { 3351 const unsigned MaxImm = 4095; 3352 Register BaseReg; 3353 unsigned TotalConstOffset; 3354 MachineInstr *OffsetDef; 3355 const LLT S32 = LLT::scalar(32); 3356 3357 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3358 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3359 3360 unsigned ImmOffset = TotalConstOffset; 3361 3362 // If the immediate value is too big for the immoffset field, put the value 3363 // and -4096 into the immoffset field so that the value that is copied/added 3364 // for the voffset field is a multiple of 4096, and it stands more chance 3365 // of being CSEd with the copy/add for another similar load/store. 3366 // However, do not do that rounding down to a multiple of 4096 if that is a 3367 // negative number, as it appears to be illegal to have a negative offset 3368 // in the vgpr, even if adding the immediate offset makes it positive. 3369 unsigned Overflow = ImmOffset & ~MaxImm; 3370 ImmOffset -= Overflow; 3371 if ((int32_t)Overflow < 0) { 3372 Overflow += ImmOffset; 3373 ImmOffset = 0; 3374 } 3375 3376 if (Overflow != 0) { 3377 if (!BaseReg) { 3378 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3379 } else { 3380 auto OverflowVal = B.buildConstant(S32, Overflow); 3381 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3382 } 3383 } 3384 3385 if (!BaseReg) 3386 BaseReg = B.buildConstant(S32, 0).getReg(0); 3387 3388 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3389 } 3390 3391 /// Handle register layout difference for f16 images for some subtargets. 3392 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3393 MachineRegisterInfo &MRI, 3394 Register Reg) const { 3395 if (!ST.hasUnpackedD16VMem()) 3396 return Reg; 3397 3398 const LLT S16 = LLT::scalar(16); 3399 const LLT S32 = LLT::scalar(32); 3400 LLT StoreVT = MRI.getType(Reg); 3401 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3402 3403 auto Unmerge = B.buildUnmerge(S16, Reg); 3404 3405 SmallVector<Register, 4> WideRegs; 3406 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3407 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3408 3409 int NumElts = StoreVT.getNumElements(); 3410 3411 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3412 } 3413 3414 Register AMDGPULegalizerInfo::fixStoreSourceType( 3415 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3416 MachineRegisterInfo *MRI = B.getMRI(); 3417 LLT Ty = MRI->getType(VData); 3418 3419 const LLT S16 = LLT::scalar(16); 3420 3421 // Fixup illegal register types for i8 stores. 3422 if (Ty == LLT::scalar(8) || Ty == S16) { 3423 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3424 return AnyExt; 3425 } 3426 3427 if (Ty.isVector()) { 3428 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3429 if (IsFormat) 3430 return handleD16VData(B, *MRI, VData); 3431 } 3432 } 3433 3434 return VData; 3435 } 3436 3437 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3438 MachineRegisterInfo &MRI, 3439 MachineIRBuilder &B, 3440 bool IsTyped, 3441 bool IsFormat) const { 3442 Register VData = MI.getOperand(1).getReg(); 3443 LLT Ty = MRI.getType(VData); 3444 LLT EltTy = Ty.getScalarType(); 3445 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3446 const LLT S32 = LLT::scalar(32); 3447 3448 VData = fixStoreSourceType(B, VData, IsFormat); 3449 Register RSrc = MI.getOperand(2).getReg(); 3450 3451 MachineMemOperand *MMO = *MI.memoperands_begin(); 3452 const int MemSize = MMO->getSize(); 3453 3454 unsigned ImmOffset; 3455 unsigned TotalOffset; 3456 3457 // The typed intrinsics add an immediate after the registers. 3458 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3459 3460 // The struct intrinsic variants add one additional operand over raw. 3461 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3462 Register VIndex; 3463 int OpOffset = 0; 3464 if (HasVIndex) { 3465 VIndex = MI.getOperand(3).getReg(); 3466 OpOffset = 1; 3467 } 3468 3469 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3470 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3471 3472 unsigned Format = 0; 3473 if (IsTyped) { 3474 Format = MI.getOperand(5 + OpOffset).getImm(); 3475 ++OpOffset; 3476 } 3477 3478 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3479 3480 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3481 if (TotalOffset != 0) 3482 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3483 3484 unsigned Opc; 3485 if (IsTyped) { 3486 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3487 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3488 } else if (IsFormat) { 3489 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3490 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3491 } else { 3492 switch (MemSize) { 3493 case 1: 3494 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3495 break; 3496 case 2: 3497 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3498 break; 3499 default: 3500 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3501 break; 3502 } 3503 } 3504 3505 if (!VIndex) 3506 VIndex = B.buildConstant(S32, 0).getReg(0); 3507 3508 auto MIB = B.buildInstr(Opc) 3509 .addUse(VData) // vdata 3510 .addUse(RSrc) // rsrc 3511 .addUse(VIndex) // vindex 3512 .addUse(VOffset) // voffset 3513 .addUse(SOffset) // soffset 3514 .addImm(ImmOffset); // offset(imm) 3515 3516 if (IsTyped) 3517 MIB.addImm(Format); 3518 3519 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3520 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3521 .addMemOperand(MMO); 3522 3523 MI.eraseFromParent(); 3524 return true; 3525 } 3526 3527 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3528 MachineRegisterInfo &MRI, 3529 MachineIRBuilder &B, 3530 bool IsFormat, 3531 bool IsTyped) const { 3532 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3533 MachineMemOperand *MMO = *MI.memoperands_begin(); 3534 const int MemSize = MMO->getSize(); 3535 const LLT S32 = LLT::scalar(32); 3536 3537 Register Dst = MI.getOperand(0).getReg(); 3538 Register RSrc = MI.getOperand(2).getReg(); 3539 3540 // The typed intrinsics add an immediate after the registers. 3541 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3542 3543 // The struct intrinsic variants add one additional operand over raw. 3544 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3545 Register VIndex; 3546 int OpOffset = 0; 3547 if (HasVIndex) { 3548 VIndex = MI.getOperand(3).getReg(); 3549 OpOffset = 1; 3550 } 3551 3552 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3553 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3554 3555 unsigned Format = 0; 3556 if (IsTyped) { 3557 Format = MI.getOperand(5 + OpOffset).getImm(); 3558 ++OpOffset; 3559 } 3560 3561 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3562 unsigned ImmOffset; 3563 unsigned TotalOffset; 3564 3565 LLT Ty = MRI.getType(Dst); 3566 LLT EltTy = Ty.getScalarType(); 3567 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3568 const bool Unpacked = ST.hasUnpackedD16VMem(); 3569 3570 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3571 if (TotalOffset != 0) 3572 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3573 3574 unsigned Opc; 3575 3576 if (IsTyped) { 3577 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3578 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3579 } else if (IsFormat) { 3580 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3581 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3582 } else { 3583 switch (MemSize) { 3584 case 1: 3585 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3586 break; 3587 case 2: 3588 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3589 break; 3590 default: 3591 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3592 break; 3593 } 3594 } 3595 3596 Register LoadDstReg; 3597 3598 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3599 LLT UnpackedTy = Ty.changeElementSize(32); 3600 3601 if (IsExtLoad) 3602 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3603 else if (Unpacked && IsD16 && Ty.isVector()) 3604 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3605 else 3606 LoadDstReg = Dst; 3607 3608 if (!VIndex) 3609 VIndex = B.buildConstant(S32, 0).getReg(0); 3610 3611 auto MIB = B.buildInstr(Opc) 3612 .addDef(LoadDstReg) // vdata 3613 .addUse(RSrc) // rsrc 3614 .addUse(VIndex) // vindex 3615 .addUse(VOffset) // voffset 3616 .addUse(SOffset) // soffset 3617 .addImm(ImmOffset); // offset(imm) 3618 3619 if (IsTyped) 3620 MIB.addImm(Format); 3621 3622 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3623 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3624 .addMemOperand(MMO); 3625 3626 if (LoadDstReg != Dst) { 3627 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3628 3629 // Widen result for extending loads was widened. 3630 if (IsExtLoad) 3631 B.buildTrunc(Dst, LoadDstReg); 3632 else { 3633 // Repack to original 16-bit vector result 3634 // FIXME: G_TRUNC should work, but legalization currently fails 3635 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3636 SmallVector<Register, 4> Repack; 3637 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3638 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3639 B.buildMerge(Dst, Repack); 3640 } 3641 } 3642 3643 MI.eraseFromParent(); 3644 return true; 3645 } 3646 3647 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3648 MachineIRBuilder &B, 3649 bool IsInc) const { 3650 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3651 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3652 B.buildInstr(Opc) 3653 .addDef(MI.getOperand(0).getReg()) 3654 .addUse(MI.getOperand(2).getReg()) 3655 .addUse(MI.getOperand(3).getReg()) 3656 .cloneMemRefs(MI); 3657 MI.eraseFromParent(); 3658 return true; 3659 } 3660 3661 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3662 switch (IntrID) { 3663 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3664 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3665 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3666 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3667 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3668 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3669 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3670 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3671 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3672 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3673 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3674 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3675 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3676 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3677 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3678 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3679 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3680 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3681 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3682 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3683 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3684 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3685 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3686 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3687 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3688 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3689 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3690 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3691 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3692 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3693 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3694 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3695 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3696 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3697 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3698 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3699 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3700 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3701 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3702 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3703 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3704 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3705 default: 3706 llvm_unreachable("unhandled atomic opcode"); 3707 } 3708 } 3709 3710 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3711 MachineIRBuilder &B, 3712 Intrinsic::ID IID) const { 3713 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3714 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3715 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3716 3717 Register Dst; 3718 3719 int OpOffset = 0; 3720 if (HasReturn) { 3721 // A few FP atomics do not support return values. 3722 Dst = MI.getOperand(0).getReg(); 3723 } else { 3724 OpOffset = -1; 3725 } 3726 3727 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3728 Register CmpVal; 3729 3730 if (IsCmpSwap) { 3731 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3732 ++OpOffset; 3733 } 3734 3735 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3736 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3737 3738 // The struct intrinsic variants add one additional operand over raw. 3739 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3740 Register VIndex; 3741 if (HasVIndex) { 3742 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3743 ++OpOffset; 3744 } 3745 3746 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3747 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3748 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3749 3750 MachineMemOperand *MMO = *MI.memoperands_begin(); 3751 3752 unsigned ImmOffset; 3753 unsigned TotalOffset; 3754 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3755 if (TotalOffset != 0) 3756 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3757 3758 if (!VIndex) 3759 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3760 3761 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3762 3763 if (HasReturn) 3764 MIB.addDef(Dst); 3765 3766 MIB.addUse(VData); // vdata 3767 3768 if (IsCmpSwap) 3769 MIB.addReg(CmpVal); 3770 3771 MIB.addUse(RSrc) // rsrc 3772 .addUse(VIndex) // vindex 3773 .addUse(VOffset) // voffset 3774 .addUse(SOffset) // soffset 3775 .addImm(ImmOffset) // offset(imm) 3776 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3777 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3778 .addMemOperand(MMO); 3779 3780 MI.eraseFromParent(); 3781 return true; 3782 } 3783 3784 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3785 /// vector with s16 typed elements. 3786 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3787 SmallVectorImpl<Register> &PackedAddrs, 3788 int AddrIdx, int DimIdx, int EndIdx, 3789 int NumGradients) { 3790 const LLT S16 = LLT::scalar(16); 3791 const LLT V2S16 = LLT::vector(2, 16); 3792 3793 for (int I = AddrIdx; I < EndIdx; ++I) { 3794 MachineOperand &SrcOp = MI.getOperand(I); 3795 if (!SrcOp.isReg()) 3796 continue; // _L to _LZ may have eliminated this. 3797 3798 Register AddrReg = SrcOp.getReg(); 3799 3800 if (I < DimIdx) { 3801 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3802 PackedAddrs.push_back(AddrReg); 3803 } else { 3804 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3805 // derivatives dx/dh and dx/dv are packed with undef. 3806 if (((I + 1) >= EndIdx) || 3807 ((NumGradients / 2) % 2 == 1 && 3808 (I == DimIdx + (NumGradients / 2) - 1 || 3809 I == DimIdx + NumGradients - 1)) || 3810 // Check for _L to _LZ optimization 3811 !MI.getOperand(I + 1).isReg()) { 3812 PackedAddrs.push_back( 3813 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3814 .getReg(0)); 3815 } else { 3816 PackedAddrs.push_back( 3817 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3818 .getReg(0)); 3819 ++I; 3820 } 3821 } 3822 } 3823 } 3824 3825 /// Convert from separate vaddr components to a single vector address register, 3826 /// and replace the remaining operands with $noreg. 3827 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3828 int DimIdx, int NumVAddrs) { 3829 const LLT S32 = LLT::scalar(32); 3830 3831 SmallVector<Register, 8> AddrRegs; 3832 for (int I = 0; I != NumVAddrs; ++I) { 3833 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3834 if (SrcOp.isReg()) { 3835 AddrRegs.push_back(SrcOp.getReg()); 3836 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3837 } 3838 } 3839 3840 int NumAddrRegs = AddrRegs.size(); 3841 if (NumAddrRegs != 1) { 3842 // Round up to 8 elements for v5-v7 3843 // FIXME: Missing intermediate sized register classes and instructions. 3844 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3845 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3846 auto Undef = B.buildUndef(S32); 3847 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3848 NumAddrRegs = RoundedNumRegs; 3849 } 3850 3851 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3852 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3853 } 3854 3855 for (int I = 1; I != NumVAddrs; ++I) { 3856 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3857 if (SrcOp.isReg()) 3858 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3859 } 3860 } 3861 3862 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3863 /// 3864 /// Depending on the subtarget, load/store with 16-bit element data need to be 3865 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3866 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3867 /// registers. 3868 /// 3869 /// We don't want to directly select image instructions just yet, but also want 3870 /// to exposes all register repacking to the legalizer/combiners. We also don't 3871 /// want a selected instrution entering RegBankSelect. In order to avoid 3872 /// defining a multitude of intermediate image instructions, directly hack on 3873 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3874 /// now unnecessary arguments with $noreg. 3875 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3876 MachineInstr &MI, MachineIRBuilder &B, 3877 GISelChangeObserver &Observer, 3878 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3879 3880 const int NumDefs = MI.getNumExplicitDefs(); 3881 bool IsTFE = NumDefs == 2; 3882 // We are only processing the operands of d16 image operations on subtargets 3883 // that use the unpacked register layout, or need to repack the TFE result. 3884 3885 // TODO: Do we need to guard against already legalized intrinsics? 3886 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3887 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3888 3889 MachineRegisterInfo *MRI = B.getMRI(); 3890 const LLT S32 = LLT::scalar(32); 3891 const LLT S16 = LLT::scalar(16); 3892 const LLT V2S16 = LLT::vector(2, 16); 3893 3894 // Index of first address argument 3895 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3896 3897 int NumVAddrs, NumGradients; 3898 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3899 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3900 getDMaskIdx(BaseOpcode, NumDefs); 3901 unsigned DMask = 0; 3902 3903 // Check for 16 bit addresses and pack if true. 3904 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3905 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3906 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3907 const bool IsG16 = GradTy == S16; 3908 const bool IsA16 = AddrTy == S16; 3909 3910 int DMaskLanes = 0; 3911 if (!BaseOpcode->Atomic) { 3912 DMask = MI.getOperand(DMaskIdx).getImm(); 3913 if (BaseOpcode->Gather4) { 3914 DMaskLanes = 4; 3915 } else if (DMask != 0) { 3916 DMaskLanes = countPopulation(DMask); 3917 } else if (!IsTFE && !BaseOpcode->Store) { 3918 // If dmask is 0, this is a no-op load. This can be eliminated. 3919 B.buildUndef(MI.getOperand(0)); 3920 MI.eraseFromParent(); 3921 return true; 3922 } 3923 } 3924 3925 Observer.changingInstr(MI); 3926 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3927 3928 unsigned NewOpcode = NumDefs == 0 ? 3929 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3930 3931 // Track that we legalized this 3932 MI.setDesc(B.getTII().get(NewOpcode)); 3933 3934 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3935 // dmask to be at least 1 otherwise the instruction will fail 3936 if (IsTFE && DMask == 0) { 3937 DMask = 0x1; 3938 DMaskLanes = 1; 3939 MI.getOperand(DMaskIdx).setImm(DMask); 3940 } 3941 3942 if (BaseOpcode->Atomic) { 3943 Register VData0 = MI.getOperand(2).getReg(); 3944 LLT Ty = MRI->getType(VData0); 3945 3946 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3947 if (Ty.isVector()) 3948 return false; 3949 3950 if (BaseOpcode->AtomicX2) { 3951 Register VData1 = MI.getOperand(3).getReg(); 3952 // The two values are packed in one register. 3953 LLT PackedTy = LLT::vector(2, Ty); 3954 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3955 MI.getOperand(2).setReg(Concat.getReg(0)); 3956 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3957 } 3958 } 3959 3960 int CorrectedNumVAddrs = NumVAddrs; 3961 3962 // Optimize _L to _LZ when _L is zero 3963 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3964 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3965 const ConstantFP *ConstantLod; 3966 const int LodIdx = AddrIdx + NumVAddrs - 1; 3967 3968 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3969 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3970 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3971 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3972 LZMappingInfo->LZ, ImageDimIntr->Dim); 3973 3974 // The starting indexes should remain in the same place. 3975 --NumVAddrs; 3976 --CorrectedNumVAddrs; 3977 3978 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3979 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3980 MI.RemoveOperand(LodIdx); 3981 } 3982 } 3983 } 3984 3985 // Optimize _mip away, when 'lod' is zero 3986 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3987 int64_t ConstantLod; 3988 const int LodIdx = AddrIdx + NumVAddrs - 1; 3989 3990 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3991 if (ConstantLod == 0) { 3992 // TODO: Change intrinsic opcode and remove operand instead or replacing 3993 // it with 0, as the _L to _LZ handling is done above. 3994 MI.getOperand(LodIdx).ChangeToImmediate(0); 3995 --CorrectedNumVAddrs; 3996 } 3997 } 3998 } 3999 4000 // Rewrite the addressing register layout before doing anything else. 4001 if (IsA16 || IsG16) { 4002 if (IsA16) { 4003 // Target must support the feature and gradients need to be 16 bit too 4004 if (!ST.hasA16() || !IsG16) 4005 return false; 4006 } else if (!ST.hasG16()) 4007 return false; 4008 4009 if (NumVAddrs > 1) { 4010 SmallVector<Register, 4> PackedRegs; 4011 // Don't compress addresses for G16 4012 const int PackEndIdx = 4013 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4014 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4015 PackEndIdx, NumGradients); 4016 4017 if (!IsA16) { 4018 // Add uncompressed address 4019 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4020 int AddrReg = MI.getOperand(I).getReg(); 4021 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4022 PackedRegs.push_back(AddrReg); 4023 } 4024 } 4025 4026 // See also below in the non-a16 branch 4027 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4028 4029 if (!UseNSA && PackedRegs.size() > 1) { 4030 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4031 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4032 PackedRegs[0] = Concat.getReg(0); 4033 PackedRegs.resize(1); 4034 } 4035 4036 const int NumPacked = PackedRegs.size(); 4037 for (int I = 0; I != NumVAddrs; ++I) { 4038 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4039 if (!SrcOp.isReg()) { 4040 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4041 continue; 4042 } 4043 4044 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4045 4046 if (I < NumPacked) 4047 SrcOp.setReg(PackedRegs[I]); 4048 else 4049 SrcOp.setReg(AMDGPU::NoRegister); 4050 } 4051 } 4052 } else { 4053 // If the register allocator cannot place the address registers contiguously 4054 // without introducing moves, then using the non-sequential address encoding 4055 // is always preferable, since it saves VALU instructions and is usually a 4056 // wash in terms of code size or even better. 4057 // 4058 // However, we currently have no way of hinting to the register allocator 4059 // that MIMG addresses should be placed contiguously when it is possible to 4060 // do so, so force non-NSA for the common 2-address case as a heuristic. 4061 // 4062 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4063 // allocation when possible. 4064 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4065 4066 if (!UseNSA && NumVAddrs > 1) 4067 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4068 } 4069 4070 int Flags = 0; 4071 if (IsA16) 4072 Flags |= 1; 4073 if (IsG16) 4074 Flags |= 2; 4075 MI.addOperand(MachineOperand::CreateImm(Flags)); 4076 4077 if (BaseOpcode->Store) { // No TFE for stores? 4078 // TODO: Handle dmask trim 4079 Register VData = MI.getOperand(1).getReg(); 4080 LLT Ty = MRI->getType(VData); 4081 if (!Ty.isVector() || Ty.getElementType() != S16) 4082 return true; 4083 4084 Register RepackedReg = handleD16VData(B, *MRI, VData); 4085 if (RepackedReg != VData) { 4086 MI.getOperand(1).setReg(RepackedReg); 4087 } 4088 4089 return true; 4090 } 4091 4092 Register DstReg = MI.getOperand(0).getReg(); 4093 LLT Ty = MRI->getType(DstReg); 4094 const LLT EltTy = Ty.getScalarType(); 4095 const bool IsD16 = Ty.getScalarType() == S16; 4096 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4097 4098 // Confirm that the return type is large enough for the dmask specified 4099 if (NumElts < DMaskLanes) 4100 return false; 4101 4102 if (NumElts > 4 || DMaskLanes > 4) 4103 return false; 4104 4105 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4106 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4107 4108 // The raw dword aligned data component of the load. The only legal cases 4109 // where this matters should be when using the packed D16 format, for 4110 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4111 LLT RoundedTy; 4112 4113 // S32 vector to to cover all data, plus TFE result element. 4114 LLT TFETy; 4115 4116 // Register type to use for each loaded component. Will be S32 or V2S16. 4117 LLT RegTy; 4118 4119 if (IsD16 && ST.hasUnpackedD16VMem()) { 4120 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4121 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4122 RegTy = S32; 4123 } else { 4124 unsigned EltSize = EltTy.getSizeInBits(); 4125 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4126 unsigned RoundedSize = 32 * RoundedElts; 4127 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4128 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4129 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4130 } 4131 4132 // The return type does not need adjustment. 4133 // TODO: Should we change s16 case to s32 or <2 x s16>? 4134 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4135 return true; 4136 4137 Register Dst1Reg; 4138 4139 // Insert after the instruction. 4140 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4141 4142 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4143 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4144 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4145 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4146 4147 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4148 4149 MI.getOperand(0).setReg(NewResultReg); 4150 4151 // In the IR, TFE is supposed to be used with a 2 element struct return 4152 // type. The intruction really returns these two values in one contiguous 4153 // register, with one additional dword beyond the loaded data. Rewrite the 4154 // return type to use a single register result. 4155 4156 if (IsTFE) { 4157 Dst1Reg = MI.getOperand(1).getReg(); 4158 if (MRI->getType(Dst1Reg) != S32) 4159 return false; 4160 4161 // TODO: Make sure the TFE operand bit is set. 4162 MI.RemoveOperand(1); 4163 4164 // Handle the easy case that requires no repack instructions. 4165 if (Ty == S32) { 4166 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4167 return true; 4168 } 4169 } 4170 4171 // Now figure out how to copy the new result register back into the old 4172 // result. 4173 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4174 4175 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4176 4177 if (ResultNumRegs == 1) { 4178 assert(!IsTFE); 4179 ResultRegs[0] = NewResultReg; 4180 } else { 4181 // We have to repack into a new vector of some kind. 4182 for (int I = 0; I != NumDataRegs; ++I) 4183 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4184 B.buildUnmerge(ResultRegs, NewResultReg); 4185 4186 // Drop the final TFE element to get the data part. The TFE result is 4187 // directly written to the right place already. 4188 if (IsTFE) 4189 ResultRegs.resize(NumDataRegs); 4190 } 4191 4192 // For an s16 scalar result, we form an s32 result with a truncate regardless 4193 // of packed vs. unpacked. 4194 if (IsD16 && !Ty.isVector()) { 4195 B.buildTrunc(DstReg, ResultRegs[0]); 4196 return true; 4197 } 4198 4199 // Avoid a build/concat_vector of 1 entry. 4200 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4201 B.buildBitcast(DstReg, ResultRegs[0]); 4202 return true; 4203 } 4204 4205 assert(Ty.isVector()); 4206 4207 if (IsD16) { 4208 // For packed D16 results with TFE enabled, all the data components are 4209 // S32. Cast back to the expected type. 4210 // 4211 // TODO: We don't really need to use load s32 elements. We would only need one 4212 // cast for the TFE result if a multiple of v2s16 was used. 4213 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4214 for (Register &Reg : ResultRegs) 4215 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4216 } else if (ST.hasUnpackedD16VMem()) { 4217 for (Register &Reg : ResultRegs) 4218 Reg = B.buildTrunc(S16, Reg).getReg(0); 4219 } 4220 } 4221 4222 auto padWithUndef = [&](LLT Ty, int NumElts) { 4223 if (NumElts == 0) 4224 return; 4225 Register Undef = B.buildUndef(Ty).getReg(0); 4226 for (int I = 0; I != NumElts; ++I) 4227 ResultRegs.push_back(Undef); 4228 }; 4229 4230 // Pad out any elements eliminated due to the dmask. 4231 LLT ResTy = MRI->getType(ResultRegs[0]); 4232 if (!ResTy.isVector()) { 4233 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4234 B.buildBuildVector(DstReg, ResultRegs); 4235 return true; 4236 } 4237 4238 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4239 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4240 4241 // Deal with the one annoying legal case. 4242 const LLT V3S16 = LLT::vector(3, 16); 4243 if (Ty == V3S16) { 4244 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4245 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4246 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4247 return true; 4248 } 4249 4250 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4251 B.buildConcatVectors(DstReg, ResultRegs); 4252 return true; 4253 } 4254 4255 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4256 LegalizerHelper &Helper, MachineInstr &MI) const { 4257 MachineIRBuilder &B = Helper.MIRBuilder; 4258 GISelChangeObserver &Observer = Helper.Observer; 4259 4260 Register Dst = MI.getOperand(0).getReg(); 4261 LLT Ty = B.getMRI()->getType(Dst); 4262 unsigned Size = Ty.getSizeInBits(); 4263 MachineFunction &MF = B.getMF(); 4264 4265 Observer.changingInstr(MI); 4266 4267 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4268 Ty = getBitcastRegisterType(Ty); 4269 Helper.bitcastDst(MI, Ty, 0); 4270 Dst = MI.getOperand(0).getReg(); 4271 B.setInsertPt(B.getMBB(), MI); 4272 } 4273 4274 // FIXME: We don't really need this intermediate instruction. The intrinsic 4275 // should be fixed to have a memory operand. Since it's readnone, we're not 4276 // allowed to add one. 4277 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4278 MI.RemoveOperand(1); // Remove intrinsic ID 4279 4280 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4281 // TODO: Should this use datalayout alignment? 4282 const unsigned MemSize = (Size + 7) / 8; 4283 const Align MemAlign(4); 4284 MachineMemOperand *MMO = MF.getMachineMemOperand( 4285 MachinePointerInfo(), 4286 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4287 MachineMemOperand::MOInvariant, 4288 MemSize, MemAlign); 4289 MI.addMemOperand(MF, MMO); 4290 4291 // There are no 96-bit result scalar loads, but widening to 128-bit should 4292 // always be legal. We may need to restore this to a 96-bit result if it turns 4293 // out this needs to be converted to a vector load during RegBankSelect. 4294 if (!isPowerOf2_32(Size)) { 4295 if (Ty.isVector()) 4296 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4297 else 4298 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4299 } 4300 4301 Observer.changedInstr(MI); 4302 return true; 4303 } 4304 4305 // TODO: Move to selection 4306 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4307 MachineRegisterInfo &MRI, 4308 MachineIRBuilder &B) const { 4309 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4310 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4311 !ST.isTrapHandlerEnabled()) { 4312 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4313 } else { 4314 // Pass queue pointer to trap handler as input, and insert trap instruction 4315 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4316 MachineRegisterInfo &MRI = *B.getMRI(); 4317 4318 Register LiveIn = 4319 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4320 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4321 return false; 4322 4323 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4324 B.buildCopy(SGPR01, LiveIn); 4325 B.buildInstr(AMDGPU::S_TRAP) 4326 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4327 .addReg(SGPR01, RegState::Implicit); 4328 } 4329 4330 MI.eraseFromParent(); 4331 return true; 4332 } 4333 4334 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4335 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4336 // Is non-HSA path or trap-handler disabled? then, report a warning 4337 // accordingly 4338 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4339 !ST.isTrapHandlerEnabled()) { 4340 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4341 "debugtrap handler not supported", 4342 MI.getDebugLoc(), DS_Warning); 4343 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4344 Ctx.diagnose(NoTrap); 4345 } else { 4346 // Insert debug-trap instruction 4347 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4348 } 4349 4350 MI.eraseFromParent(); 4351 return true; 4352 } 4353 4354 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4355 MachineInstr &MI) const { 4356 MachineIRBuilder &B = Helper.MIRBuilder; 4357 MachineRegisterInfo &MRI = *B.getMRI(); 4358 4359 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4360 auto IntrID = MI.getIntrinsicID(); 4361 switch (IntrID) { 4362 case Intrinsic::amdgcn_if: 4363 case Intrinsic::amdgcn_else: { 4364 MachineInstr *Br = nullptr; 4365 MachineBasicBlock *UncondBrTarget = nullptr; 4366 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4367 const SIRegisterInfo *TRI 4368 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4369 4370 Register Def = MI.getOperand(1).getReg(); 4371 Register Use = MI.getOperand(3).getReg(); 4372 4373 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4374 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4375 if (IntrID == Intrinsic::amdgcn_if) { 4376 B.buildInstr(AMDGPU::SI_IF) 4377 .addDef(Def) 4378 .addUse(Use) 4379 .addMBB(UncondBrTarget); 4380 } else { 4381 B.buildInstr(AMDGPU::SI_ELSE) 4382 .addDef(Def) 4383 .addUse(Use) 4384 .addMBB(UncondBrTarget) 4385 .addImm(0); 4386 } 4387 4388 if (Br) { 4389 Br->getOperand(0).setMBB(CondBrTarget); 4390 } else { 4391 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4392 // since we're swapping branch targets it needs to be reinserted. 4393 // FIXME: IRTranslator should probably not do this 4394 B.buildBr(*CondBrTarget); 4395 } 4396 4397 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4398 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4399 MI.eraseFromParent(); 4400 BrCond->eraseFromParent(); 4401 return true; 4402 } 4403 4404 return false; 4405 } 4406 case Intrinsic::amdgcn_loop: { 4407 MachineInstr *Br = nullptr; 4408 MachineBasicBlock *UncondBrTarget = nullptr; 4409 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4410 const SIRegisterInfo *TRI 4411 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4412 4413 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4414 Register Reg = MI.getOperand(2).getReg(); 4415 4416 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4417 B.buildInstr(AMDGPU::SI_LOOP) 4418 .addUse(Reg) 4419 .addMBB(UncondBrTarget); 4420 4421 if (Br) 4422 Br->getOperand(0).setMBB(CondBrTarget); 4423 else 4424 B.buildBr(*CondBrTarget); 4425 4426 MI.eraseFromParent(); 4427 BrCond->eraseFromParent(); 4428 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4429 return true; 4430 } 4431 4432 return false; 4433 } 4434 case Intrinsic::amdgcn_kernarg_segment_ptr: 4435 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4436 // This only makes sense to call in a kernel, so just lower to null. 4437 B.buildConstant(MI.getOperand(0).getReg(), 0); 4438 MI.eraseFromParent(); 4439 return true; 4440 } 4441 4442 return legalizePreloadedArgIntrin( 4443 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4444 case Intrinsic::amdgcn_implicitarg_ptr: 4445 return legalizeImplicitArgPtr(MI, MRI, B); 4446 case Intrinsic::amdgcn_workitem_id_x: 4447 return legalizePreloadedArgIntrin(MI, MRI, B, 4448 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4449 case Intrinsic::amdgcn_workitem_id_y: 4450 return legalizePreloadedArgIntrin(MI, MRI, B, 4451 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4452 case Intrinsic::amdgcn_workitem_id_z: 4453 return legalizePreloadedArgIntrin(MI, MRI, B, 4454 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4455 case Intrinsic::amdgcn_workgroup_id_x: 4456 return legalizePreloadedArgIntrin(MI, MRI, B, 4457 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4458 case Intrinsic::amdgcn_workgroup_id_y: 4459 return legalizePreloadedArgIntrin(MI, MRI, B, 4460 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4461 case Intrinsic::amdgcn_workgroup_id_z: 4462 return legalizePreloadedArgIntrin(MI, MRI, B, 4463 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4464 case Intrinsic::amdgcn_dispatch_ptr: 4465 return legalizePreloadedArgIntrin(MI, MRI, B, 4466 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4467 case Intrinsic::amdgcn_queue_ptr: 4468 return legalizePreloadedArgIntrin(MI, MRI, B, 4469 AMDGPUFunctionArgInfo::QUEUE_PTR); 4470 case Intrinsic::amdgcn_implicit_buffer_ptr: 4471 return legalizePreloadedArgIntrin( 4472 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4473 case Intrinsic::amdgcn_dispatch_id: 4474 return legalizePreloadedArgIntrin(MI, MRI, B, 4475 AMDGPUFunctionArgInfo::DISPATCH_ID); 4476 case Intrinsic::amdgcn_fdiv_fast: 4477 return legalizeFDIVFastIntrin(MI, MRI, B); 4478 case Intrinsic::amdgcn_is_shared: 4479 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4480 case Intrinsic::amdgcn_is_private: 4481 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4482 case Intrinsic::amdgcn_wavefrontsize: { 4483 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4484 MI.eraseFromParent(); 4485 return true; 4486 } 4487 case Intrinsic::amdgcn_s_buffer_load: 4488 return legalizeSBufferLoad(Helper, MI); 4489 case Intrinsic::amdgcn_raw_buffer_store: 4490 case Intrinsic::amdgcn_struct_buffer_store: 4491 return legalizeBufferStore(MI, MRI, B, false, false); 4492 case Intrinsic::amdgcn_raw_buffer_store_format: 4493 case Intrinsic::amdgcn_struct_buffer_store_format: 4494 return legalizeBufferStore(MI, MRI, B, false, true); 4495 case Intrinsic::amdgcn_raw_tbuffer_store: 4496 case Intrinsic::amdgcn_struct_tbuffer_store: 4497 return legalizeBufferStore(MI, MRI, B, true, true); 4498 case Intrinsic::amdgcn_raw_buffer_load: 4499 case Intrinsic::amdgcn_struct_buffer_load: 4500 return legalizeBufferLoad(MI, MRI, B, false, false); 4501 case Intrinsic::amdgcn_raw_buffer_load_format: 4502 case Intrinsic::amdgcn_struct_buffer_load_format: 4503 return legalizeBufferLoad(MI, MRI, B, true, false); 4504 case Intrinsic::amdgcn_raw_tbuffer_load: 4505 case Intrinsic::amdgcn_struct_tbuffer_load: 4506 return legalizeBufferLoad(MI, MRI, B, true, true); 4507 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4508 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4509 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4510 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4511 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4512 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4513 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4514 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4515 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4516 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4517 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4518 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4519 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4520 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4521 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4522 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4523 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4524 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4525 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4526 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4527 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4528 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4529 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4530 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4531 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4532 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4533 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4534 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4535 return legalizeBufferAtomic(MI, B, IntrID); 4536 case Intrinsic::amdgcn_atomic_inc: 4537 return legalizeAtomicIncDec(MI, B, true); 4538 case Intrinsic::amdgcn_atomic_dec: 4539 return legalizeAtomicIncDec(MI, B, false); 4540 case Intrinsic::trap: 4541 return legalizeTrapIntrinsic(MI, MRI, B); 4542 case Intrinsic::debugtrap: 4543 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4544 case Intrinsic::amdgcn_rsq_clamp: 4545 return legalizeRsqClampIntrinsic(MI, MRI, B); 4546 case Intrinsic::amdgcn_ds_fadd: 4547 case Intrinsic::amdgcn_ds_fmin: 4548 case Intrinsic::amdgcn_ds_fmax: 4549 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4550 default: { 4551 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4552 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4553 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4554 return true; 4555 } 4556 } 4557 4558 return true; 4559 } 4560