//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size < 32) {
      // <2 x s8> -> s16
      assert(Size == 16);
      CoercedTy = LLT::scalar(16);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
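// As a rough illustration (not an exhaustive list): an s96 or <6 x s16> load
// currently falls into this category, and the load/store rules below end up
// bitcasting such a result to <3 x s32> (see bitcastToRegisterType) instead of
// asking the selector to handle the odd type directly.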
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
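  // (Each lane may request a different amount, and only the wave-wide maximum
  // can actually be used to bump the stack pointer, hence the reduction.)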
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
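  // Note that a scalar G_TRUNC of any width is accepted below: truncation only
  // reads the low bits of the source register, so it can usually be resolved
  // with at most a copy during selection.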
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1},
                              ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
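      // (e.g. an s48 bswap is first rounded up to s64 here and only then
      // narrowed to 32-bit pieces by the maxScalar clamp below.)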
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];

        // Do not cast an extload/truncstore.
        if (Ty.getSizeInBits() != Query.MMODescrs[0].SizeInBits)
          return false;

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        const unsigned Size = Ty.getSizeInBits();
        return Ty.isVector() && isRegisterSize(Size) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
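    // Whatever survives to this point is padded out: odd-sized scalars are
    // rounded up to the next power of 2, and sub-32-bit vectors gain elements
    // until they fill a full 32-bit register.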
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
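      // i.e. keep the operation as-is whenever the big type is a multiple of
      // 32 bits and the small type a multiple of 16 bits.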
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
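    // For example, a G_SEXT_INREG on s16 is widened to s32 first and then
    // lowered as a 32-bit shl/ashr pair, instead of emitting the shifts at s16
    // and extending the result afterwards.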
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
1711 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1712 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1713 1714 auto C2 = B.buildFConstant(Ty, C2Val); 1715 auto Fabs = B.buildFAbs(Ty, Src); 1716 1717 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1718 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1719 return true; 1720 } 1721 1722 bool AMDGPULegalizerInfo::legalizeFceil( 1723 MachineInstr &MI, MachineRegisterInfo &MRI, 1724 MachineIRBuilder &B) const { 1725 1726 const LLT S1 = LLT::scalar(1); 1727 const LLT S64 = LLT::scalar(64); 1728 1729 Register Src = MI.getOperand(1).getReg(); 1730 assert(MRI.getType(Src) == S64); 1731 1732 // result = trunc(src) 1733 // if (src > 0.0 && src != result) 1734 // result += 1.0 1735 1736 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1737 1738 const auto Zero = B.buildFConstant(S64, 0.0); 1739 const auto One = B.buildFConstant(S64, 1.0); 1740 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1741 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1742 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1743 auto Add = B.buildSelect(S64, And, One, Zero); 1744 1745 // TODO: Should this propagate fast-math-flags? 1746 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1747 return true; 1748 } 1749 1750 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1751 MachineIRBuilder &B) { 1752 const unsigned FractBits = 52; 1753 const unsigned ExpBits = 11; 1754 LLT S32 = LLT::scalar(32); 1755 1756 auto Const0 = B.buildConstant(S32, FractBits - 32); 1757 auto Const1 = B.buildConstant(S32, ExpBits); 1758 1759 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) .addUse(Register(Hi)) 1760 .addUse(Const0.getReg(0)) 1761 .addUse(Const1.getReg(0)); 1762 1763 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1764 } 1765 1766 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1767 MachineInstr &MI, MachineRegisterInfo &MRI, 1768 MachineIRBuilder &B) const { 1769 const LLT S1 = LLT::scalar(1); 1770 const LLT S32 = LLT::scalar(32); 1771 const LLT S64 = LLT::scalar(64); 1772 1773 Register Src = MI.getOperand(1).getReg(); 1774 assert(MRI.getType(Src) == S64); 1775 1776 // TODO: Should this use extract since the low half is unused? 1777 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1778 Register Hi = Unmerge.getReg(1); 1779 1780 // Extract the upper half, since this is where we will find the sign and 1781 // exponent. 1782 auto Exp = extractF64Exponent(Hi, B); 1783 1784 const unsigned FractBits = 52; 1785 1786 // Extract the sign bit. 1787 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1788 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1789 1790 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1791 1792 const auto Zero32 = B.buildConstant(S32, 0); 1793 1794 // Extend back to 64-bits.
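// (G_MERGE_VALUES takes its source operands low part first, so the zero lands in the low 32 bits and the extracted sign bit in the high 32 bits.)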
1795 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1796 1797 auto Shr = B.buildAShr(S64, FractMask, Exp); 1798 auto Not = B.buildNot(S64, Shr); 1799 auto Tmp0 = B.buildAnd(S64, Src, Not); 1800 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1801 1802 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1803 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1804 1805 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1806 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1807 return true; 1808 } 1809 1810 bool AMDGPULegalizerInfo::legalizeITOFP( 1811 MachineInstr &MI, MachineRegisterInfo &MRI, 1812 MachineIRBuilder &B, bool Signed) const { 1813 1814 Register Dst = MI.getOperand(0).getReg(); 1815 Register Src = MI.getOperand(1).getReg(); 1816 1817 const LLT S64 = LLT::scalar(64); 1818 const LLT S32 = LLT::scalar(32); 1819 1820 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1821 1822 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1823 1824 auto CvtHi = Signed ? 1825 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1826 B.buildUITOFP(S64, Unmerge.getReg(1)); 1827 1828 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1829 1830 auto ThirtyTwo = B.buildConstant(S32, 32); 1831 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1832 .addUse(CvtHi.getReg(0)) 1833 .addUse(ThirtyTwo.getReg(0)); 1834 1835 // TODO: Should this propagate fast-math-flags? 1836 B.buildFAdd(Dst, LdExp, CvtLo); 1837 MI.eraseFromParent(); 1838 return true; 1839 } 1840 1841 // TODO: Copied from DAG implementation. Verify logic and document how this 1842 // actually works. 1843 bool AMDGPULegalizerInfo::legalizeFPTOI( 1844 MachineInstr &MI, MachineRegisterInfo &MRI, 1845 MachineIRBuilder &B, bool Signed) const { 1846 1847 Register Dst = MI.getOperand(0).getReg(); 1848 Register Src = MI.getOperand(1).getReg(); 1849 1850 const LLT S64 = LLT::scalar(64); 1851 const LLT S32 = LLT::scalar(32); 1852 1853 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1854 1855 unsigned Flags = MI.getFlags(); 1856 1857 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1858 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1859 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1860 1861 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1862 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1863 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1864 1865 auto Hi = Signed ? 
1866 B.buildFPTOSI(S32, FloorMul) : 1867 B.buildFPTOUI(S32, FloorMul); 1868 auto Lo = B.buildFPTOUI(S32, Fma); 1869 1870 B.buildMerge(Dst, { Lo, Hi }); 1871 MI.eraseFromParent(); 1872 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1877 MachineInstr &MI) const { 1878 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1879 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1880 1881 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1882 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1883 1884 // With ieee_mode disabled, the instructions have the correct behavior 1885 // already for G_FMINNUM/G_FMAXNUM 1886 if (!MFI->getMode().IEEE) 1887 return !IsIEEEOp; 1888 1889 if (IsIEEEOp) 1890 return true; 1891 1892 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1893 } 1894 1895 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1896 MachineInstr &MI, MachineRegisterInfo &MRI, 1897 MachineIRBuilder &B) const { 1898 // TODO: Should move some of this into LegalizerHelper. 1899 1900 // TODO: Promote dynamic indexing of s16 to s32 1901 1902 // FIXME: Artifact combiner probably should have replaced the truncated 1903 // constant before this, so we shouldn't need 1904 // getConstantVRegValWithLookThrough. 1905 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1906 MI.getOperand(2).getReg(), MRI); 1907 if (!IdxVal) // Dynamic case will be selected to register indexing. 1908 return true; 1909 1910 Register Dst = MI.getOperand(0).getReg(); 1911 Register Vec = MI.getOperand(1).getReg(); 1912 1913 LLT VecTy = MRI.getType(Vec); 1914 LLT EltTy = VecTy.getElementType(); 1915 assert(EltTy == MRI.getType(Dst)); 1916 1917 if (IdxVal->Value < VecTy.getNumElements()) 1918 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1919 else 1920 B.buildUndef(Dst); 1921 1922 MI.eraseFromParent(); 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1927 MachineInstr &MI, MachineRegisterInfo &MRI, 1928 MachineIRBuilder &B) const { 1929 // TODO: Should move some of this into LegalizerHelper. 1930 1931 // TODO: Promote dynamic indexing of s16 to s32 1932 1933 // FIXME: Artifact combiner probably should have replaced the truncated 1934 // constant before this, so we shouldn't need 1935 // getConstantVRegValWithLookThrough. 1936 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1937 MI.getOperand(3).getReg(), MRI); 1938 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1939 return true; 1940 1941 Register Dst = MI.getOperand(0).getReg(); 1942 Register Vec = MI.getOperand(1).getReg(); 1943 Register Ins = MI.getOperand(2).getReg(); 1944 1945 LLT VecTy = MRI.getType(Vec); 1946 LLT EltTy = VecTy.getElementType(); 1947 assert(EltTy == MRI.getType(Ins)); 1948 1949 if (IdxVal->Value < VecTy.getNumElements()) 1950 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1951 else 1952 B.buildUndef(Dst); 1953 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1959 MachineInstr &MI, MachineRegisterInfo &MRI, 1960 MachineIRBuilder &B) const { 1961 const LLT V2S16 = LLT::vector(2, 16); 1962 1963 Register Dst = MI.getOperand(0).getReg(); 1964 Register Src0 = MI.getOperand(1).getReg(); 1965 LLT DstTy = MRI.getType(Dst); 1966 LLT SrcTy = MRI.getType(Src0); 1967 1968 if (SrcTy == V2S16 && DstTy == V2S16 && 1969 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1970 return true; 1971 1972 MachineIRBuilder HelperBuilder(MI); 1973 GISelObserverWrapper DummyObserver; 1974 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1975 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1976 } 1977 1978 bool AMDGPULegalizerInfo::legalizeSinCos( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, 1980 MachineIRBuilder &B) const { 1981 1982 Register DstReg = MI.getOperand(0).getReg(); 1983 Register SrcReg = MI.getOperand(1).getReg(); 1984 LLT Ty = MRI.getType(DstReg); 1985 unsigned Flags = MI.getFlags(); 1986 1987 Register TrigVal; 1988 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1989 if (ST.hasTrigReducedRange()) { 1990 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1991 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1992 .addUse(MulVal.getReg(0)) 1993 .setMIFlags(Flags).getReg(0); 1994 } else 1995 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1996 1997 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1998 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1999 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2000 .addUse(TrigVal) 2001 .setMIFlags(Flags); 2002 MI.eraseFromParent(); 2003 return true; 2004 } 2005 2006 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2007 MachineIRBuilder &B, 2008 const GlobalValue *GV, 2009 int64_t Offset, 2010 unsigned GAFlags) const { 2011 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2012 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2013 // to the following code sequence: 2014 // 2015 // For constant address space: 2016 // s_getpc_b64 s[0:1] 2017 // s_add_u32 s0, s0, $symbol 2018 // s_addc_u32 s1, s1, 0 2019 // 2020 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2021 // a fixup or relocation is emitted to replace $symbol with a literal 2022 // constant, which is a pc-relative offset from the encoding of the $symbol 2023 // operand to the global variable. 
2024 // 2025 // For global address space: 2026 // s_getpc_b64 s[0:1] 2027 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2028 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2029 // 2030 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2031 // fixups or relocations are emitted to replace $symbol@*@lo and 2032 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2033 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2034 // operand to the global variable. 2035 // 2036 // What we want here is an offset from the value returned by s_getpc 2037 // (which is the address of the s_add_u32 instruction) to the global 2038 // variable, but since the encoding of $symbol starts 4 bytes after the start 2039 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2040 // small. This requires us to add 4 to the global variable offset in order to 2041 // compute the correct address. 2042 2043 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2044 2045 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2046 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2047 2048 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2049 .addDef(PCReg); 2050 2051 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2052 if (GAFlags == SIInstrInfo::MO_NONE) 2053 MIB.addImm(0); 2054 else 2055 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2056 2057 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2058 2059 if (PtrTy.getSizeInBits() == 32) 2060 B.buildExtract(DstReg, PCReg, 0); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2065 MachineInstr &MI, MachineRegisterInfo &MRI, 2066 MachineIRBuilder &B) const { 2067 Register DstReg = MI.getOperand(0).getReg(); 2068 LLT Ty = MRI.getType(DstReg); 2069 unsigned AS = Ty.getAddressSpace(); 2070 2071 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2072 MachineFunction &MF = B.getMF(); 2073 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2074 2075 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2076 if (!MFI->isEntryFunction()) { 2077 const Function &Fn = MF.getFunction(); 2078 DiagnosticInfoUnsupported BadLDSDecl( 2079 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2080 DS_Warning); 2081 Fn.getContext().diagnose(BadLDSDecl); 2082 2083 // We currently don't have a way to correctly allocate LDS objects that 2084 // aren't directly associated with a kernel. We do force inlining of 2085 // functions that use local objects. However, if these dead functions are 2086 // not eliminated, we don't want a compile time error. Just emit a warning 2087 // and a trap, since there should be no callable path here. 2088 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2089 B.buildUndef(DstReg); 2090 MI.eraseFromParent(); 2091 return true; 2092 } 2093 2094 // TODO: We could emit code to handle the initialization somewhere. 
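// LDS globals without a defined initializer are lowered below to a constant byte offset into the kernel's LDS block (via allocateLDSGlobal); a real initializer is diagnosed as unsupported.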
2095 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2096 const SITargetLowering *TLI = ST.getTargetLowering(); 2097 if (!TLI->shouldUseLDSConstAddress(GV)) { 2098 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2099 return true; // Leave in place; 2100 } 2101 2102 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 2103 MI.eraseFromParent(); 2104 return true; 2105 } 2106 2107 const Function &Fn = MF.getFunction(); 2108 DiagnosticInfoUnsupported BadInit( 2109 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2110 Fn.getContext().diagnose(BadInit); 2111 return true; 2112 } 2113 2114 const SITargetLowering *TLI = ST.getTargetLowering(); 2115 2116 if (TLI->shouldEmitFixup(GV)) { 2117 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2118 MI.eraseFromParent(); 2119 return true; 2120 } 2121 2122 if (TLI->shouldEmitPCReloc(GV)) { 2123 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2124 MI.eraseFromParent(); 2125 return true; 2126 } 2127 2128 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2129 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2130 2131 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2132 MachinePointerInfo::getGOT(MF), 2133 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2134 MachineMemOperand::MOInvariant, 2135 8 /*Size*/, Align(8)); 2136 2137 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2138 2139 if (Ty.getSizeInBits() == 32) { 2140 // Truncate if this is a 32-bit constant address. 2141 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2142 B.buildExtract(DstReg, Load, 0); 2143 } else 2144 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2145 2146 MI.eraseFromParent(); 2147 return true; 2148 } 2149 2150 bool AMDGPULegalizerInfo::legalizeLoad( 2151 MachineInstr &MI, MachineRegisterInfo &MRI, 2152 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2153 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2154 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2155 Observer.changingInstr(MI); 2156 MI.getOperand(1).setReg(Cast.getReg(0)); 2157 Observer.changedInstr(MI); 2158 return true; 2159 } 2160 2161 bool AMDGPULegalizerInfo::legalizeFMad( 2162 MachineInstr &MI, MachineRegisterInfo &MRI, 2163 MachineIRBuilder &B) const { 2164 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2165 assert(Ty.isScalar()); 2166 2167 MachineFunction &MF = B.getMF(); 2168 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2169 2170 // TODO: Always legal with future ftz flag. 2171 // FIXME: Do we need just output?
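// G_FMAD stays legal only when denormals are flushed for the type; otherwise it is expanded to fmul + fadd via LegalizerHelper::lowerFMad below.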
2172 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2173 return true; 2174 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2175 return true; 2176 2177 MachineIRBuilder HelperBuilder(MI); 2178 GISelObserverWrapper DummyObserver; 2179 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2180 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2181 } 2182 2183 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2184 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2185 Register DstReg = MI.getOperand(0).getReg(); 2186 Register PtrReg = MI.getOperand(1).getReg(); 2187 Register CmpVal = MI.getOperand(2).getReg(); 2188 Register NewVal = MI.getOperand(3).getReg(); 2189 2190 assert(SITargetLowering::isFlatGlobalAddrSpace( 2191 MRI.getType(PtrReg).getAddressSpace()) && 2192 "this should not have been custom lowered"); 2193 2194 LLT ValTy = MRI.getType(CmpVal); 2195 LLT VecTy = LLT::vector(2, ValTy); 2196 2197 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2198 2199 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2200 .addDef(DstReg) 2201 .addUse(PtrReg) 2202 .addUse(PackedVal) 2203 .setMemRefs(MI.memoperands()); 2204 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeFlog( 2210 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2211 Register Dst = MI.getOperand(0).getReg(); 2212 Register Src = MI.getOperand(1).getReg(); 2213 LLT Ty = B.getMRI()->getType(Dst); 2214 unsigned Flags = MI.getFlags(); 2215 2216 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2217 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2218 2219 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2220 MI.eraseFromParent(); 2221 return true; 2222 } 2223 2224 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2225 MachineIRBuilder &B) const { 2226 Register Dst = MI.getOperand(0).getReg(); 2227 Register Src = MI.getOperand(1).getReg(); 2228 unsigned Flags = MI.getFlags(); 2229 LLT Ty = B.getMRI()->getType(Dst); 2230 2231 auto K = B.buildFConstant(Ty, numbers::log2e); 2232 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2233 B.buildFExp2(Dst, Mul, Flags); 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2239 MachineIRBuilder &B) const { 2240 Register Dst = MI.getOperand(0).getReg(); 2241 Register Src0 = MI.getOperand(1).getReg(); 2242 Register Src1 = MI.getOperand(2).getReg(); 2243 unsigned Flags = MI.getFlags(); 2244 LLT Ty = B.getMRI()->getType(Dst); 2245 const LLT S16 = LLT::scalar(16); 2246 const LLT S32 = LLT::scalar(32); 2247 2248 if (Ty == S32) { 2249 auto Log = B.buildFLog2(S32, Src0, Flags); 2250 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2251 .addUse(Log.getReg(0)) 2252 .addUse(Src1) 2253 .setMIFlags(Flags); 2254 B.buildFExp2(Dst, Mul, Flags); 2255 } else if (Ty == S16) { 2256 // There's no f16 fmul_legacy, so we need to convert for it. 
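// Extend the f16 operands to f32 for the fmul_legacy intrinsic, then truncate back to f16 before feeding the exp2.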
2257 auto Log = B.buildFLog2(S16, Src0, Flags); 2258 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2259 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2260 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2261 .addUse(Ext0.getReg(0)) 2262 .addUse(Ext1.getReg(0)) 2263 .setMIFlags(Flags); 2264 2265 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2266 } else 2267 return false; 2268 2269 MI.eraseFromParent(); 2270 return true; 2271 } 2272 2273 // Find a source register, ignoring any possible source modifiers. 2274 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2275 Register ModSrc = OrigSrc; 2276 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2277 ModSrc = SrcFNeg->getOperand(1).getReg(); 2278 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2279 ModSrc = SrcFAbs->getOperand(1).getReg(); 2280 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2281 ModSrc = SrcFAbs->getOperand(1).getReg(); 2282 return ModSrc; 2283 } 2284 2285 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2286 MachineRegisterInfo &MRI, 2287 MachineIRBuilder &B) const { 2288 2289 const LLT S1 = LLT::scalar(1); 2290 const LLT S64 = LLT::scalar(64); 2291 Register Dst = MI.getOperand(0).getReg(); 2292 Register OrigSrc = MI.getOperand(1).getReg(); 2293 unsigned Flags = MI.getFlags(); 2294 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2295 "this should not have been custom lowered"); 2296 2297 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2298 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2299 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2300 // V_FRACT bug is: 2301 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2302 // 2303 // Convert floor(x) to (x - fract(x)) 2304 2305 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2306 .addUse(OrigSrc) 2307 .setMIFlags(Flags); 2308 2309 // Give source modifier matching some assistance before obscuring a foldable 2310 // pattern. 2311 2312 // TODO: We can avoid the neg on the fract? The input sign to fract 2313 // shouldn't matter? 2314 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2315 2316 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2317 2318 Register Min = MRI.createGenericVirtualRegister(S64); 2319 2320 // We don't need to concern ourselves with the snan handling difference, so 2321 // use the one which will directly select. 2322 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2323 if (MFI->getMode().IEEE) 2324 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2325 else 2326 B.buildFMinNum(Min, Fract, Const, Flags); 2327 2328 Register CorrectedFract = Min; 2329 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2330 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2331 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2332 } 2333 2334 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2335 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2336 2337 MI.eraseFromParent(); 2338 return true; 2339 } 2340 2341 // Turn an illegal packed v2s16 build vector into bit operations. 2342 // TODO: This should probably be a bitcast action in LegalizerHelper. 
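// Roughly, the emitted MIR is:
//   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge(s32)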
2343 bool AMDGPULegalizerInfo::legalizeBuildVector( 2344 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2345 Register Dst = MI.getOperand(0).getReg(); 2346 const LLT S32 = LLT::scalar(32); 2347 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2348 2349 Register Src0 = MI.getOperand(1).getReg(); 2350 Register Src1 = MI.getOperand(2).getReg(); 2351 assert(MRI.getType(Src0) == LLT::scalar(16)); 2352 2353 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2354 B.buildBitcast(Dst, Merge); 2355 2356 MI.eraseFromParent(); 2357 return true; 2358 } 2359 2360 // Return the use branch instruction, otherwise null if the usage is invalid. 2361 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2362 MachineRegisterInfo &MRI, 2363 MachineInstr *&Br, 2364 MachineBasicBlock *&UncondBrTarget) { 2365 Register CondDef = MI.getOperand(0).getReg(); 2366 if (!MRI.hasOneNonDBGUse(CondDef)) 2367 return nullptr; 2368 2369 MachineBasicBlock *Parent = MI.getParent(); 2370 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2371 if (UseMI.getParent() != Parent || 2372 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2373 return nullptr; 2374 2375 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2376 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2377 if (Next == Parent->end()) { 2378 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2379 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2380 return nullptr; 2381 UncondBrTarget = &*NextMBB; 2382 } else { 2383 if (Next->getOpcode() != AMDGPU::G_BR) 2384 return nullptr; 2385 Br = &*Next; 2386 UncondBrTarget = Br->getOperand(0).getMBB(); 2387 } 2388 2389 return &UseMI; 2390 } 2391 2392 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2393 MachineRegisterInfo &MRI, 2394 Register LiveIn, 2395 Register PhyReg) const { 2396 assert(PhyReg.isPhysical() && "Physical register expected"); 2397 2398 // Insert the live-in copy, if required, by defining destination virtual 2399 // register. 2400 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
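// Only materialize the copy once: if the live-in virtual register already has a def, an earlier query set up the entry-block copy.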
2401 if (!MRI.getVRegDef(LiveIn)) { 2402 // FIXME: Should have scoped insert pt 2403 MachineBasicBlock &OrigInsBB = B.getMBB(); 2404 auto OrigInsPt = B.getInsertPt(); 2405 2406 MachineBasicBlock &EntryMBB = B.getMF().front(); 2407 EntryMBB.addLiveIn(PhyReg); 2408 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2409 B.buildCopy(LiveIn, PhyReg); 2410 2411 B.setInsertPt(OrigInsBB, OrigInsPt); 2412 } 2413 2414 return LiveIn; 2415 } 2416 2417 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2418 MachineRegisterInfo &MRI, 2419 Register PhyReg, LLT Ty, 2420 bool InsertLiveInCopy) const { 2421 assert(PhyReg.isPhysical() && "Physical register expected"); 2422 2423 // Get or create the virtual live-in register 2424 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2425 if (!LiveIn) { 2426 LiveIn = MRI.createGenericVirtualRegister(Ty); 2427 MRI.addLiveIn(PhyReg, LiveIn); 2428 } 2429 2430 // When the copy that is eventually required goes from the virtual register 2431 // to a physical register (and is inserted later), the live-in copy from the 2432 // physical register to the virtual register is not needed here. 2433 if (!InsertLiveInCopy) 2434 return LiveIn; 2435 2436 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2437 } 2438 2439 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2440 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2441 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2442 const ArgDescriptor *Arg; 2443 const TargetRegisterClass *RC; 2444 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2445 if (!Arg) { 2446 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2447 return nullptr; 2448 } 2449 return Arg; 2450 } 2451 2452 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2453 const ArgDescriptor *Arg) const { 2454 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2455 return false; // TODO: Handle these 2456 2457 Register SrcReg = Arg->getRegister(); 2458 assert(SrcReg.isPhysical() && "Physical register expected"); 2459 assert(DstReg.isVirtual() && "Virtual register expected"); 2460 2461 MachineRegisterInfo &MRI = *B.getMRI(); 2462 2463 LLT Ty = MRI.getType(DstReg); 2464 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2465 2466 if (Arg->isMasked()) { 2467 // TODO: Should we try to emit this once in the entry block?
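// Masked arguments pack a value into part of a register; it is recovered below as (LiveIn >> shift) & (Mask >> shift).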
2468 const LLT S32 = LLT::scalar(32); 2469 const unsigned Mask = Arg->getMask(); 2470 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2471 2472 Register AndMaskSrc = LiveIn; 2473 2474 if (Shift != 0) { 2475 auto ShiftAmt = B.buildConstant(S32, Shift); 2476 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2477 } 2478 2479 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2480 } else { 2481 B.buildCopy(DstReg, LiveIn); 2482 } 2483 2484 return true; 2485 } 2486 2487 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2488 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2489 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2490 2491 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2492 if (!Arg) 2493 return false; 2494 2495 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2496 return false; 2497 2498 MI.eraseFromParent(); 2499 return true; 2500 } 2501 2502 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2503 MachineRegisterInfo &MRI, 2504 MachineIRBuilder &B) const { 2505 Register Dst = MI.getOperand(0).getReg(); 2506 LLT DstTy = MRI.getType(Dst); 2507 LLT S16 = LLT::scalar(16); 2508 LLT S32 = LLT::scalar(32); 2509 LLT S64 = LLT::scalar(64); 2510 2511 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2512 return true; 2513 2514 if (DstTy == S16) 2515 return legalizeFDIV16(MI, MRI, B); 2516 if (DstTy == S32) 2517 return legalizeFDIV32(MI, MRI, B); 2518 if (DstTy == S64) 2519 return legalizeFDIV64(MI, MRI, B); 2520 2521 return false; 2522 } 2523 2524 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2525 const LLT S32 = LLT::scalar(32); 2526 2527 auto Cvt0 = B.buildUITOFP(S32, Src); 2528 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2529 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2530 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2531 return B.buildFPTOUI(S32, Mul).getReg(0); 2532 } 2533 2534 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2535 Register DstReg, 2536 Register Num, 2537 Register Den, 2538 bool IsRem) const { 2539 const LLT S1 = LLT::scalar(1); 2540 const LLT S32 = LLT::scalar(32); 2541 2542 // RCP = URECIP(Den) = 2^32 / Den + e 2543 // e is rounding error. 2544 auto RCP = buildDivRCP(B, Den); 2545 2546 // RCP_LO = mul(RCP, Den) 2547 auto RCP_LO = B.buildMul(S32, RCP, Den); 2548 2549 // RCP_HI = mulhu (RCP, Den) */ 2550 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2551 2552 // NEG_RCP_LO = -RCP_LO 2553 auto Zero = B.buildConstant(S32, 0); 2554 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2555 2556 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2557 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2558 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2559 2560 // Calculate the rounding error from the URECIP instruction 2561 // E = mulhu(ABS_RCP_LO, RCP) 2562 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2563 2564 // RCP_A_E = RCP + E 2565 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2566 2567 // RCP_S_E = RCP - E 2568 auto RCP_S_E = B.buildSub(S32, RCP, E); 2569 2570 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E) 2571 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2572 2573 // Quotient = mulhu(Tmp0, Num) 2574 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2575 2576 // Num_S_Remainder = Quotient * Den 2577 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2578 2579 // Remainder = Num - Num_S_Remainder 2580 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2581 2582 // Remainder_GE_Den = Remainder >= Den 2583 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2584 2585 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2586 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2587 Num, Num_S_Remainder); 2588 2589 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2590 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2591 2592 // Calculate Division result: 2593 2594 // Quotient_A_One = Quotient + 1 2595 auto One = B.buildConstant(S32, 1); 2596 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2597 2598 // Quotient_S_One = Quotient - 1 2599 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2600 2601 // Div = (Tmp1 ? Quotient_A_One : Quotient) 2602 auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient); 2603 2604 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2605 if (IsRem) { 2606 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2607 2608 // Calculate Rem result: 2609 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2610 2611 // Remainder_A_Den = Remainder + Den 2612 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2613 2614 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2615 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2616 2617 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) 2618 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2619 } else { 2620 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2621 } 2622 } 2623 2624 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2625 MachineRegisterInfo &MRI, 2626 MachineIRBuilder &B) const { 2627 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2628 Register DstReg = MI.getOperand(0).getReg(); 2629 Register Num = MI.getOperand(1).getReg(); 2630 Register Den = MI.getOperand(2).getReg(); 2631 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2632 MI.eraseFromParent(); 2633 return true; 2634 } 2635 2636 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2637 // 2638 // Return lo, hi of result 2639 // 2640 // %cvt.lo = G_UITOFP Val.lo 2641 // %cvt.hi = G_UITOFP Val.hi 2642 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2643 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2644 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2645 // %mul2 = G_FMUL %mul1, 2**(-32) 2646 // %trunc = G_INTRINSIC_TRUNC %mul2 2647 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2648 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2649 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2650 Register Val) { 2651 const LLT S32 = LLT::scalar(32); 2652 auto Unmerge = B.buildUnmerge(S32, Val); 2653 2654 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2655 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2656 2657 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2658 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2659 2660 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2661 auto Mul1 = 2662 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2663 2664 // 2**(-32) 2665 auto Mul2 = 2666 B.buildFMul(S32, Mul1, B.buildFConstant(S32,
BitsToFloat(0x2f800000))); 2667 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2668 2669 // -(2**32) 2670 auto Mad2 = B.buildFMAD(S32, Trunc, 2671 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2672 2673 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2674 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2675 2676 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2677 } 2678 2679 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2680 MachineRegisterInfo &MRI, 2681 MachineIRBuilder &B) const { 2682 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2683 const LLT S32 = LLT::scalar(32); 2684 const LLT S64 = LLT::scalar(64); 2685 const LLT S1 = LLT::scalar(1); 2686 Register Numer = MI.getOperand(1).getReg(); 2687 Register Denom = MI.getOperand(2).getReg(); 2688 Register RcpLo, RcpHi; 2689 2690 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2691 2692 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2693 2694 auto Zero64 = B.buildConstant(S64, 0); 2695 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2696 2697 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2698 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2699 2700 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2701 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2702 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2703 2704 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2705 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2706 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2707 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2708 2709 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2710 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2711 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2712 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2713 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2714 2715 auto Zero32 = B.buildConstant(S32, 0); 2716 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2717 auto Add2_HiC = 2718 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2719 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2720 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2721 2722 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2723 Register NumerLo = UnmergeNumer.getReg(0); 2724 Register NumerHi = UnmergeNumer.getReg(1); 2725 2726 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2727 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2728 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2729 Register Mul3_Lo = UnmergeMul3.getReg(0); 2730 Register Mul3_Hi = UnmergeMul3.getReg(1); 2731 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2732 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2733 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2734 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2735 2736 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2737 Register DenomLo = UnmergeDenom.getReg(0); 2738 Register DenomHi = UnmergeDenom.getReg(1); 2739 2740 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2741 auto C1 = B.buildSExt(S32, CmpHi); 2742 2743 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2744 auto C2 = B.buildSExt(S32, CmpLo); 2745 2746 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2747 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2748 2749 // TODO: Here and below portions of the code can be enclosed into if/endif. 2750 // Currently control flow is unconditional and we have 4 selects after 2751 // potential endif to substitute PHIs. 
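// The two conditional corrections below either bump the quotient (Add3/Add4) or subtract the denominator from the remainder (Sub2/Sub3); the final selects on C3/C6 stand in for the phis that a real if/endif structure would use.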
2752 2753 // if C3 != 0 ... 2754 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2755 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2756 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2757 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2758 2759 auto One64 = B.buildConstant(S64, 1); 2760 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2761 2762 auto C4 = 2763 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2764 auto C5 = 2765 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2766 auto C6 = B.buildSelect( 2767 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2768 2769 // if (C6 != 0) 2770 auto Add4 = B.buildAdd(S64, Add3, One64); 2771 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2772 2773 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2774 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2775 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2776 2777 // endif C6 2778 // endif C3 2779 2780 if (IsDiv) { 2781 auto Sel1 = B.buildSelect( 2782 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2783 B.buildSelect(MI.getOperand(0), 2784 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2785 } else { 2786 auto Sel2 = B.buildSelect( 2787 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2788 B.buildSelect(MI.getOperand(0), 2789 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2790 } 2791 2792 MI.eraseFromParent(); 2793 return true; 2794 } 2795 2796 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2797 MachineRegisterInfo &MRI, 2798 MachineIRBuilder &B) const { 2799 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2800 if (Ty == LLT::scalar(32)) 2801 return legalizeUDIV_UREM32(MI, MRI, B); 2802 if (Ty == LLT::scalar(64)) 2803 return legalizeUDIV_UREM64(MI, MRI, B); 2804 return false; 2805 } 2806 2807 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2808 MachineRegisterInfo &MRI, 2809 MachineIRBuilder &B) const { 2810 const LLT S32 = LLT::scalar(32); 2811 2812 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2813 Register DstReg = MI.getOperand(0).getReg(); 2814 Register LHS = MI.getOperand(1).getReg(); 2815 Register RHS = MI.getOperand(2).getReg(); 2816 2817 auto ThirtyOne = B.buildConstant(S32, 31); 2818 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2819 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne); 2820 2821 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2822 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2823 2824 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2825 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2826 2827 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2828 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2829 2830 if (IsRem) { 2831 auto RSign = LHSign; // Remainder sign is the same as LHS 2832 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2833 B.buildSub(DstReg, UDivRem, RSign); 2834 } else { 2835 auto DSign = B.buildXor(S32, LHSign, RHSign); 2836 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2837 B.buildSub(DstReg, UDivRem, DSign); 2838 } 2839 2840 MI.eraseFromParent(); 2841 return true; 2842 } 2843 2844 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2845 MachineRegisterInfo &MRI, 2846 MachineIRBuilder &B) const { 2847 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2848 return legalizeSDIV_SREM32(MI, MRI, B); 2849 return false; 2850 } 2851 2852 bool 
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2853 MachineRegisterInfo &MRI, 2854 MachineIRBuilder &B) const { 2855 Register Res = MI.getOperand(0).getReg(); 2856 Register LHS = MI.getOperand(1).getReg(); 2857 Register RHS = MI.getOperand(2).getReg(); 2858 2859 uint16_t Flags = MI.getFlags(); 2860 2861 LLT ResTy = MRI.getType(Res); 2862 LLT S32 = LLT::scalar(32); 2863 LLT S64 = LLT::scalar(64); 2864 2865 const MachineFunction &MF = B.getMF(); 2866 bool Unsafe = 2867 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2868 2869 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2870 return false; 2871 2872 if (!Unsafe && ResTy == S32 && 2873 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2874 return false; 2875 2876 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2877 // 1 / x -> RCP(x) 2878 if (CLHS->isExactlyValue(1.0)) { 2879 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2880 .addUse(RHS) 2881 .setMIFlags(Flags); 2882 2883 MI.eraseFromParent(); 2884 return true; 2885 } 2886 2887 // -1 / x -> RCP( FNEG(x) ) 2888 if (CLHS->isExactlyValue(-1.0)) { 2889 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2890 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2891 .addUse(FNeg.getReg(0)) 2892 .setMIFlags(Flags); 2893 2894 MI.eraseFromParent(); 2895 return true; 2896 } 2897 } 2898 2899 // x / y -> x * (1.0 / y) 2900 if (Unsafe) { 2901 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2902 .addUse(RHS) 2903 .setMIFlags(Flags); 2904 B.buildFMul(Res, LHS, RCP, Flags); 2905 2906 MI.eraseFromParent(); 2907 return true; 2908 } 2909 2910 return false; 2911 } 2912 2913 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2914 MachineRegisterInfo &MRI, 2915 MachineIRBuilder &B) const { 2916 Register Res = MI.getOperand(0).getReg(); 2917 Register LHS = MI.getOperand(1).getReg(); 2918 Register RHS = MI.getOperand(2).getReg(); 2919 2920 uint16_t Flags = MI.getFlags(); 2921 2922 LLT S16 = LLT::scalar(16); 2923 LLT S32 = LLT::scalar(32); 2924 2925 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2926 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2927 2928 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2929 .addUse(RHSExt.getReg(0)) 2930 .setMIFlags(Flags); 2931 2932 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2933 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2934 2935 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2936 .addUse(RDst.getReg(0)) 2937 .addUse(RHS) 2938 .addUse(LHS) 2939 .setMIFlags(Flags); 2940 2941 MI.eraseFromParent(); 2942 return true; 2943 } 2944 2945 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2946 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2947 static void toggleSPDenormMode(bool Enable, 2948 MachineIRBuilder &B, 2949 const GCNSubtarget &ST, 2950 AMDGPU::SIModeRegisterDefaults Mode) { 2951 // Set SP denorm mode to this value. 2952 unsigned SPDenormMode = 2953 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2954 2955 if (ST.hasDenormModeInst()) { 2956 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2957 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2958 2959 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2960 B.buildInstr(AMDGPU::S_DENORM_MODE) 2961 .addImm(NewDenormModeValue); 2962 2963 } else { 2964 // Select FP32 bit field in mode register. 
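// The setreg below writes the 2-bit FP32 denorm field of the MODE register (offset 4, width WIDTH_M1 + 1 = 2).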
2965 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2966 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2967 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2968 2969 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2970 .addImm(SPDenormMode) 2971 .addImm(SPDenormModeBitField); 2972 } 2973 } 2974 2975 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2976 MachineRegisterInfo &MRI, 2977 MachineIRBuilder &B) const { 2978 Register Res = MI.getOperand(0).getReg(); 2979 Register LHS = MI.getOperand(1).getReg(); 2980 Register RHS = MI.getOperand(2).getReg(); 2981 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2982 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2983 2984 uint16_t Flags = MI.getFlags(); 2985 2986 LLT S32 = LLT::scalar(32); 2987 LLT S1 = LLT::scalar(1); 2988 2989 auto One = B.buildFConstant(S32, 1.0f); 2990 2991 auto DenominatorScaled = 2992 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2993 .addUse(LHS) 2994 .addUse(RHS) 2995 .addImm(0) 2996 .setMIFlags(Flags); 2997 auto NumeratorScaled = 2998 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2999 .addUse(LHS) 3000 .addUse(RHS) 3001 .addImm(1) 3002 .setMIFlags(Flags); 3003 3004 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3005 .addUse(DenominatorScaled.getReg(0)) 3006 .setMIFlags(Flags); 3007 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3008 3009 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3010 // aren't modeled as reading it. 3011 if (!Mode.allFP32Denormals()) 3012 toggleSPDenormMode(true, B, ST, Mode); 3013 3014 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3015 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3016 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3017 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3018 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3019 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3020 3021 if (!Mode.allFP32Denormals()) 3022 toggleSPDenormMode(false, B, ST, Mode); 3023 3024 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3025 .addUse(Fma4.getReg(0)) 3026 .addUse(Fma1.getReg(0)) 3027 .addUse(Fma3.getReg(0)) 3028 .addUse(NumeratorScaled.getReg(1)) 3029 .setMIFlags(Flags); 3030 3031 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3032 .addUse(Fmas.getReg(0)) 3033 .addUse(RHS) 3034 .addUse(LHS) 3035 .setMIFlags(Flags); 3036 3037 MI.eraseFromParent(); 3038 return true; 3039 } 3040 3041 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3042 MachineRegisterInfo &MRI, 3043 MachineIRBuilder &B) const { 3044 Register Res = MI.getOperand(0).getReg(); 3045 Register LHS = MI.getOperand(1).getReg(); 3046 Register RHS = MI.getOperand(2).getReg(); 3047 3048 uint16_t Flags = MI.getFlags(); 3049 3050 LLT S64 = LLT::scalar(64); 3051 LLT S1 = LLT::scalar(1); 3052 3053 auto One = B.buildFConstant(S64, 1.0); 3054 3055 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3056 .addUse(LHS) 3057 .addUse(RHS) 3058 .addImm(0) 3059 .setMIFlags(Flags); 3060 3061 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3062 3063 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3064 .addUse(DivScale0.getReg(0)) 3065 .setMIFlags(Flags); 3066 3067 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3068 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3069 auto Fma2 = B.buildFMA(S64, 
NegDivScale0, Fma1, One, Flags); 3070 3071 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3072 .addUse(LHS) 3073 .addUse(RHS) 3074 .addImm(1) 3075 .setMIFlags(Flags); 3076 3077 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3078 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3079 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3080 3081 Register Scale; 3082 if (!ST.hasUsableDivScaleConditionOutput()) { 3083 // Workaround a hardware bug on SI where the condition output from div_scale 3084 // is not usable. 3085 3086 LLT S32 = LLT::scalar(32); 3087 3088 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3089 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3090 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3091 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3092 3093 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3094 Scale1Unmerge.getReg(1)); 3095 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3096 Scale0Unmerge.getReg(1)); 3097 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3098 } else { 3099 Scale = DivScale1.getReg(1); 3100 } 3101 3102 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3103 .addUse(Fma4.getReg(0)) 3104 .addUse(Fma3.getReg(0)) 3105 .addUse(Mul.getReg(0)) 3106 .addUse(Scale) 3107 .setMIFlags(Flags); 3108 3109 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3110 .addUse(Fmas.getReg(0)) 3111 .addUse(RHS) 3112 .addUse(LHS) 3113 .setMIFlags(Flags); 3114 3115 MI.eraseFromParent(); 3116 return true; 3117 } 3118 3119 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3120 MachineRegisterInfo &MRI, 3121 MachineIRBuilder &B) const { 3122 Register Res = MI.getOperand(0).getReg(); 3123 Register LHS = MI.getOperand(2).getReg(); 3124 Register RHS = MI.getOperand(3).getReg(); 3125 uint16_t Flags = MI.getFlags(); 3126 3127 LLT S32 = LLT::scalar(32); 3128 LLT S1 = LLT::scalar(1); 3129 3130 auto Abs = B.buildFAbs(S32, RHS, Flags); 3131 const APFloat C0Val(1.0f); 3132 3133 auto C0 = B.buildConstant(S32, 0x6f800000); 3134 auto C1 = B.buildConstant(S32, 0x2f800000); 3135 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3136 3137 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3138 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3139 3140 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3141 3142 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3143 .addUse(Mul0.getReg(0)) 3144 .setMIFlags(Flags); 3145 3146 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3147 3148 B.buildFMul(Res, Sel, Mul1, Flags); 3149 3150 MI.eraseFromParent(); 3151 return true; 3152 } 3153 3154 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3155 MachineRegisterInfo &MRI, 3156 MachineIRBuilder &B) const { 3157 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3158 if (!MFI->isEntryFunction()) { 3159 return legalizePreloadedArgIntrin(MI, MRI, B, 3160 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3161 } 3162 3163 uint64_t Offset = 3164 ST.getTargetLowering()->getImplicitParameterOffset( 3165 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3166 Register DstReg = MI.getOperand(0).getReg(); 3167 LLT DstTy = MRI.getType(DstReg); 3168 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3169 3170 const ArgDescriptor *Arg; 3171 const TargetRegisterClass *RC; 3172 std::tie(Arg, RC) 3173 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3174 
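// In a kernel, the implicit argument pointer is the kernarg segment pointer advanced by the implicit parameter offset computed above.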
if (!Arg) 3175 return false; 3176 3177 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3178 if (!loadInputValue(KernargPtrReg, B, Arg)) 3179 return false; 3180 3181 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3182 MI.eraseFromParent(); 3183 return true; 3184 } 3185 3186 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3187 MachineRegisterInfo &MRI, 3188 MachineIRBuilder &B, 3189 unsigned AddrSpace) const { 3190 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3191 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3192 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3193 MI.eraseFromParent(); 3194 return true; 3195 } 3196 3197 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3198 // offset (the offset that is included in bounds checking and swizzling, to be 3199 // split between the instruction's voffset and immoffset fields) and soffset 3200 // (the offset that is excluded from bounds checking and swizzling, to go in 3201 // the instruction's soffset field). This function takes the first kind of 3202 // offset and figures out how to split it between voffset and immoffset. 3203 std::tuple<Register, unsigned, unsigned> 3204 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3205 Register OrigOffset) const { 3206 const unsigned MaxImm = 4095; 3207 Register BaseReg; 3208 unsigned TotalConstOffset; 3209 MachineInstr *OffsetDef; 3210 const LLT S32 = LLT::scalar(32); 3211 3212 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3213 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3214 3215 unsigned ImmOffset = TotalConstOffset; 3216 3217 // If the immediate value is too big for the immoffset field, put the value 3218 // and -4096 into the immoffset field so that the value that is copied/added 3219 // for the voffset field is a multiple of 4096, and it stands more chance 3220 // of being CSEd with the copy/add for another similar load/store. 3221 // However, do not do that rounding down to a multiple of 4096 if that is a 3222 // negative number, as it appears to be illegal to have a negative offset 3223 // in the vgpr, even if adding the immediate offset makes it positive. 3224 unsigned Overflow = ImmOffset & ~MaxImm; 3225 ImmOffset -= Overflow; 3226 if ((int32_t)Overflow < 0) { 3227 Overflow += ImmOffset; 3228 ImmOffset = 0; 3229 } 3230 3231 if (Overflow != 0) { 3232 if (!BaseReg) { 3233 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3234 } else { 3235 auto OverflowVal = B.buildConstant(S32, Overflow); 3236 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3237 } 3238 } 3239 3240 if (!BaseReg) 3241 BaseReg = B.buildConstant(S32, 0).getReg(0); 3242 3243 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3244 } 3245 3246 /// Handle register layout difference for f16 images for some subtargets. 
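/// With unpacked D16 VMEM, each 16-bit element lives in the low half of its
/// own 32-bit register, so <N x s16> data is unmerged, any-extended to s32,
/// and rebuilt as <N x s32> below.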
3247 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3248 MachineRegisterInfo &MRI, 3249 Register Reg) const { 3250 if (!ST.hasUnpackedD16VMem()) 3251 return Reg; 3252 3253 const LLT S16 = LLT::scalar(16); 3254 const LLT S32 = LLT::scalar(32); 3255 LLT StoreVT = MRI.getType(Reg); 3256 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3257 3258 auto Unmerge = B.buildUnmerge(S16, Reg); 3259 3260 SmallVector<Register, 4> WideRegs; 3261 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3262 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3263 3264 int NumElts = StoreVT.getNumElements(); 3265 3266 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3267 } 3268 3269 Register AMDGPULegalizerInfo::fixStoreSourceType( 3270 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3271 MachineRegisterInfo *MRI = B.getMRI(); 3272 LLT Ty = MRI->getType(VData); 3273 3274 const LLT S16 = LLT::scalar(16); 3275 3276 // Fixup illegal register types for i8 stores. 3277 if (Ty == LLT::scalar(8) || Ty == S16) { 3278 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3279 return AnyExt; 3280 } 3281 3282 if (Ty.isVector()) { 3283 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3284 if (IsFormat) 3285 return handleD16VData(B, *MRI, VData); 3286 } 3287 } 3288 3289 return VData; 3290 } 3291 3292 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3293 MachineRegisterInfo &MRI, 3294 MachineIRBuilder &B, 3295 bool IsTyped, 3296 bool IsFormat) const { 3297 Register VData = MI.getOperand(1).getReg(); 3298 LLT Ty = MRI.getType(VData); 3299 LLT EltTy = Ty.getScalarType(); 3300 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3301 const LLT S32 = LLT::scalar(32); 3302 3303 VData = fixStoreSourceType(B, VData, IsFormat); 3304 Register RSrc = MI.getOperand(2).getReg(); 3305 3306 MachineMemOperand *MMO = *MI.memoperands_begin(); 3307 const int MemSize = MMO->getSize(); 3308 3309 unsigned ImmOffset; 3310 unsigned TotalOffset; 3311 3312 // The typed intrinsics add an immediate after the registers. 3313 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3314 3315 // The struct intrinsic variants add one additional operand over raw. 3316 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3317 Register VIndex; 3318 int OpOffset = 0; 3319 if (HasVIndex) { 3320 VIndex = MI.getOperand(3).getReg(); 3321 OpOffset = 1; 3322 } 3323 3324 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3325 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3326 3327 unsigned Format = 0; 3328 if (IsTyped) { 3329 Format = MI.getOperand(5 + OpOffset).getImm(); 3330 ++OpOffset; 3331 } 3332 3333 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3334 3335 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3336 if (TotalOffset != 0) 3337 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3338 3339 unsigned Opc; 3340 if (IsTyped) { 3341 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3342 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3343 } else if (IsFormat) { 3344 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3345 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3346 } else { 3347 switch (MemSize) { 3348 case 1: 3349 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3350 break; 3351 case 2: 3352 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3353 break; 3354 default: 3355 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3356 break; 3357 } 3358 } 3359 3360 if (!VIndex) 3361 VIndex = B.buildConstant(S32, 0).getReg(0); 3362 3363 auto MIB = B.buildInstr(Opc) 3364 .addUse(VData) // vdata 3365 .addUse(RSrc) // rsrc 3366 .addUse(VIndex) // vindex 3367 .addUse(VOffset) // voffset 3368 .addUse(SOffset) // soffset 3369 .addImm(ImmOffset); // offset(imm) 3370 3371 if (IsTyped) 3372 MIB.addImm(Format); 3373 3374 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3375 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3376 .addMemOperand(MMO); 3377 3378 MI.eraseFromParent(); 3379 return true; 3380 } 3381 3382 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3383 MachineRegisterInfo &MRI, 3384 MachineIRBuilder &B, 3385 bool IsFormat, 3386 bool IsTyped) const { 3387 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3388 MachineMemOperand *MMO = *MI.memoperands_begin(); 3389 const int MemSize = MMO->getSize(); 3390 const LLT S32 = LLT::scalar(32); 3391 3392 Register Dst = MI.getOperand(0).getReg(); 3393 Register RSrc = MI.getOperand(2).getReg(); 3394 3395 // The typed intrinsics add an immediate after the registers. 3396 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3397 3398 // The struct intrinsic variants add one additional operand over raw. 3399 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3400 Register VIndex; 3401 int OpOffset = 0; 3402 if (HasVIndex) { 3403 VIndex = MI.getOperand(3).getReg(); 3404 OpOffset = 1; 3405 } 3406 3407 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3408 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3409 3410 unsigned Format = 0; 3411 if (IsTyped) { 3412 Format = MI.getOperand(5 + OpOffset).getImm(); 3413 ++OpOffset; 3414 } 3415 3416 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3417 unsigned ImmOffset; 3418 unsigned TotalOffset; 3419 3420 LLT Ty = MRI.getType(Dst); 3421 LLT EltTy = Ty.getScalarType(); 3422 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3423 const bool Unpacked = ST.hasUnpackedD16VMem(); 3424 3425 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3426 if (TotalOffset != 0) 3427 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3428 3429 unsigned Opc; 3430 3431 if (IsTyped) { 3432 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3433 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3434 } else if (IsFormat) { 3435 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3436 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3437 } else {
3438 switch (MemSize) {
3439 case 1:
3440 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3441 break;
3442 case 2:
3443 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3444 break;
3445 default:
3446 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3447 break;
3448 }
3449 }
3450
3451 Register LoadDstReg;
3452
3453 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3454 LLT UnpackedTy = Ty.changeElementSize(32);
3455
3456 if (IsExtLoad)
3457 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3458 else if (Unpacked && IsD16 && Ty.isVector())
3459 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3460 else
3461 LoadDstReg = Dst;
3462
3463 if (!VIndex)
3464 VIndex = B.buildConstant(S32, 0).getReg(0);
3465
3466 auto MIB = B.buildInstr(Opc)
3467 .addDef(LoadDstReg) // vdata
3468 .addUse(RSrc) // rsrc
3469 .addUse(VIndex) // vindex
3470 .addUse(VOffset) // voffset
3471 .addUse(SOffset) // soffset
3472 .addImm(ImmOffset); // offset(imm)
3473
3474 if (IsTyped)
3475 MIB.addImm(Format);
3476
3477 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3478 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3479 .addMemOperand(MMO);
3480
3481 if (LoadDstReg != Dst) {
3482 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3483
3484 // The result of an extending load was widened; truncate it back down.
3485 if (IsExtLoad)
3486 B.buildTrunc(Dst, LoadDstReg);
3487 else {
3488 // Repack to original 16-bit vector result
3489 // FIXME: G_TRUNC should work, but legalization currently fails
3490 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3491 SmallVector<Register, 4> Repack;
3492 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3493 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3494 B.buildMerge(Dst, Repack);
3495 }
3496 }
3497
3498 MI.eraseFromParent();
3499 return true;
3500 }
3501
3502 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3503 MachineIRBuilder &B,
3504 bool IsInc) const {
3505 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3506 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3507 B.buildInstr(Opc) 3508 .addDef(MI.getOperand(0).getReg()) 3509 .addUse(MI.getOperand(2).getReg()) 3510 .addUse(MI.getOperand(3).getReg()) 3511 .cloneMemRefs(MI); 3512 MI.eraseFromParent(); 3513 return true; 3514 } 3515 3516 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3517 switch (IntrID) { 3518 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3519 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3520 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3521 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3522 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3523 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3524 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3525 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3526 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3527 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3528 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3529 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3530 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3531 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3532 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3533 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3534 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3536 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3537 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3538 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3539 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3540 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3541 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3542 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3543 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3544 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3545 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3546 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3547 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3548 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3549 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3550 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3551 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3552 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3553 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3554 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3555 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3556 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3557 default: 3558 llvm_unreachable("unhandled atomic opcode"); 3559 } 3560 } 3561 3562 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3563 MachineIRBuilder &B, 3564 Intrinsic::ID IID) const { 3565 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3566 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3567 3568 Register Dst = MI.getOperand(0).getReg(); 3569 Register VData = MI.getOperand(2).getReg(); 3570 3571 Register CmpVal; 3572 int OpOffset = 0; 3573 3574 if (IsCmpSwap) { 3575 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3576 ++OpOffset; 3577 } 3578 3579 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3580 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3581 3582 // The struct intrinsic variants add one additional operand over raw. 
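// Operand layout assumed here (illustrative, derived from the offsets used
// below): dst, intrinsic ID, vdata, [cmp for cmpswap], rsrc, [vindex for the
// struct forms], voffset, soffset, cachepolicy.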
3583 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3584 Register VIndex; 3585 if (HasVIndex) { 3586 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3587 ++OpOffset; 3588 } 3589 3590 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3591 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3592 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3593 3594 MachineMemOperand *MMO = *MI.memoperands_begin(); 3595 3596 unsigned ImmOffset; 3597 unsigned TotalOffset; 3598 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3599 if (TotalOffset != 0) 3600 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3601 3602 if (!VIndex) 3603 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3604 3605 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3606 .addDef(Dst) 3607 .addUse(VData); // vdata 3608 3609 if (IsCmpSwap) 3610 MIB.addReg(CmpVal); 3611 3612 MIB.addUse(RSrc) // rsrc 3613 .addUse(VIndex) // vindex 3614 .addUse(VOffset) // voffset 3615 .addUse(SOffset) // soffset 3616 .addImm(ImmOffset) // offset(imm) 3617 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3618 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3619 .addMemOperand(MMO); 3620 3621 MI.eraseFromParent(); 3622 return true; 3623 } 3624 3625 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3626 /// vector with s16 typed elements. 3627 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3628 SmallVectorImpl<Register> &PackedAddrs, 3629 int AddrIdx, int DimIdx, int EndIdx, 3630 int NumGradients) { 3631 const LLT S16 = LLT::scalar(16); 3632 const LLT V2S16 = LLT::vector(2, 16); 3633 3634 for (int I = AddrIdx; I < EndIdx; ++I) { 3635 MachineOperand &SrcOp = MI.getOperand(I); 3636 if (!SrcOp.isReg()) 3637 continue; // _L to _LZ may have eliminated this. 3638 3639 Register AddrReg = SrcOp.getReg(); 3640 3641 if (I < DimIdx) { 3642 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3643 PackedAddrs.push_back(AddrReg); 3644 } else { 3645 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3646 // derivatives dx/dh and dx/dv are packed with undef. 3647 if (((I + 1) >= EndIdx) || 3648 ((NumGradients / 2) % 2 == 1 && 3649 (I == DimIdx + (NumGradients / 2) - 1 || 3650 I == DimIdx + NumGradients - 1)) || 3651 // Check for _L to _LZ optimization 3652 !MI.getOperand(I + 1).isReg()) { 3653 PackedAddrs.push_back( 3654 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3655 .getReg(0)); 3656 } else { 3657 PackedAddrs.push_back( 3658 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3659 .getReg(0)); 3660 ++I; 3661 } 3662 } 3663 } 3664 } 3665 3666 /// Convert from separate vaddr components to a single vector address register, 3667 /// and replace the remaining operands with $noreg. 3668 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3669 int DimIdx, int NumVAddrs) { 3670 const LLT S32 = LLT::scalar(32); 3671 3672 SmallVector<Register, 8> AddrRegs; 3673 for (int I = 0; I != NumVAddrs; ++I) { 3674 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3675 if (SrcOp.isReg()) { 3676 AddrRegs.push_back(SrcOp.getReg()); 3677 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3678 } 3679 } 3680 3681 int NumAddrRegs = AddrRegs.size(); 3682 if (NumAddrRegs != 1) { 3683 // Round up to 8 elements for v5-v7 3684 // FIXME: Missing intermediate sized register classes and instructions. 
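// For example, 5 address registers are padded with 3 undefs so the
// build_vector below produces a v8s32 vaddr operand.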
3685 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3686 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3687 auto Undef = B.buildUndef(S32);
3688 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3689 NumAddrRegs = RoundedNumRegs;
3690 }
3691
3692 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3693 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3694 }
3695
3696 for (int I = 1; I != NumVAddrs; ++I) {
3697 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3698 if (SrcOp.isReg())
3699 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3700 }
3701 }
3702
3703 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3704 ///
3705 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3706 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3707 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3708 /// registers.
3709 ///
3710 /// We don't want to directly select image instructions just yet, but also want
3711 /// to expose all register repacking to the legalizer/combiners. We also don't
3712 /// want a selected instruction entering RegBankSelect. In order to avoid
3713 /// defining a multitude of intermediate image instructions, directly hack on
3714 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3715 /// the now-unnecessary arguments with $noreg.
3716 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3717 MachineInstr &MI, MachineIRBuilder &B,
3718 GISelChangeObserver &Observer,
3719 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3720
3721 const int NumDefs = MI.getNumExplicitDefs();
3722 bool IsTFE = NumDefs == 2;
3723 // We are only processing the operands of d16 image operations on subtargets
3724 // that use the unpacked register layout, or need to repack the TFE result.
3725
3726 // TODO: Do we need to guard against already legalized intrinsics?
3727 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3728 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3729
3730 MachineRegisterInfo *MRI = B.getMRI();
3731 const LLT S32 = LLT::scalar(32);
3732 const LLT S16 = LLT::scalar(16);
3733 const LLT V2S16 = LLT::vector(2, 16);
3734
3735 // Index of first address argument
3736 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3737
3738 int NumVAddrs, NumGradients;
3739 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3740 const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3741 getDMaskIdx(BaseOpcode, NumDefs);
3742 unsigned DMask = 0;
3743
3744 // Check for 16-bit addresses and pack them if present.
3745 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3746 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3747 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3748 const bool IsG16 = GradTy == S16;
3749 const bool IsA16 = AddrTy == S16;
3750
3751 int DMaskLanes = 0;
3752 if (!BaseOpcode->Atomic) {
3753 DMask = MI.getOperand(DMaskIdx).getImm();
3754 if (BaseOpcode->Gather4) {
3755 DMaskLanes = 4;
3756 } else if (DMask != 0) {
3757 DMaskLanes = countPopulation(DMask);
3758 } else if (!IsTFE && !BaseOpcode->Store) {
3759 // If dmask is 0, this is a no-op load. This can be eliminated.
3760 B.buildUndef(MI.getOperand(0));
3761 MI.eraseFromParent();
3762 return true;
3763 }
3764 }
3765
3766 Observer.changingInstr(MI);
3767 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3768
3769 unsigned NewOpcode = NumDefs == 0 ?
3770 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3771
3772 // Track that we legalized this
3773 MI.setDesc(B.getTII().get(NewOpcode));
3774
3775 // Expecting to get an error flag since TFE is on and dmask is 0. Force
3776 // dmask to be at least 1, otherwise the instruction will fail.
3777 if (IsTFE && DMask == 0) {
3778 DMask = 0x1;
3779 DMaskLanes = 1;
3780 MI.getOperand(DMaskIdx).setImm(DMask);
3781 }
3782
3783 if (BaseOpcode->Atomic) {
3784 Register VData0 = MI.getOperand(2).getReg();
3785 LLT Ty = MRI->getType(VData0);
3786
3787 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3788 if (Ty.isVector())
3789 return false;
3790
3791 if (BaseOpcode->AtomicX2) {
3792 Register VData1 = MI.getOperand(3).getReg();
3793 // The two values are packed in one register.
3794 LLT PackedTy = LLT::vector(2, Ty);
3795 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3796 MI.getOperand(2).setReg(Concat.getReg(0));
3797 MI.getOperand(3).setReg(AMDGPU::NoRegister);
3798 }
3799 }
3800
3801 int CorrectedNumVAddrs = NumVAddrs;
3802
3803 // Optimize _L to _LZ when the LOD is zero
3804 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3805 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3806 const ConstantFP *ConstantLod;
3807 const int LodIdx = AddrIdx + NumVAddrs - 1;
3808
3809 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3810 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3811 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3812 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3813 LZMappingInfo->LZ, ImageDimIntr->Dim);
3814
3815 // The starting indexes should remain in the same place.
3816 --NumVAddrs;
3817 --CorrectedNumVAddrs;
3818
3819 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3820 static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3821 MI.RemoveOperand(LodIdx);
3822 }
3823 }
3824 }
3825
3826 // Optimize _mip away, when 'lod' is zero
3827 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3828 int64_t ConstantLod;
3829 const int LodIdx = AddrIdx + NumVAddrs - 1;
3830
3831 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3832 if (ConstantLod == 0) {
3833 // TODO: Change the intrinsic opcode and remove the operand instead of
3834 // replacing it with 0, as is done for the _L to _LZ handling above.
3835 MI.getOperand(LodIdx).ChangeToImmediate(0);
3836 --CorrectedNumVAddrs;
3837 }
3838 }
3839 }
3840
3841 // Rewrite the addressing register layout before doing anything else.
3842 if (IsA16 || IsG16) {
3843 if (IsA16) {
3844 // Target must support the feature and gradients need to be 16 bit too
3845 if (!ST.hasA16() || !IsG16)
3846 return false;
3847 } else if (!ST.hasG16())
3848 return false;
3849
3850 if (NumVAddrs > 1) {
3851 SmallVector<Register, 4> PackedRegs;
3852 // Don't compress addresses for G16
3853 const int PackEndIdx =
3854 IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3855 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3856 PackEndIdx, NumGradients); 3857 3858 if (!IsA16) { 3859 // Add uncompressed address 3860 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3861 int AddrReg = MI.getOperand(I).getReg(); 3862 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3863 PackedRegs.push_back(AddrReg); 3864 } 3865 } 3866 3867 // See also below in the non-a16 branch 3868 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3869 3870 if (!UseNSA && PackedRegs.size() > 1) { 3871 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3872 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3873 PackedRegs[0] = Concat.getReg(0); 3874 PackedRegs.resize(1); 3875 } 3876 3877 const int NumPacked = PackedRegs.size(); 3878 for (int I = 0; I != NumVAddrs; ++I) { 3879 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3880 if (!SrcOp.isReg()) { 3881 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3882 continue; 3883 } 3884 3885 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3886 3887 if (I < NumPacked) 3888 SrcOp.setReg(PackedRegs[I]); 3889 else 3890 SrcOp.setReg(AMDGPU::NoRegister); 3891 } 3892 } 3893 } else { 3894 // If the register allocator cannot place the address registers contiguously 3895 // without introducing moves, then using the non-sequential address encoding 3896 // is always preferable, since it saves VALU instructions and is usually a 3897 // wash in terms of code size or even better. 3898 // 3899 // However, we currently have no way of hinting to the register allocator 3900 // that MIMG addresses should be placed contiguously when it is possible to 3901 // do so, so force non-NSA for the common 2-address case as a heuristic. 3902 // 3903 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3904 // allocation when possible. 3905 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3906 3907 if (!UseNSA && NumVAddrs > 1) 3908 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3909 } 3910 3911 int Flags = 0; 3912 if (IsA16) 3913 Flags |= 1; 3914 if (IsG16) 3915 Flags |= 2; 3916 MI.addOperand(MachineOperand::CreateImm(Flags)); 3917 3918 if (BaseOpcode->Store) { // No TFE for stores? 3919 // TODO: Handle dmask trim 3920 Register VData = MI.getOperand(1).getReg(); 3921 LLT Ty = MRI->getType(VData); 3922 if (!Ty.isVector() || Ty.getElementType() != S16) 3923 return true; 3924 3925 Register RepackedReg = handleD16VData(B, *MRI, VData); 3926 if (RepackedReg != VData) { 3927 MI.getOperand(1).setReg(RepackedReg); 3928 } 3929 3930 return true; 3931 } 3932 3933 Register DstReg = MI.getOperand(0).getReg(); 3934 LLT Ty = MRI->getType(DstReg); 3935 const LLT EltTy = Ty.getScalarType(); 3936 const bool IsD16 = Ty.getScalarType() == S16; 3937 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3938 3939 // Confirm that the return type is large enough for the dmask specified 3940 if (NumElts < DMaskLanes) 3941 return false; 3942 3943 if (NumElts > 4 || DMaskLanes > 4) 3944 return false; 3945 3946 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3947 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3948 3949 // The raw dword aligned data component of the load. 
The only legal cases
3950 // where this matters should be when using the packed D16 format, for
3951 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3952 LLT RoundedTy;
3953
3954 // S32 vector to cover all data, plus the TFE result element.
3955 LLT TFETy;
3956
3957 // Register type to use for each loaded component. Will be S32 or V2S16.
3958 LLT RegTy;
3959
3960 if (IsD16 && ST.hasUnpackedD16VMem()) {
3961 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3962 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3963 RegTy = S32;
3964 } else {
3965 unsigned EltSize = EltTy.getSizeInBits();
3966 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3967 unsigned RoundedSize = 32 * RoundedElts;
3968 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3969 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3970 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3971 }
3972
3973 // The return type does not need adjustment.
3974 // TODO: Should we change s16 case to s32 or <2 x s16>?
3975 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3976 return true;
3977
3978 Register Dst1Reg;
3979
3980 // Insert after the instruction.
3981 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3982
3983 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3984 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3985 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3986 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3987
3988 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3989
3990 MI.getOperand(0).setReg(NewResultReg);
3991
3992 // In the IR, TFE is supposed to be used with a 2 element struct return
3993 // type. The instruction really returns these two values in one contiguous
3994 // register, with one additional dword beyond the loaded data. Rewrite the
3995 // return type to use a single register result.
3996
3997 if (IsTFE) {
3998 Dst1Reg = MI.getOperand(1).getReg();
3999 if (MRI->getType(Dst1Reg) != S32)
4000 return false;
4001
4002 // TODO: Make sure the TFE operand bit is set.
4003 MI.RemoveOperand(1);
4004
4005 // Handle the easy case that requires no repack instructions.
4006 if (Ty == S32) {
4007 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4008 return true;
4009 }
4010 }
4011
4012 // Now figure out how to copy the new result register back into the old
4013 // result.
4014 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4015
4016 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4017
4018 if (ResultNumRegs == 1) {
4019 assert(!IsTFE);
4020 ResultRegs[0] = NewResultReg;
4021 } else {
4022 // We have to repack into a new vector of some kind.
4023 for (int I = 0; I != NumDataRegs; ++I)
4024 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4025 B.buildUnmerge(ResultRegs, NewResultReg);
4026
4027 // Drop the final TFE element to get the data part. The TFE result is
4028 // directly written to the right place already.
4029 if (IsTFE)
4030 ResultRegs.resize(NumDataRegs);
4031 }
4032
4033 // For an s16 scalar result, we form an s32 result with a truncate regardless
4034 // of packed vs. unpacked.
4035 if (IsD16 && !Ty.isVector()) {
4036 B.buildTrunc(DstReg, ResultRegs[0]);
4037 return true;
4038 }
4039
4040 // Avoid a build/concat_vector of 1 entry.
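// If the single remaining data register already covers the <2 x s16> result,
// a bitcast is all that is needed.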
4041 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4042 B.buildBitcast(DstReg, ResultRegs[0]);
4043 return true;
4044 }
4045
4046 assert(Ty.isVector());
4047
4048 if (IsD16) {
4049 // For packed D16 results with TFE enabled, all the data components are
4050 // S32. Cast back to the expected type.
4051 //
4052 // TODO: We don't really need to load into s32 elements. We would only need one
4053 // cast for the TFE result if a multiple of v2s16 was used.
4054 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4055 for (Register &Reg : ResultRegs)
4056 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4057 } else if (ST.hasUnpackedD16VMem()) {
4058 for (Register &Reg : ResultRegs)
4059 Reg = B.buildTrunc(S16, Reg).getReg(0);
4060 }
4061 }
4062
4063 auto padWithUndef = [&](LLT Ty, int NumElts) {
4064 if (NumElts == 0)
4065 return;
4066 Register Undef = B.buildUndef(Ty).getReg(0);
4067 for (int I = 0; I != NumElts; ++I)
4068 ResultRegs.push_back(Undef);
4069 };
4070
4071 // Pad out any elements eliminated due to the dmask.
4072 LLT ResTy = MRI->getType(ResultRegs[0]);
4073 if (!ResTy.isVector()) {
4074 padWithUndef(ResTy, NumElts - ResultRegs.size());
4075 B.buildBuildVector(DstReg, ResultRegs);
4076 return true;
4077 }
4078
4079 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4080 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4081
4082 // Deal with the one annoying legal case.
4083 const LLT V3S16 = LLT::vector(3, 16);
4084 if (Ty == V3S16) {
4085 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4086 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4087 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4088 return true;
4089 }
4090
4091 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4092 B.buildConcatVectors(DstReg, ResultRegs);
4093 return true;
4094 }
4095
4096 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4097 MachineInstr &MI, MachineIRBuilder &B,
4098 GISelChangeObserver &Observer) const {
4099 Register Dst = MI.getOperand(0).getReg();
4100 LLT Ty = B.getMRI()->getType(Dst);
4101 unsigned Size = Ty.getSizeInBits();
4102 MachineFunction &MF = B.getMF();
4103
4104 Observer.changingInstr(MI);
4105
4106 // FIXME: We don't really need this intermediate instruction. The intrinsic
4107 // should be fixed to have a memory operand. Since it's readnone, we're not
4108 // allowed to add one.
4109 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4110 MI.RemoveOperand(1); // Remove intrinsic ID
4111
4112 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4113 // TODO: Should this use datalayout alignment?
4114 const unsigned MemSize = (Size + 7) / 8;
4115 const Align MemAlign(4);
4116 MachineMemOperand *MMO = MF.getMachineMemOperand(
4117 MachinePointerInfo(),
4118 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4119 MachineMemOperand::MOInvariant,
4120 MemSize, MemAlign);
4121 MI.addMemOperand(MF, MMO);
4122
4123 // There are no 96-bit result scalar loads, but widening to 128-bit should
4124 // always be legal. We may need to restore this to a 96-bit result if it turns
4125 // out this needs to be converted to a vector load during RegBankSelect.
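// For example (illustrative), an s96 result is widened to s128, and a
// <3 x s32> result gains an element to become <4 x s32>.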
4126 if (!isPowerOf2_32(Size)) {
4127 LegalizerHelper Helper(MF, *this, Observer, B);
4128
4129 if (Ty.isVector())
4130 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4131 else
4132 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4133 }
4134
4135 Observer.changedInstr(MI);
4136 return true;
4137 }
4138
4139 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4140 MachineRegisterInfo &MRI,
4141 MachineIRBuilder &B) const {
4142 // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4143 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4144 !ST.isTrapHandlerEnabled()) {
4145 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4146 } else {
4147 // Pass queue pointer to trap handler as input, and insert trap instruction
4148 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4149 const ArgDescriptor *Arg =
4150 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4151 if (!Arg)
4152 return false;
4153 MachineRegisterInfo &MRI = *B.getMRI();
4154 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4155 Register LiveIn = getLiveInRegister(
4156 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4157 /*InsertLiveInCopy=*/false);
4158 if (!loadInputValue(LiveIn, B, Arg))
4159 return false;
4160 B.buildCopy(SGPR01, LiveIn);
4161 B.buildInstr(AMDGPU::S_TRAP)
4162 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4163 .addReg(SGPR01, RegState::Implicit);
4164 }
4165
4166 MI.eraseFromParent();
4167 return true;
4168 }
4169
4170 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4171 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4172 // If this is a non-HSA path or the trap handler is disabled, report a
4173 // warning accordingly.
4174 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4175 !ST.isTrapHandlerEnabled()) {
4176 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4177 "debugtrap handler not supported",
4178 MI.getDebugLoc(), DS_Warning);
4179 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4180 Ctx.diagnose(NoTrap);
4181 } else {
4182 // Insert debug-trap instruction
4183 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4184 }
4185
4186 MI.eraseFromParent();
4187 return true;
4188 }
4189
4190 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4191 MachineInstr &MI) const {
4192 MachineIRBuilder &B = Helper.MIRBuilder;
4193 MachineRegisterInfo &MRI = *B.getMRI();
4194
4195 // Replace the G_BRCOND using the intrinsic result with the exec-manipulating branch pseudos.
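// Rough sketch for amdgcn.if (amdgcn.else and amdgcn.loop are analogous): the
// G_BRCOND consuming the intrinsic's boolean result becomes an SI_IF that
// branches to the old unconditional target, while the old conditional target
// becomes the destination of the following unconditional branch.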
4196 auto IntrID = MI.getIntrinsicID(); 4197 switch (IntrID) { 4198 case Intrinsic::amdgcn_if: 4199 case Intrinsic::amdgcn_else: { 4200 MachineInstr *Br = nullptr; 4201 MachineBasicBlock *UncondBrTarget = nullptr; 4202 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4203 const SIRegisterInfo *TRI 4204 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4205 4206 Register Def = MI.getOperand(1).getReg(); 4207 Register Use = MI.getOperand(3).getReg(); 4208 4209 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4210 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4211 if (IntrID == Intrinsic::amdgcn_if) { 4212 B.buildInstr(AMDGPU::SI_IF) 4213 .addDef(Def) 4214 .addUse(Use) 4215 .addMBB(UncondBrTarget); 4216 } else { 4217 B.buildInstr(AMDGPU::SI_ELSE) 4218 .addDef(Def) 4219 .addUse(Use) 4220 .addMBB(UncondBrTarget) 4221 .addImm(0); 4222 } 4223 4224 if (Br) { 4225 Br->getOperand(0).setMBB(CondBrTarget); 4226 } else { 4227 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4228 // since we're swapping branch targets it needs to be reinserted. 4229 // FIXME: IRTranslator should probably not do this 4230 B.buildBr(*CondBrTarget); 4231 } 4232 4233 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4234 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4235 MI.eraseFromParent(); 4236 BrCond->eraseFromParent(); 4237 return true; 4238 } 4239 4240 return false; 4241 } 4242 case Intrinsic::amdgcn_loop: { 4243 MachineInstr *Br = nullptr; 4244 MachineBasicBlock *UncondBrTarget = nullptr; 4245 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4246 const SIRegisterInfo *TRI 4247 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4248 4249 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4250 Register Reg = MI.getOperand(2).getReg(); 4251 4252 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4253 B.buildInstr(AMDGPU::SI_LOOP) 4254 .addUse(Reg) 4255 .addMBB(UncondBrTarget); 4256 4257 if (Br) 4258 Br->getOperand(0).setMBB(CondBrTarget); 4259 else 4260 B.buildBr(*CondBrTarget); 4261 4262 MI.eraseFromParent(); 4263 BrCond->eraseFromParent(); 4264 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4265 return true; 4266 } 4267 4268 return false; 4269 } 4270 case Intrinsic::amdgcn_kernarg_segment_ptr: 4271 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4272 // This only makes sense to call in a kernel, so just lower to null. 
4273 B.buildConstant(MI.getOperand(0).getReg(), 0); 4274 MI.eraseFromParent(); 4275 return true; 4276 } 4277 4278 return legalizePreloadedArgIntrin( 4279 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4280 case Intrinsic::amdgcn_implicitarg_ptr: 4281 return legalizeImplicitArgPtr(MI, MRI, B); 4282 case Intrinsic::amdgcn_workitem_id_x: 4283 return legalizePreloadedArgIntrin(MI, MRI, B, 4284 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4285 case Intrinsic::amdgcn_workitem_id_y: 4286 return legalizePreloadedArgIntrin(MI, MRI, B, 4287 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4288 case Intrinsic::amdgcn_workitem_id_z: 4289 return legalizePreloadedArgIntrin(MI, MRI, B, 4290 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4291 case Intrinsic::amdgcn_workgroup_id_x: 4292 return legalizePreloadedArgIntrin(MI, MRI, B, 4293 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4294 case Intrinsic::amdgcn_workgroup_id_y: 4295 return legalizePreloadedArgIntrin(MI, MRI, B, 4296 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4297 case Intrinsic::amdgcn_workgroup_id_z: 4298 return legalizePreloadedArgIntrin(MI, MRI, B, 4299 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4300 case Intrinsic::amdgcn_dispatch_ptr: 4301 return legalizePreloadedArgIntrin(MI, MRI, B, 4302 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4303 case Intrinsic::amdgcn_queue_ptr: 4304 return legalizePreloadedArgIntrin(MI, MRI, B, 4305 AMDGPUFunctionArgInfo::QUEUE_PTR); 4306 case Intrinsic::amdgcn_implicit_buffer_ptr: 4307 return legalizePreloadedArgIntrin( 4308 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4309 case Intrinsic::amdgcn_dispatch_id: 4310 return legalizePreloadedArgIntrin(MI, MRI, B, 4311 AMDGPUFunctionArgInfo::DISPATCH_ID); 4312 case Intrinsic::amdgcn_fdiv_fast: 4313 return legalizeFDIVFastIntrin(MI, MRI, B); 4314 case Intrinsic::amdgcn_is_shared: 4315 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4316 case Intrinsic::amdgcn_is_private: 4317 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4318 case Intrinsic::amdgcn_wavefrontsize: { 4319 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4320 MI.eraseFromParent(); 4321 return true; 4322 } 4323 case Intrinsic::amdgcn_s_buffer_load: 4324 return legalizeSBufferLoad(MI, B, Helper.Observer); 4325 case Intrinsic::amdgcn_raw_buffer_store: 4326 case Intrinsic::amdgcn_struct_buffer_store: 4327 return legalizeBufferStore(MI, MRI, B, false, false); 4328 case Intrinsic::amdgcn_raw_buffer_store_format: 4329 case Intrinsic::amdgcn_struct_buffer_store_format: 4330 return legalizeBufferStore(MI, MRI, B, false, true); 4331 case Intrinsic::amdgcn_raw_tbuffer_store: 4332 case Intrinsic::amdgcn_struct_tbuffer_store: 4333 return legalizeBufferStore(MI, MRI, B, true, true); 4334 case Intrinsic::amdgcn_raw_buffer_load: 4335 case Intrinsic::amdgcn_struct_buffer_load: 4336 return legalizeBufferLoad(MI, MRI, B, false, false); 4337 case Intrinsic::amdgcn_raw_buffer_load_format: 4338 case Intrinsic::amdgcn_struct_buffer_load_format: 4339 return legalizeBufferLoad(MI, MRI, B, true, false); 4340 case Intrinsic::amdgcn_raw_tbuffer_load: 4341 case Intrinsic::amdgcn_struct_tbuffer_load: 4342 return legalizeBufferLoad(MI, MRI, B, true, true); 4343 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4344 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4345 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4346 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4347 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4348 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4349 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4350 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4351 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4352 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4353 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4354 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4355 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4356 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4357 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4358 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4359 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4369 return legalizeBufferAtomic(MI, B, IntrID); 4370 case Intrinsic::amdgcn_atomic_inc: 4371 return legalizeAtomicIncDec(MI, B, true); 4372 case Intrinsic::amdgcn_atomic_dec: 4373 return legalizeAtomicIncDec(MI, B, false); 4374 case Intrinsic::trap: 4375 return legalizeTrapIntrinsic(MI, MRI, B); 4376 case Intrinsic::debugtrap: 4377 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4378 default: { 4379 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4380 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4381 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4382 return true; 4383 } 4384 } 4385 4386 return true; 4387 } 4388