//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the scalar size in bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
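
// A small worked example for fewerEltsToSize64Vector above (illustrative
// values only): v4s32 (128 bits) gives Pieces = 2, NewNumElts = 2 => v2s32;
// v8s16 (128 bits) gives Pieces = 2, NewNumElts = 4 => v4s16; and v3s32
// (96 bits) also splits to v2s32.
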
// Increase the number of vector elements to reach the next multiple-of-32-bit
// total size.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
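// For example, per the helpers above: s96, v2s32 and v4s16 are register
// types, while v3s8 (24 bits) and v5s16 (80 bits) are not, since their total
// size is not a multiple of 32 bits.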
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
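    // (The actual splitting is performed by the narrowScalar/fewerElements
    // rules installed for loads and stores further below.)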
282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 426 .legalFor({S32, S16, V2S16}) 427 .clampScalar(0, S16, S32) 428 .clampMaxNumElements(0, S16, 2) 429 .scalarize(0) 430 .widenScalarToNextPow2(0, 32); 431 } else if (ST.has16BitInsts()) { 432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 433 .legalFor({S32, S16}) 434 .clampScalar(0, S16, S32) 435 .scalarize(0) 436 .widenScalarToNextPow2(0, 32); 437 } else { 438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 439 .legalFor({S32}) 440 .clampScalar(0, S32, S32) 441 .scalarize(0); 442 } 443 444 // FIXME: Not really legal. Placeholder for custom lowering. 445 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 446 .customFor({S32, S64}) 447 .clampScalar(0, S32, S64) 448 .widenScalarToNextPow2(0, 32) 449 .scalarize(0); 450 451 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 452 .legalFor({S32}) 453 .clampScalar(0, S32, S32) 454 .scalarize(0); 455 456 // Report legal for any types we can handle anywhere. For the cases only legal 457 // on the SALU, RegBankSelect will be able to re-legalize. 458 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 459 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 460 .clampScalar(0, S32, S64) 461 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 462 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 463 .widenScalarToNextPow2(0) 464 .scalarize(0); 465 466 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 467 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 468 .legalFor({{S32, S1}, {S32, S32}}) 469 .minScalar(0, S32) 470 // TODO: .scalarize(0) 471 .lower(); 472 473 getActionDefinitionsBuilder(G_BITCAST) 474 // Don't worry about the size constraint. 
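    // (G_BITCAST already requires both sides to have the same size, so
    // checking that each side is a register type is sufficient; e.g.
    // v2s32 <-> s64 and v4s16 <-> v2s32 are accepted.)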
475 .legalIf(all(isRegisterType(0), isRegisterType(1))) 476 .lower(); 477 478 479 getActionDefinitionsBuilder(G_CONSTANT) 480 .legalFor({S1, S32, S64, S16, GlobalPtr, 481 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 482 .clampScalar(0, S32, S64) 483 .widenScalarToNextPow2(0) 484 .legalIf(isPointer(0)); 485 486 getActionDefinitionsBuilder(G_FCONSTANT) 487 .legalFor({S32, S64, S16}) 488 .clampScalar(0, S16, S64); 489 490 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 491 .legalIf(isRegisterType(0)) 492 // s1 and s16 are special cases because they have legal operations on 493 // them, but don't really occupy registers in the normal way. 494 .legalFor({S1, S16}) 495 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 496 .clampScalarOrElt(0, S32, MaxScalar) 497 .widenScalarToNextPow2(0, 32) 498 .clampMaxNumElements(0, S32, 16); 499 500 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 501 502 // If the amount is divergent, we have to do a wave reduction to get the 503 // maximum value, so this is expanded during RegBankSelect. 504 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 505 .legalFor({{PrivatePtr, S32}}); 506 507 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 508 .unsupportedFor({PrivatePtr}) 509 .custom(); 510 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 511 512 auto &FPOpActions = getActionDefinitionsBuilder( 513 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 514 .legalFor({S32, S64}); 515 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 516 .customFor({S32, S64}); 517 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 518 .customFor({S32, S64}); 519 520 if (ST.has16BitInsts()) { 521 if (ST.hasVOP3PInsts()) 522 FPOpActions.legalFor({S16, V2S16}); 523 else 524 FPOpActions.legalFor({S16}); 525 526 TrigActions.customFor({S16}); 527 FDIVActions.customFor({S16}); 528 } 529 530 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 531 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 532 533 if (ST.hasVOP3PInsts()) { 534 MinNumMaxNum.customFor(FPTypesPK16) 535 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 536 .clampMaxNumElements(0, S16, 2) 537 .clampScalar(0, S16, S64) 538 .scalarize(0); 539 } else if (ST.has16BitInsts()) { 540 MinNumMaxNum.customFor(FPTypes16) 541 .clampScalar(0, S16, S64) 542 .scalarize(0); 543 } else { 544 MinNumMaxNum.customFor(FPTypesBase) 545 .clampScalar(0, S32, S64) 546 .scalarize(0); 547 } 548 549 if (ST.hasVOP3PInsts()) 550 FPOpActions.clampMaxNumElements(0, S16, 2); 551 552 FPOpActions 553 .scalarize(0) 554 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 555 556 TrigActions 557 .scalarize(0) 558 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 559 560 FDIVActions 561 .scalarize(0) 562 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 563 564 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 565 .legalFor(FPTypesPK16) 566 .clampMaxNumElements(0, S16, 2) 567 .scalarize(0) 568 .clampScalar(0, S16, S64); 569 570 if (ST.has16BitInsts()) { 571 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 572 .legalFor({S32, S64, S16}) 573 .scalarize(0) 574 .clampScalar(0, S16, S64); 575 } else { 576 getActionDefinitionsBuilder(G_FSQRT) 577 .legalFor({S32, S64}) 578 .scalarize(0) 579 .clampScalar(0, S32, S64); 580 581 if (ST.hasFractBug()) { 582 getActionDefinitionsBuilder(G_FFLOOR) 583 .customFor({S64}) 584 .legalFor({S32, S64}) 585 .scalarize(0) 586 .clampScalar(0, S32, S64); 587 } else { 588 getActionDefinitionsBuilder(G_FFLOOR) 589 .legalFor({S32, S64}) 590 .scalarize(0) 591 .clampScalar(0, S32, S64); 592 } 593 } 594 595 getActionDefinitionsBuilder(G_FPTRUNC) 596 .legalFor({{S32, S64}, {S16, S32}}) 597 .scalarize(0) 598 .lower(); 599 600 getActionDefinitionsBuilder(G_FPEXT) 601 .legalFor({{S64, S32}, {S32, S16}}) 602 .lowerFor({{S64, S16}}) // FIXME: Implement 603 .scalarize(0); 604 605 getActionDefinitionsBuilder(G_FSUB) 606 // Use actual fsub instruction 607 .legalFor({S32}) 608 // Must use fadd + fneg 609 .lowerFor({S64, S16, V2S16}) 610 .scalarize(0) 611 .clampScalar(0, S32, S64); 612 613 // Whether this is legal depends on the floating point mode for the function. 614 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 615 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 616 FMad.customFor({S32, S16}); 617 else if (ST.hasMadMacF32Insts()) 618 FMad.customFor({S32}); 619 else if (ST.hasMadF16()) 620 FMad.customFor({S16}); 621 FMad.scalarize(0) 622 .lower(); 623 624 // TODO: Do we need to clamp maximum bitwidth? 625 getActionDefinitionsBuilder(G_TRUNC) 626 .legalIf(isScalar(0)) 627 .legalFor({{V2S16, V2S32}}) 628 .clampMaxNumElements(0, S16, 2) 629 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 630 // situations (like an invalid implicit use), we don't want to infinite loop 631 // in the legalizer. 632 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 633 .alwaysLegal(); 634 635 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 636 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 637 {S32, S1}, {S64, S1}, {S16, S1}}) 638 .scalarize(0) 639 .clampScalar(0, S32, S64) 640 .widenScalarToNextPow2(1, 32); 641 642 // TODO: Split s1->s64 during regbankselect for VALU. 
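  // The s64 -> f64 case is marked custom and expanded in legalizeITOFP below:
  // the high 32 bits are converted and scaled by 2^32 with ldexp, then added
  // to the converted low 32 bits.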
643 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 644 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 645 .lowerFor({{S32, S64}}) 646 .lowerIf(typeIs(1, S1)) 647 .customFor({{S64, S64}}); 648 if (ST.has16BitInsts()) 649 IToFP.legalFor({{S16, S16}}); 650 IToFP.clampScalar(1, S32, S64) 651 .scalarize(0) 652 .widenScalarToNextPow2(1); 653 654 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 655 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 656 .customFor({{S64, S64}}); 657 if (ST.has16BitInsts()) 658 FPToI.legalFor({{S16, S16}}); 659 else 660 FPToI.minScalar(1, S32); 661 662 FPToI.minScalar(0, S32) 663 .scalarize(0) 664 .lower(); 665 666 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 667 .scalarize(0) 668 .lower(); 669 670 if (ST.has16BitInsts()) { 671 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 672 .legalFor({S16, S32, S64}) 673 .clampScalar(0, S16, S64) 674 .scalarize(0); 675 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 676 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 677 .legalFor({S32, S64}) 678 .clampScalar(0, S32, S64) 679 .scalarize(0); 680 } else { 681 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 682 .legalFor({S32}) 683 .customFor({S64}) 684 .clampScalar(0, S32, S64) 685 .scalarize(0); 686 } 687 688 // FIXME: Clamp offset operand. 689 getActionDefinitionsBuilder(G_PTR_ADD) 690 .legalIf(isPointer(0)) 691 .scalarize(0); 692 693 getActionDefinitionsBuilder(G_PTRMASK) 694 .legalIf(typeInSet(1, {S64, S32})) 695 .minScalar(1, S32) 696 .maxScalarIf(sizeIs(0, 32), 1, S32) 697 .maxScalarIf(sizeIs(0, 64), 1, S64) 698 .scalarize(0); 699 700 auto &CmpBuilder = 701 getActionDefinitionsBuilder(G_ICMP) 702 // The compare output type differs based on the register bank of the output, 703 // so make both s1 and s32 legal. 704 // 705 // Scalar compares producing output in scc will be promoted to s32, as that 706 // is the allocatable register type that will be needed for the copy from 707 // scc. This will be promoted during RegBankSelect, and we assume something 708 // before that won't try to use s32 result types. 709 // 710 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 711 // bank. 712 .legalForCartesianProduct( 713 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 714 .legalForCartesianProduct( 715 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 716 if (ST.has16BitInsts()) { 717 CmpBuilder.legalFor({{S1, S16}}); 718 } 719 720 CmpBuilder 721 .widenScalarToNextPow2(1) 722 .clampScalar(1, S32, S64) 723 .scalarize(0) 724 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 725 726 getActionDefinitionsBuilder(G_FCMP) 727 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 728 .widenScalarToNextPow2(1) 729 .clampScalar(1, S32, S64) 730 .scalarize(0); 731 732 // FIXME: fpow has a selection pattern that should move to custom lowering. 
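  // (G_FLOG and G_FLOG10 are handled in legalizeFlog in terms of log2, scaled
  // by the ln(2) and ln(2)/ln(10) factors passed from legalizeCustom below;
  // G_FEXP and G_FPOW are likewise expanded in legalizeFExp/legalizeFPow.)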
733 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 734 if (ST.has16BitInsts()) 735 Exp2Ops.legalFor({S32, S16}); 736 else 737 Exp2Ops.legalFor({S32}); 738 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 739 Exp2Ops.scalarize(0); 740 741 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 742 if (ST.has16BitInsts()) 743 ExpOps.customFor({{S32}, {S16}}); 744 else 745 ExpOps.customFor({S32}); 746 ExpOps.clampScalar(0, MinScalarFPTy, S32) 747 .scalarize(0); 748 749 // The 64-bit versions produce 32-bit results, but only on the SALU. 750 getActionDefinitionsBuilder(G_CTPOP) 751 .legalFor({{S32, S32}, {S32, S64}}) 752 .clampScalar(0, S32, S32) 753 .clampScalar(1, S32, S64) 754 .scalarize(0) 755 .widenScalarToNextPow2(0, 32) 756 .widenScalarToNextPow2(1, 32); 757 758 // The hardware instructions return a different result on 0 than the generic 759 // instructions expect. The hardware produces -1, but these produce the 760 // bitwidth. 761 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 762 .scalarize(0) 763 .clampScalar(0, S32, S32) 764 .clampScalar(1, S32, S64) 765 .widenScalarToNextPow2(0, 32) 766 .widenScalarToNextPow2(1, 32) 767 .lower(); 768 769 // The 64-bit versions produce 32-bit results, but only on the SALU. 770 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 771 .legalFor({{S32, S32}, {S32, S64}}) 772 .clampScalar(0, S32, S32) 773 .clampScalar(1, S32, S64) 774 .scalarize(0) 775 .widenScalarToNextPow2(0, 32) 776 .widenScalarToNextPow2(1, 32); 777 778 getActionDefinitionsBuilder(G_BITREVERSE) 779 .legalFor({S32}) 780 .clampScalar(0, S32, S32) 781 .scalarize(0); 782 783 if (ST.has16BitInsts()) { 784 getActionDefinitionsBuilder(G_BSWAP) 785 .legalFor({S16, S32, V2S16}) 786 .clampMaxNumElements(0, S16, 2) 787 // FIXME: Fixing non-power-of-2 before clamp is workaround for 788 // narrowScalar limitation. 789 .widenScalarToNextPow2(0) 790 .clampScalar(0, S16, S32) 791 .scalarize(0); 792 793 if (ST.hasVOP3PInsts()) { 794 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 795 .legalFor({S32, S16, V2S16}) 796 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 797 .clampMaxNumElements(0, S16, 2) 798 .minScalar(0, S16) 799 .widenScalarToNextPow2(0) 800 .scalarize(0) 801 .lower(); 802 } else { 803 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 804 .legalFor({S32, S16}) 805 .widenScalarToNextPow2(0) 806 .minScalar(0, S16) 807 .scalarize(0) 808 .lower(); 809 } 810 } else { 811 // TODO: Should have same legality without v_perm_b32 812 getActionDefinitionsBuilder(G_BSWAP) 813 .legalFor({S32}) 814 .lowerIf(scalarNarrowerThan(0, 32)) 815 // FIXME: Fixing non-power-of-2 before clamp is workaround for 816 // narrowScalar limitation. 
817 .widenScalarToNextPow2(0) 818 .maxScalar(0, S32) 819 .scalarize(0) 820 .lower(); 821 822 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 823 .legalFor({S32}) 824 .minScalar(0, S32) 825 .widenScalarToNextPow2(0) 826 .scalarize(0) 827 .lower(); 828 } 829 830 getActionDefinitionsBuilder(G_INTTOPTR) 831 // List the common cases 832 .legalForCartesianProduct(AddrSpaces64, {S64}) 833 .legalForCartesianProduct(AddrSpaces32, {S32}) 834 .scalarize(0) 835 // Accept any address space as long as the size matches 836 .legalIf(sameSize(0, 1)) 837 .widenScalarIf(smallerThan(1, 0), 838 [](const LegalityQuery &Query) { 839 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 840 }) 841 .narrowScalarIf(largerThan(1, 0), 842 [](const LegalityQuery &Query) { 843 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 844 }); 845 846 getActionDefinitionsBuilder(G_PTRTOINT) 847 // List the common cases 848 .legalForCartesianProduct(AddrSpaces64, {S64}) 849 .legalForCartesianProduct(AddrSpaces32, {S32}) 850 .scalarize(0) 851 // Accept any address space as long as the size matches 852 .legalIf(sameSize(0, 1)) 853 .widenScalarIf(smallerThan(0, 1), 854 [](const LegalityQuery &Query) { 855 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 856 }) 857 .narrowScalarIf( 858 largerThan(0, 1), 859 [](const LegalityQuery &Query) { 860 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 861 }); 862 863 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 864 .scalarize(0) 865 .custom(); 866 867 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 868 bool IsLoad) -> bool { 869 const LLT DstTy = Query.Types[0]; 870 871 // Split vector extloads. 872 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 873 unsigned Align = Query.MMODescrs[0].AlignInBits; 874 875 if (MemSize < DstTy.getSizeInBits()) 876 MemSize = std::max(MemSize, Align); 877 878 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 879 return true; 880 881 const LLT PtrTy = Query.Types[1]; 882 unsigned AS = PtrTy.getAddressSpace(); 883 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 884 return true; 885 886 // Catch weird sized loads that don't evenly divide into the access sizes 887 // TODO: May be able to widen depending on alignment etc. 888 unsigned NumRegs = (MemSize + 31) / 32; 889 if (NumRegs == 3) { 890 if (!ST.hasDwordx3LoadStores()) 891 return true; 892 } else { 893 // If the alignment allows, these should have been widened. 894 if (!isPowerOf2_32(NumRegs)) 895 return true; 896 } 897 898 if (Align < MemSize) { 899 const SITargetLowering *TLI = ST.getTargetLowering(); 900 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 901 } 902 903 return false; 904 }; 905 906 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 907 unsigned Opc) -> bool { 908 unsigned Size = Query.Types[0].getSizeInBits(); 909 if (isPowerOf2_32(Size)) 910 return false; 911 912 if (Size == 96 && ST.hasDwordx3LoadStores()) 913 return false; 914 915 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 916 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 917 return false; 918 919 unsigned Align = Query.MMODescrs[0].AlignInBits; 920 unsigned RoundedSize = NextPowerOf2(Size); 921 return (Align >= RoundedSize); 922 }; 923 924 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 925 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 926 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 927 928 // TODO: Refine based on subtargets which support unaligned access or 128-bit 929 // LDS 930 // TODO: Unsupported flat for SI. 931 932 for (unsigned Op : {G_LOAD, G_STORE}) { 933 const bool IsStore = Op == G_STORE; 934 935 auto &Actions = getActionDefinitionsBuilder(Op); 936 // Explicitly list some common cases. 937 // TODO: Does this help compile time at all? 938 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 939 {V2S32, GlobalPtr, 64, GlobalAlign32}, 940 {V4S32, GlobalPtr, 128, GlobalAlign32}, 941 {S64, GlobalPtr, 64, GlobalAlign32}, 942 {V2S64, GlobalPtr, 128, GlobalAlign32}, 943 {V2S16, GlobalPtr, 32, GlobalAlign32}, 944 {S32, GlobalPtr, 8, GlobalAlign8}, 945 {S32, GlobalPtr, 16, GlobalAlign16}, 946 947 {S32, LocalPtr, 32, 32}, 948 {S64, LocalPtr, 64, 32}, 949 {V2S32, LocalPtr, 64, 32}, 950 {S32, LocalPtr, 8, 8}, 951 {S32, LocalPtr, 16, 16}, 952 {V2S16, LocalPtr, 32, 32}, 953 954 {S32, PrivatePtr, 32, 32}, 955 {S32, PrivatePtr, 8, 8}, 956 {S32, PrivatePtr, 16, 16}, 957 {V2S16, PrivatePtr, 32, 32}, 958 959 {S32, ConstantPtr, 32, GlobalAlign32}, 960 {V2S32, ConstantPtr, 64, GlobalAlign32}, 961 {V4S32, ConstantPtr, 128, GlobalAlign32}, 962 {S64, ConstantPtr, 64, GlobalAlign32}, 963 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 964 Actions.legalIf( 965 [=](const LegalityQuery &Query) -> bool { 966 return isLoadStoreLegal(ST, Query, Op); 967 }); 968 969 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 970 // 64-bits. 971 // 972 // TODO: Should generalize bitcast action into coerce, which will also cover 973 // inserting addrspacecasts. 974 Actions.customIf(typeIs(1, Constant32Ptr)); 975 976 // Turn any illegal element vectors into something easier to deal 977 // with. These will ultimately produce 32-bit scalar shifts to extract the 978 // parts anyway. 979 // 980 // For odd 16-bit element vectors, prefer to split those into pieces with 981 // 16-bit vector parts. 982 Actions.bitcastIf( 983 [=](const LegalityQuery &Query) -> bool { 984 const LLT Ty = Query.Types[0]; 985 const unsigned Size = Ty.getSizeInBits(); 986 987 if (Size != Query.MMODescrs[0].SizeInBits) 988 return Size <= 32 && Ty.isVector(); 989 990 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 991 return true; 992 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 993 !isRegisterVectorElementType(Ty.getElementType()); 994 }, bitcastToRegisterType(0)); 995 996 Actions 997 .customIf(typeIs(1, Constant32Ptr)) 998 // Widen suitably aligned loads by loading extra elements. 999 .moreElementsIf([=](const LegalityQuery &Query) { 1000 const LLT Ty = Query.Types[0]; 1001 return Op == G_LOAD && Ty.isVector() && 1002 shouldWidenLoadResult(Query, Op); 1003 }, moreElementsToNextPow2(0)) 1004 .widenScalarIf([=](const LegalityQuery &Query) { 1005 const LLT Ty = Query.Types[0]; 1006 return Op == G_LOAD && !Ty.isVector() && 1007 shouldWidenLoadResult(Query, Op); 1008 }, widenScalarOrEltToNextPow2(0)) 1009 .narrowScalarIf( 1010 [=](const LegalityQuery &Query) -> bool { 1011 return !Query.Types[0].isVector() && 1012 needToSplitMemOp(Query, Op == G_LOAD); 1013 }, 1014 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1015 const LLT DstTy = Query.Types[0]; 1016 const LLT PtrTy = Query.Types[1]; 1017 1018 const unsigned DstSize = DstTy.getSizeInBits(); 1019 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1020 1021 // Split extloads. 
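            // (For example, an s64 result with a 32-bit memory access becomes
            // an s32 load here; the legalizer then widens the s32 result back
            // to s64 with an extension.)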
1022 if (DstSize > MemSize) 1023 return std::make_pair(0, LLT::scalar(MemSize)); 1024 1025 if (!isPowerOf2_32(DstSize)) { 1026 // We're probably decomposing an odd sized store. Try to split 1027 // to the widest type. TODO: Account for alignment. As-is it 1028 // should be OK, since the new parts will be further legalized. 1029 unsigned FloorSize = PowerOf2Floor(DstSize); 1030 return std::make_pair(0, LLT::scalar(FloorSize)); 1031 } 1032 1033 if (DstSize > 32 && (DstSize % 32 != 0)) { 1034 // FIXME: Need a way to specify non-extload of larger size if 1035 // suitably aligned. 1036 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1037 } 1038 1039 unsigned MaxSize = maxSizeForAddrSpace(ST, 1040 PtrTy.getAddressSpace(), 1041 Op == G_LOAD); 1042 if (MemSize > MaxSize) 1043 return std::make_pair(0, LLT::scalar(MaxSize)); 1044 1045 unsigned Align = Query.MMODescrs[0].AlignInBits; 1046 return std::make_pair(0, LLT::scalar(Align)); 1047 }) 1048 .fewerElementsIf( 1049 [=](const LegalityQuery &Query) -> bool { 1050 return Query.Types[0].isVector() && 1051 needToSplitMemOp(Query, Op == G_LOAD); 1052 }, 1053 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1054 const LLT DstTy = Query.Types[0]; 1055 const LLT PtrTy = Query.Types[1]; 1056 1057 LLT EltTy = DstTy.getElementType(); 1058 unsigned MaxSize = maxSizeForAddrSpace(ST, 1059 PtrTy.getAddressSpace(), 1060 Op == G_LOAD); 1061 1062 // FIXME: Handle widened to power of 2 results better. This ends 1063 // up scalarizing. 1064 // FIXME: 3 element stores scalarized on SI 1065 1066 // Split if it's too large for the address space. 1067 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1068 unsigned NumElts = DstTy.getNumElements(); 1069 unsigned EltSize = EltTy.getSizeInBits(); 1070 1071 if (MaxSize % EltSize == 0) { 1072 return std::make_pair( 1073 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1074 } 1075 1076 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1077 1078 // FIXME: Refine when odd breakdowns handled 1079 // The scalars will need to be re-legalized. 1080 if (NumPieces == 1 || NumPieces >= NumElts || 1081 NumElts % NumPieces != 0) 1082 return std::make_pair(0, EltTy); 1083 1084 return std::make_pair(0, 1085 LLT::vector(NumElts / NumPieces, EltTy)); 1086 } 1087 1088 // FIXME: We could probably handle weird extending loads better. 1089 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1090 if (DstTy.getSizeInBits() > MemSize) 1091 return std::make_pair(0, EltTy); 1092 1093 unsigned EltSize = EltTy.getSizeInBits(); 1094 unsigned DstSize = DstTy.getSizeInBits(); 1095 if (!isPowerOf2_32(DstSize)) { 1096 // We're probably decomposing an odd sized store. Try to split 1097 // to the widest type. TODO: Account for alignment. As-is it 1098 // should be OK, since the new parts will be further legalized. 1099 unsigned FloorSize = PowerOf2Floor(DstSize); 1100 return std::make_pair( 1101 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1102 } 1103 1104 // Need to split because of alignment. 1105 unsigned Align = Query.MMODescrs[0].AlignInBits; 1106 if (EltSize > Align && 1107 (EltSize / Align < DstTy.getNumElements())) { 1108 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1109 } 1110 1111 // May need relegalization for the scalars. 1112 return std::make_pair(0, EltTy); 1113 }) 1114 .minScalar(0, S32); 1115 1116 if (IsStore) 1117 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1118 1119 // TODO: Need a bitcast lower option? 
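    // Finally, round scalar results up to a power of two and pad sub-32-bit
    // vectors out to a multiple of 32 bits so that what remains is a register
    // type.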
1120 Actions 1121 .widenScalarToNextPow2(0) 1122 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1123 } 1124 1125 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1126 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1127 {S32, GlobalPtr, 16, 2 * 8}, 1128 {S32, LocalPtr, 8, 8}, 1129 {S32, LocalPtr, 16, 16}, 1130 {S32, PrivatePtr, 8, 8}, 1131 {S32, PrivatePtr, 16, 16}, 1132 {S32, ConstantPtr, 8, 8}, 1133 {S32, ConstantPtr, 16, 2 * 8}}); 1134 if (ST.hasFlatAddressSpace()) { 1135 ExtLoads.legalForTypesWithMemDesc( 1136 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1137 } 1138 1139 ExtLoads.clampScalar(0, S32, S32) 1140 .widenScalarToNextPow2(0) 1141 .unsupportedIfMemSizeNotPow2() 1142 .lower(); 1143 1144 auto &Atomics = getActionDefinitionsBuilder( 1145 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1146 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1147 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1148 G_ATOMICRMW_UMIN}) 1149 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1150 {S64, GlobalPtr}, {S64, LocalPtr}}); 1151 if (ST.hasFlatAddressSpace()) { 1152 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1153 } 1154 1155 if (ST.hasLDSFPAtomics()) { 1156 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1157 .legalFor({{S32, LocalPtr}}); 1158 } 1159 1160 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1161 // demarshalling 1162 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1163 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1164 {S32, FlatPtr}, {S64, FlatPtr}}) 1165 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1166 {S32, RegionPtr}, {S64, RegionPtr}}); 1167 // TODO: Pointer types, any 32-bit or 64-bit vector 1168 1169 // Condition should be s32 for scalar, s1 for vector. 1170 getActionDefinitionsBuilder(G_SELECT) 1171 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1172 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1173 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1174 .clampScalar(0, S16, S64) 1175 .scalarize(1) 1176 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1177 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1178 .clampMaxNumElements(0, S32, 2) 1179 .clampMaxNumElements(0, LocalPtr, 2) 1180 .clampMaxNumElements(0, PrivatePtr, 2) 1181 .scalarize(0) 1182 .widenScalarToNextPow2(0) 1183 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1184 1185 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1186 // be more flexible with the shift amount type. 1187 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1188 .legalFor({{S32, S32}, {S64, S32}}); 1189 if (ST.has16BitInsts()) { 1190 if (ST.hasVOP3PInsts()) { 1191 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1192 .clampMaxNumElements(0, S16, 2); 1193 } else 1194 Shifts.legalFor({{S16, S16}}); 1195 1196 // TODO: Support 16-bit shift amounts for all types 1197 Shifts.widenScalarIf( 1198 [=](const LegalityQuery &Query) { 1199 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1200 // 32-bit amount. 
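          // e.g. (s16 = G_SHL s16, s8) widens the amount to s16 here, while
          // (s32 = G_SHL s32, s16) is left for the clampScalar below to widen
          // the amount to s32.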
1201 const LLT ValTy = Query.Types[0]; 1202 const LLT AmountTy = Query.Types[1]; 1203 return ValTy.getSizeInBits() <= 16 && 1204 AmountTy.getSizeInBits() < 16; 1205 }, changeTo(1, S16)); 1206 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1207 Shifts.clampScalar(1, S32, S32); 1208 Shifts.clampScalar(0, S16, S64); 1209 Shifts.widenScalarToNextPow2(0, 16); 1210 } else { 1211 // Make sure we legalize the shift amount type first, as the general 1212 // expansion for the shifted type will produce much worse code if it hasn't 1213 // been truncated already. 1214 Shifts.clampScalar(1, S32, S32); 1215 Shifts.clampScalar(0, S32, S64); 1216 Shifts.widenScalarToNextPow2(0, 32); 1217 } 1218 Shifts.scalarize(0); 1219 1220 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1221 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1222 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1223 unsigned IdxTypeIdx = 2; 1224 1225 getActionDefinitionsBuilder(Op) 1226 .customIf([=](const LegalityQuery &Query) { 1227 const LLT EltTy = Query.Types[EltTypeIdx]; 1228 const LLT VecTy = Query.Types[VecTypeIdx]; 1229 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1230 return (EltTy.getSizeInBits() == 16 || 1231 EltTy.getSizeInBits() % 32 == 0) && 1232 VecTy.getSizeInBits() % 32 == 0 && 1233 VecTy.getSizeInBits() <= MaxRegisterSize && 1234 IdxTy.getSizeInBits() == 32; 1235 }) 1236 .clampScalar(EltTypeIdx, S32, S64) 1237 .clampScalar(VecTypeIdx, S32, S64) 1238 .clampScalar(IdxTypeIdx, S32, S32); 1239 } 1240 1241 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1242 .unsupportedIf([=](const LegalityQuery &Query) { 1243 const LLT &EltTy = Query.Types[1].getElementType(); 1244 return Query.Types[0] != EltTy; 1245 }); 1246 1247 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1248 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1249 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1250 1251 // FIXME: Doesn't handle extract of illegal sizes. 1252 getActionDefinitionsBuilder(Op) 1253 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1254 // FIXME: Multiples of 16 should not be legal. 
1255 .legalIf([=](const LegalityQuery &Query) { 1256 const LLT BigTy = Query.Types[BigTyIdx]; 1257 const LLT LitTy = Query.Types[LitTyIdx]; 1258 return (BigTy.getSizeInBits() % 32 == 0) && 1259 (LitTy.getSizeInBits() % 16 == 0); 1260 }) 1261 .widenScalarIf( 1262 [=](const LegalityQuery &Query) { 1263 const LLT BigTy = Query.Types[BigTyIdx]; 1264 return (BigTy.getScalarSizeInBits() < 16); 1265 }, 1266 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1267 .widenScalarIf( 1268 [=](const LegalityQuery &Query) { 1269 const LLT LitTy = Query.Types[LitTyIdx]; 1270 return (LitTy.getScalarSizeInBits() < 16); 1271 }, 1272 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1273 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1274 .widenScalarToNextPow2(BigTyIdx, 32); 1275 1276 } 1277 1278 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1279 .legalForCartesianProduct(AllS32Vectors, {S32}) 1280 .legalForCartesianProduct(AllS64Vectors, {S64}) 1281 .clampNumElements(0, V16S32, V32S32) 1282 .clampNumElements(0, V2S64, V16S64) 1283 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1284 1285 if (ST.hasScalarPackInsts()) { 1286 BuildVector 1287 // FIXME: Should probably widen s1 vectors straight to s32 1288 .minScalarOrElt(0, S16) 1289 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1290 .minScalar(1, S32); 1291 1292 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1293 .legalFor({V2S16, S32}) 1294 .lower(); 1295 BuildVector.minScalarOrElt(0, S32); 1296 } else { 1297 BuildVector.customFor({V2S16, S16}); 1298 BuildVector.minScalarOrElt(0, S32); 1299 1300 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1301 .customFor({V2S16, S32}) 1302 .lower(); 1303 } 1304 1305 BuildVector.legalIf(isRegisterType(0)); 1306 1307 // FIXME: Clamp maximum size 1308 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1309 .legalIf(isRegisterType(0)); 1310 1311 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1312 // pre-legalize. 1313 if (ST.hasVOP3PInsts()) { 1314 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1315 .customFor({V2S16, V2S16}) 1316 .lower(); 1317 } else 1318 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1319 1320 // Merge/Unmerge 1321 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1322 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1323 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1324 1325 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1326 const LLT Ty = Query.Types[TypeIdx]; 1327 if (Ty.isVector()) { 1328 const LLT &EltTy = Ty.getElementType(); 1329 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1330 return true; 1331 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1332 return true; 1333 } 1334 return false; 1335 }; 1336 1337 auto &Builder = getActionDefinitionsBuilder(Op) 1338 .lowerFor({{S16, V2S16}}) 1339 .lowerIf([=](const LegalityQuery &Query) { 1340 const LLT BigTy = Query.Types[BigTyIdx]; 1341 return BigTy.getSizeInBits() == 32; 1342 }) 1343 // Try to widen to s16 first for small types. 1344 // TODO: Only do this on targets with legal s16 shifts 1345 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1346 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1347 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1348 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1349 elementTypeIs(1, S16)), 1350 changeTo(1, V2S16)) 1351 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not 1352 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1353 // valid. 1354 .clampScalar(LitTyIdx, S32, S512) 1355 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1356 // Break up vectors with weird elements into scalars 1357 .fewerElementsIf( 1358 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1359 scalarize(0)) 1360 .fewerElementsIf( 1361 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1362 scalarize(1)) 1363 .clampScalar(BigTyIdx, S32, MaxScalar); 1364 1365 if (Op == G_MERGE_VALUES) { 1366 Builder.widenScalarIf( 1367 // TODO: Use 16-bit shifts if legal for 8-bit values? 1368 [=](const LegalityQuery &Query) { 1369 const LLT Ty = Query.Types[LitTyIdx]; 1370 return Ty.getSizeInBits() < 32; 1371 }, 1372 changeTo(LitTyIdx, S32)); 1373 } 1374 1375 Builder.widenScalarIf( 1376 [=](const LegalityQuery &Query) { 1377 const LLT Ty = Query.Types[BigTyIdx]; 1378 return !isPowerOf2_32(Ty.getSizeInBits()) && 1379 Ty.getSizeInBits() % 16 != 0; 1380 }, 1381 [=](const LegalityQuery &Query) { 1382 // Pick the next power of 2, or a multiple of 64 over 128. 1383 // Whichever is smaller. 1384 const LLT &Ty = Query.Types[BigTyIdx]; 1385 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1386 if (NewSizeInBits >= 256) { 1387 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1388 if (RoundedTo < NewSizeInBits) 1389 NewSizeInBits = RoundedTo; 1390 } 1391 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1392 }) 1393 .legalIf([=](const LegalityQuery &Query) { 1394 const LLT &BigTy = Query.Types[BigTyIdx]; 1395 const LLT &LitTy = Query.Types[LitTyIdx]; 1396 1397 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1398 return false; 1399 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1400 return false; 1401 1402 return BigTy.getSizeInBits() % 16 == 0 && 1403 LitTy.getSizeInBits() % 16 == 0 && 1404 BigTy.getSizeInBits() <= MaxRegisterSize; 1405 }) 1406 // Any vectors left are the wrong size. Scalarize them. 1407 .scalarize(0) 1408 .scalarize(1); 1409 } 1410 1411 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1412 // RegBankSelect. 1413 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1414 .legalFor({{S32}, {S64}}); 1415 1416 if (ST.hasVOP3PInsts()) { 1417 SextInReg.lowerFor({{V2S16}}) 1418 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1419 // get more vector shift opportunities, since we'll get those when 1420 // expanded. 1421 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1422 } else if (ST.has16BitInsts()) { 1423 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1424 } else { 1425 // Prefer to promote to s32 before lowering if we don't have 16-bit 1426 // shifts. This avoid a lot of intermediate truncate and extend operations. 
1427 SextInReg.lowerFor({{S32}, {S64}}); 1428 } 1429 1430 SextInReg 1431 .scalarize(0) 1432 .clampScalar(0, S32, S64) 1433 .lower(); 1434 1435 getActionDefinitionsBuilder(G_FSHR) 1436 .legalFor({{S32, S32}}) 1437 .scalarize(0) 1438 .lower(); 1439 1440 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1441 .legalFor({S64}); 1442 1443 getActionDefinitionsBuilder({ 1444 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1445 G_FCOPYSIGN, 1446 1447 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1448 G_READ_REGISTER, 1449 G_WRITE_REGISTER, 1450 1451 G_SADDO, G_SSUBO, 1452 1453 // TODO: Implement 1454 G_FMINIMUM, G_FMAXIMUM, 1455 G_FSHL 1456 }).lower(); 1457 1458 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1459 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1460 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1461 .unsupported(); 1462 1463 computeTables(); 1464 verify(*ST.getInstrInfo()); 1465 } 1466 1467 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1468 MachineInstr &MI) const { 1469 MachineIRBuilder &B = Helper.MIRBuilder; 1470 MachineRegisterInfo &MRI = *B.getMRI(); 1471 GISelChangeObserver &Observer = Helper.Observer; 1472 1473 switch (MI.getOpcode()) { 1474 case TargetOpcode::G_ADDRSPACE_CAST: 1475 return legalizeAddrSpaceCast(MI, MRI, B); 1476 case TargetOpcode::G_FRINT: 1477 return legalizeFrint(MI, MRI, B); 1478 case TargetOpcode::G_FCEIL: 1479 return legalizeFceil(MI, MRI, B); 1480 case TargetOpcode::G_INTRINSIC_TRUNC: 1481 return legalizeIntrinsicTrunc(MI, MRI, B); 1482 case TargetOpcode::G_SITOFP: 1483 return legalizeITOFP(MI, MRI, B, true); 1484 case TargetOpcode::G_UITOFP: 1485 return legalizeITOFP(MI, MRI, B, false); 1486 case TargetOpcode::G_FPTOSI: 1487 return legalizeFPTOI(MI, MRI, B, true); 1488 case TargetOpcode::G_FPTOUI: 1489 return legalizeFPTOI(MI, MRI, B, false); 1490 case TargetOpcode::G_FMINNUM: 1491 case TargetOpcode::G_FMAXNUM: 1492 case TargetOpcode::G_FMINNUM_IEEE: 1493 case TargetOpcode::G_FMAXNUM_IEEE: 1494 return legalizeMinNumMaxNum(Helper, MI); 1495 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1496 return legalizeExtractVectorElt(MI, MRI, B); 1497 case TargetOpcode::G_INSERT_VECTOR_ELT: 1498 return legalizeInsertVectorElt(MI, MRI, B); 1499 case TargetOpcode::G_SHUFFLE_VECTOR: 1500 return legalizeShuffleVector(MI, MRI, B); 1501 case TargetOpcode::G_FSIN: 1502 case TargetOpcode::G_FCOS: 1503 return legalizeSinCos(MI, MRI, B); 1504 case TargetOpcode::G_GLOBAL_VALUE: 1505 return legalizeGlobalValue(MI, MRI, B); 1506 case TargetOpcode::G_LOAD: 1507 return legalizeLoad(MI, MRI, B, Observer); 1508 case TargetOpcode::G_FMAD: 1509 return legalizeFMad(MI, MRI, B); 1510 case TargetOpcode::G_FDIV: 1511 return legalizeFDIV(MI, MRI, B); 1512 case TargetOpcode::G_UDIV: 1513 case TargetOpcode::G_UREM: 1514 return legalizeUDIV_UREM(MI, MRI, B); 1515 case TargetOpcode::G_SDIV: 1516 case TargetOpcode::G_SREM: 1517 return legalizeSDIV_SREM(MI, MRI, B); 1518 case TargetOpcode::G_ATOMIC_CMPXCHG: 1519 return legalizeAtomicCmpXChg(MI, MRI, B); 1520 case TargetOpcode::G_FLOG: 1521 return legalizeFlog(MI, B, numbers::ln2f); 1522 case TargetOpcode::G_FLOG10: 1523 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1524 case TargetOpcode::G_FEXP: 1525 return legalizeFExp(MI, B); 1526 case TargetOpcode::G_FPOW: 1527 return legalizeFPow(MI, B); 1528 case TargetOpcode::G_FFLOOR: 1529 return legalizeFFloor(MI, MRI, B); 1530 case TargetOpcode::G_BUILD_VECTOR: 1531 return legalizeBuildVector(MI, MRI, B); 1532 default: 1533 return false; 1534 } 1535 1536 
llvm_unreachable("expected switch to return"); 1537 } 1538 1539 Register AMDGPULegalizerInfo::getSegmentAperture( 1540 unsigned AS, 1541 MachineRegisterInfo &MRI, 1542 MachineIRBuilder &B) const { 1543 MachineFunction &MF = B.getMF(); 1544 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1545 const LLT S32 = LLT::scalar(32); 1546 1547 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1548 1549 if (ST.hasApertureRegs()) { 1550 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1551 // getreg. 1552 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1553 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1554 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1555 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1556 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1557 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1558 unsigned Encoding = 1559 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1560 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1561 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1562 1563 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1564 1565 B.buildInstr(AMDGPU::S_GETREG_B32) 1566 .addDef(GetReg) 1567 .addImm(Encoding); 1568 MRI.setType(GetReg, S32); 1569 1570 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1571 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1572 } 1573 1574 Register QueuePtr = MRI.createGenericVirtualRegister( 1575 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1576 1577 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1578 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1579 return Register(); 1580 1581 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1582 // private_segment_aperture_base_hi. 1583 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1584 1585 // TODO: can we be smarter about machine pointer info? 1586 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1587 MachineMemOperand *MMO = MF.getMachineMemOperand( 1588 PtrInfo, 1589 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1590 MachineMemOperand::MOInvariant, 1591 4, commonAlignment(Align(64), StructOffset)); 1592 1593 Register LoadAddr; 1594 1595 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1596 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1597 } 1598 1599 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1600 MachineInstr &MI, MachineRegisterInfo &MRI, 1601 MachineIRBuilder &B) const { 1602 MachineFunction &MF = B.getMF(); 1603 1604 const LLT S32 = LLT::scalar(32); 1605 Register Dst = MI.getOperand(0).getReg(); 1606 Register Src = MI.getOperand(1).getReg(); 1607 1608 LLT DstTy = MRI.getType(Dst); 1609 LLT SrcTy = MRI.getType(Src); 1610 unsigned DestAS = DstTy.getAddressSpace(); 1611 unsigned SrcAS = SrcTy.getAddressSpace(); 1612 1613 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1614 // vector element. 1615 assert(!DstTy.isVector()); 1616 1617 const AMDGPUTargetMachine &TM 1618 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1619 1620 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1621 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1622 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1623 return true; 1624 } 1625 1626 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1627 // Truncate. 
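    // Casting a 64-bit pointer into the 32-bit constant address space just
    // keeps the low 32 bits of the pointer.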
1628 B.buildExtract(Dst, Src, 0); 1629 MI.eraseFromParent(); 1630 return true; 1631 } 1632 1633 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1634 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1635 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1636 1637 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1638 // another. Merge operands are required to be the same type, but creating an 1639 // extra ptrtoint would be kind of pointless. 1640 auto HighAddr = B.buildConstant( 1641 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1642 B.buildMerge(Dst, {Src, HighAddr}); 1643 MI.eraseFromParent(); 1644 return true; 1645 } 1646 1647 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1648 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1649 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1650 unsigned NullVal = TM.getNullPointerValue(DestAS); 1651 1652 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1653 auto FlatNull = B.buildConstant(SrcTy, 0); 1654 1655 // Extract low 32-bits of the pointer. 1656 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1657 1658 auto CmpRes = 1659 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1660 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1661 1662 MI.eraseFromParent(); 1663 return true; 1664 } 1665 1666 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1667 return false; 1668 1669 if (!ST.hasFlatAddressSpace()) 1670 return false; 1671 1672 auto SegmentNull = 1673 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1674 auto FlatNull = 1675 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1676 1677 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1678 if (!ApertureReg.isValid()) 1679 return false; 1680 1681 auto CmpRes = 1682 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1683 1684 // Coerce the type of the low half of the result so we can use merge_values. 1685 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1686 1687 // TODO: Should we allow mismatched types but matching sizes in merges to 1688 // avoid the ptrtoint? 1689 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1690 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1691 1692 MI.eraseFromParent(); 1693 return true; 1694 } 1695 1696 bool AMDGPULegalizerInfo::legalizeFrint( 1697 MachineInstr &MI, MachineRegisterInfo &MRI, 1698 MachineIRBuilder &B) const { 1699 Register Src = MI.getOperand(1).getReg(); 1700 LLT Ty = MRI.getType(Src); 1701 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1702 1703 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1704 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1705 1706 auto C1 = B.buildFConstant(Ty, C1Val); 1707 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1708 1709 // TODO: Should this propagate fast-math-flags? 
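  // Adding 2^52 (with the input's sign copied onto it) and then subtracting
  // it again flushes the fractional bits out of the f64 mantissa, leaving the
  // value rounded to an integer in the current rounding mode. Inputs with a
  // magnitude above 0x1.fffffffffffffp+51 are already integral and are
  // returned unchanged by the select below.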
1710 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1711 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1712 1713 auto C2 = B.buildFConstant(Ty, C2Val); 1714 auto Fabs = B.buildFAbs(Ty, Src); 1715 1716 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1717 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1718 return true; 1719 } 1720 1721 bool AMDGPULegalizerInfo::legalizeFceil( 1722 MachineInstr &MI, MachineRegisterInfo &MRI, 1723 MachineIRBuilder &B) const { 1724 1725 const LLT S1 = LLT::scalar(1); 1726 const LLT S64 = LLT::scalar(64); 1727 1728 Register Src = MI.getOperand(1).getReg(); 1729 assert(MRI.getType(Src) == S64); 1730 1731 // result = trunc(src) 1732 // if (src > 0.0 && src != result) 1733 // result += 1.0 1734 1735 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1736 1737 const auto Zero = B.buildFConstant(S64, 0.0); 1738 const auto One = B.buildFConstant(S64, 1.0); 1739 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1740 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1741 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1742 auto Add = B.buildSelect(S64, And, One, Zero); 1743 1744 // TODO: Should this propagate fast-math-flags? 1745 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1746 return true; 1747 } 1748 1749 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1750 MachineIRBuilder &B) { 1751 const unsigned FractBits = 52; 1752 const unsigned ExpBits = 11; 1753 LLT S32 = LLT::scalar(32); 1754 1755 auto Const0 = B.buildConstant(S32, FractBits - 32); 1756 auto Const1 = B.buildConstant(S32, ExpBits); 1757 1758 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1759 .addUse(Const0.getReg(0)) 1760 .addUse(Const1.getReg(0)); 1761 1762 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1763 } 1764 1765 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1766 MachineInstr &MI, MachineRegisterInfo &MRI, 1767 MachineIRBuilder &B) const { 1768 const LLT S1 = LLT::scalar(1); 1769 const LLT S32 = LLT::scalar(32); 1770 const LLT S64 = LLT::scalar(64); 1771 1772 Register Src = MI.getOperand(1).getReg(); 1773 assert(MRI.getType(Src) == S64); 1774 1775 // TODO: Should this use extract since the low half is unused? 1776 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1777 Register Hi = Unmerge.getReg(1); 1778 1779 // Extract the upper half, since this is where we will find the sign and 1780 // exponent. 1781 auto Exp = extractF64Exponent(Hi, B); 1782 1783 const unsigned FractBits = 52; 1784 1785 // Extract the sign bit. 1786 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1787 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1788 1789 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1790 1791 const auto Zero32 = B.buildConstant(S32, 0); 1792 1793 // Extend back to 64-bits. 
1794 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1795 1796 auto Shr = B.buildAShr(S64, FractMask, Exp); 1797 auto Not = B.buildNot(S64, Shr); 1798 auto Tmp0 = B.buildAnd(S64, Src, Not); 1799 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1800 1801 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1802 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1803 1804 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1805 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1806 return true; 1807 } 1808 1809 bool AMDGPULegalizerInfo::legalizeITOFP( 1810 MachineInstr &MI, MachineRegisterInfo &MRI, 1811 MachineIRBuilder &B, bool Signed) const { 1812 1813 Register Dst = MI.getOperand(0).getReg(); 1814 Register Src = MI.getOperand(1).getReg(); 1815 1816 const LLT S64 = LLT::scalar(64); 1817 const LLT S32 = LLT::scalar(32); 1818 1819 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1820 1821 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1822 1823 auto CvtHi = Signed ? 1824 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1825 B.buildUITOFP(S64, Unmerge.getReg(1)); 1826 1827 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1828 1829 auto ThirtyTwo = B.buildConstant(S32, 32); 1830 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1831 .addUse(CvtHi.getReg(0)) 1832 .addUse(ThirtyTwo.getReg(0)); 1833 1834 // TODO: Should this propagate fast-math-flags? 1835 B.buildFAdd(Dst, LdExp, CvtLo); 1836 MI.eraseFromParent(); 1837 return true; 1838 } 1839 1840 // TODO: Copied from DAG implementation. Verify logic and document how this 1841 // actually works. 1842 bool AMDGPULegalizerInfo::legalizeFPTOI( 1843 MachineInstr &MI, MachineRegisterInfo &MRI, 1844 MachineIRBuilder &B, bool Signed) const { 1845 1846 Register Dst = MI.getOperand(0).getReg(); 1847 Register Src = MI.getOperand(1).getReg(); 1848 1849 const LLT S64 = LLT::scalar(64); 1850 const LLT S32 = LLT::scalar(32); 1851 1852 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1853 1854 unsigned Flags = MI.getFlags(); 1855 1856 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1857 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1858 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1859 1860 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1861 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1862 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1863 1864 auto Hi = Signed ? 
1865 B.buildFPTOSI(S32, FloorMul) : 1866 B.buildFPTOUI(S32, FloorMul); 1867 auto Lo = B.buildFPTOUI(S32, Fma); 1868 1869 B.buildMerge(Dst, { Lo, Hi }); 1870 MI.eraseFromParent(); 1871 1872 return true; 1873 } 1874 1875 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1876 MachineInstr &MI) const { 1877 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1878 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1879 1880 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1881 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1882 1883 // With ieee_mode disabled, the instructions have the correct behavior 1884 // already for G_FMINNUM/G_FMAXNUM 1885 if (!MFI->getMode().IEEE) 1886 return !IsIEEEOp; 1887 1888 if (IsIEEEOp) 1889 return true; 1890 1891 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1892 } 1893 1894 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1895 MachineInstr &MI, MachineRegisterInfo &MRI, 1896 MachineIRBuilder &B) const { 1897 // TODO: Should move some of this into LegalizerHelper. 1898 1899 // TODO: Promote dynamic indexing of s16 to s32 1900 1901 // FIXME: Artifact combiner probably should have replaced the truncated 1902 // constant before this, so we shouldn't need 1903 // getConstantVRegValWithLookThrough. 1904 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1905 MI.getOperand(2).getReg(), MRI); 1906 if (!IdxVal) // Dynamic case will be selected to register indexing. 1907 return true; 1908 1909 Register Dst = MI.getOperand(0).getReg(); 1910 Register Vec = MI.getOperand(1).getReg(); 1911 1912 LLT VecTy = MRI.getType(Vec); 1913 LLT EltTy = VecTy.getElementType(); 1914 assert(EltTy == MRI.getType(Dst)); 1915 1916 if (IdxVal->Value < VecTy.getNumElements()) 1917 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1918 else 1919 B.buildUndef(Dst); 1920 1921 MI.eraseFromParent(); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 // TODO: Should move some of this into LegalizerHelper. 1929 1930 // TODO: Promote dynamic indexing of s16 to s32 1931 1932 // FIXME: Artifact combiner probably should have replaced the truncated 1933 // constant before this, so we shouldn't need 1934 // getConstantVRegValWithLookThrough. 1935 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1936 MI.getOperand(3).getReg(), MRI); 1937 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1938 return true; 1939 1940 Register Dst = MI.getOperand(0).getReg(); 1941 Register Vec = MI.getOperand(1).getReg(); 1942 Register Ins = MI.getOperand(2).getReg(); 1943 1944 LLT VecTy = MRI.getType(Vec); 1945 LLT EltTy = VecTy.getElementType(); 1946 assert(EltTy == MRI.getType(Ins)); 1947 1948 if (IdxVal->Value < VecTy.getNumElements()) 1949 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1950 else 1951 B.buildUndef(Dst); 1952 1953 MI.eraseFromParent(); 1954 return true; 1955 } 1956 1957 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1958 MachineInstr &MI, MachineRegisterInfo &MRI, 1959 MachineIRBuilder &B) const { 1960 const LLT V2S16 = LLT::vector(2, 16); 1961 1962 Register Dst = MI.getOperand(0).getReg(); 1963 Register Src0 = MI.getOperand(1).getReg(); 1964 LLT DstTy = MRI.getType(Dst); 1965 LLT SrcTy = MRI.getType(Src0); 1966 1967 if (SrcTy == V2S16 && DstTy == V2S16 && 1968 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1969 return true; 1970 1971 MachineIRBuilder HelperBuilder(MI); 1972 GISelObserverWrapper DummyObserver; 1973 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1974 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1975 } 1976 1977 bool AMDGPULegalizerInfo::legalizeSinCos( 1978 MachineInstr &MI, MachineRegisterInfo &MRI, 1979 MachineIRBuilder &B) const { 1980 1981 Register DstReg = MI.getOperand(0).getReg(); 1982 Register SrcReg = MI.getOperand(1).getReg(); 1983 LLT Ty = MRI.getType(DstReg); 1984 unsigned Flags = MI.getFlags(); 1985 1986 Register TrigVal; 1987 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1988 if (ST.hasTrigReducedRange()) { 1989 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1990 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1991 .addUse(MulVal.getReg(0)) 1992 .setMIFlags(Flags).getReg(0); 1993 } else 1994 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1995 1996 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1997 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1998 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1999 .addUse(TrigVal) 2000 .setMIFlags(Flags); 2001 MI.eraseFromParent(); 2002 return true; 2003 } 2004 2005 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2006 MachineIRBuilder &B, 2007 const GlobalValue *GV, 2008 int64_t Offset, 2009 unsigned GAFlags) const { 2010 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2011 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2012 // to the following code sequence: 2013 // 2014 // For constant address space: 2015 // s_getpc_b64 s[0:1] 2016 // s_add_u32 s0, s0, $symbol 2017 // s_addc_u32 s1, s1, 0 2018 // 2019 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2020 // a fixup or relocation is emitted to replace $symbol with a literal 2021 // constant, which is a pc-relative offset from the encoding of the $symbol 2022 // operand to the global variable. 
2023 // 2024 // For global address space: 2025 // s_getpc_b64 s[0:1] 2026 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2027 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2028 // 2029 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2030 // fixups or relocations are emitted to replace $symbol@*@lo and 2031 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2032 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2033 // operand to the global variable. 2034 // 2035 // What we want here is an offset from the value returned by s_getpc 2036 // (which is the address of the s_add_u32 instruction) to the global 2037 // variable, but since the encoding of $symbol starts 4 bytes after the start 2038 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2039 // small. This requires us to add 4 to the global variable offset in order to 2040 // compute the correct address. 2041 2042 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2043 2044 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2045 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2046 2047 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2048 .addDef(PCReg); 2049 2050 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2051 if (GAFlags == SIInstrInfo::MO_NONE) 2052 MIB.addImm(0); 2053 else 2054 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2055 2056 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2057 2058 if (PtrTy.getSizeInBits() == 32) 2059 B.buildExtract(DstReg, PCReg, 0); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2064 MachineInstr &MI, MachineRegisterInfo &MRI, 2065 MachineIRBuilder &B) const { 2066 Register DstReg = MI.getOperand(0).getReg(); 2067 LLT Ty = MRI.getType(DstReg); 2068 unsigned AS = Ty.getAddressSpace(); 2069 2070 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2071 MachineFunction &MF = B.getMF(); 2072 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2073 2074 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2075 if (!MFI->isEntryFunction()) { 2076 const Function &Fn = MF.getFunction(); 2077 DiagnosticInfoUnsupported BadLDSDecl( 2078 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2079 DS_Warning); 2080 Fn.getContext().diagnose(BadLDSDecl); 2081 2082 // We currently don't have a way to correctly allocate LDS objects that 2083 // aren't directly associated with a kernel. We do force inlining of 2084 // functions that use local objects. However, if these dead functions are 2085 // not eliminated, we don't want a compile time error. Just emit a warning 2086 // and a trap, since there should be no callable path here. 2087 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2088 B.buildUndef(DstReg); 2089 MI.eraseFromParent(); 2090 return true; 2091 } 2092 2093 // TODO: We could emit code to handle the initialization somewhere. 
2094 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2095 const SITargetLowering *TLI = ST.getTargetLowering(); 2096 if (!TLI->shouldUseLDSConstAddress(GV)) { 2097 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2098 return true; // Leave in place; 2099 } 2100 2101 B.buildConstant( 2102 DstReg, 2103 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 const Function &Fn = MF.getFunction(); 2109 DiagnosticInfoUnsupported BadInit( 2110 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2111 Fn.getContext().diagnose(BadInit); 2112 return true; 2113 } 2114 2115 const SITargetLowering *TLI = ST.getTargetLowering(); 2116 2117 if (TLI->shouldEmitFixup(GV)) { 2118 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2119 MI.eraseFromParent(); 2120 return true; 2121 } 2122 2123 if (TLI->shouldEmitPCReloc(GV)) { 2124 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2125 MI.eraseFromParent(); 2126 return true; 2127 } 2128 2129 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2130 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2131 2132 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2133 MachinePointerInfo::getGOT(MF), 2134 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2135 MachineMemOperand::MOInvariant, 2136 8 /*Size*/, Align(8)); 2137 2138 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2139 2140 if (Ty.getSizeInBits() == 32) { 2141 // Truncate if this is a 32-bit constant adrdess. 2142 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2143 B.buildExtract(DstReg, Load, 0); 2144 } else 2145 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2146 2147 MI.eraseFromParent(); 2148 return true; 2149 } 2150 2151 bool AMDGPULegalizerInfo::legalizeLoad( 2152 MachineInstr &MI, MachineRegisterInfo &MRI, 2153 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2154 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2155 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2156 Observer.changingInstr(MI); 2157 MI.getOperand(1).setReg(Cast.getReg(0)); 2158 Observer.changedInstr(MI); 2159 return true; 2160 } 2161 2162 bool AMDGPULegalizerInfo::legalizeFMad( 2163 MachineInstr &MI, MachineRegisterInfo &MRI, 2164 MachineIRBuilder &B) const { 2165 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2166 assert(Ty.isScalar()); 2167 2168 MachineFunction &MF = B.getMF(); 2169 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2170 2171 // TODO: Always legal with future ftz flag. 2172 // FIXME: Do we need just output? 
2173 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2174 return true; 2175 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2176 return true; 2177 2178 MachineIRBuilder HelperBuilder(MI); 2179 GISelObserverWrapper DummyObserver; 2180 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2181 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2182 } 2183 2184 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2185 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2186 Register DstReg = MI.getOperand(0).getReg(); 2187 Register PtrReg = MI.getOperand(1).getReg(); 2188 Register CmpVal = MI.getOperand(2).getReg(); 2189 Register NewVal = MI.getOperand(3).getReg(); 2190 2191 assert(SITargetLowering::isFlatGlobalAddrSpace( 2192 MRI.getType(PtrReg).getAddressSpace()) && 2193 "this should not have been custom lowered"); 2194 2195 LLT ValTy = MRI.getType(CmpVal); 2196 LLT VecTy = LLT::vector(2, ValTy); 2197 2198 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2199 2200 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2201 .addDef(DstReg) 2202 .addUse(PtrReg) 2203 .addUse(PackedVal) 2204 .setMemRefs(MI.memoperands()); 2205 2206 MI.eraseFromParent(); 2207 return true; 2208 } 2209 2210 bool AMDGPULegalizerInfo::legalizeFlog( 2211 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2212 Register Dst = MI.getOperand(0).getReg(); 2213 Register Src = MI.getOperand(1).getReg(); 2214 LLT Ty = B.getMRI()->getType(Dst); 2215 unsigned Flags = MI.getFlags(); 2216 2217 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2218 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2219 2220 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2221 MI.eraseFromParent(); 2222 return true; 2223 } 2224 2225 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2226 MachineIRBuilder &B) const { 2227 Register Dst = MI.getOperand(0).getReg(); 2228 Register Src = MI.getOperand(1).getReg(); 2229 unsigned Flags = MI.getFlags(); 2230 LLT Ty = B.getMRI()->getType(Dst); 2231 2232 auto K = B.buildFConstant(Ty, numbers::log2e); 2233 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2234 B.buildFExp2(Dst, Mul, Flags); 2235 MI.eraseFromParent(); 2236 return true; 2237 } 2238 2239 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2240 MachineIRBuilder &B) const { 2241 Register Dst = MI.getOperand(0).getReg(); 2242 Register Src0 = MI.getOperand(1).getReg(); 2243 Register Src1 = MI.getOperand(2).getReg(); 2244 unsigned Flags = MI.getFlags(); 2245 LLT Ty = B.getMRI()->getType(Dst); 2246 const LLT S16 = LLT::scalar(16); 2247 const LLT S32 = LLT::scalar(32); 2248 2249 if (Ty == S32) { 2250 auto Log = B.buildFLog2(S32, Src0, Flags); 2251 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2252 .addUse(Log.getReg(0)) 2253 .addUse(Src1) 2254 .setMIFlags(Flags); 2255 B.buildFExp2(Dst, Mul, Flags); 2256 } else if (Ty == S16) { 2257 // There's no f16 fmul_legacy, so we need to convert for it. 
2258 auto Log = B.buildFLog2(S16, Src0, Flags); 2259 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2260 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2261 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2262 .addUse(Ext0.getReg(0)) 2263 .addUse(Ext1.getReg(0)) 2264 .setMIFlags(Flags); 2265 2266 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2267 } else 2268 return false; 2269 2270 MI.eraseFromParent(); 2271 return true; 2272 } 2273 2274 // Find a source register, ignoring any possible source modifiers. 2275 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2276 Register ModSrc = OrigSrc; 2277 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2278 ModSrc = SrcFNeg->getOperand(1).getReg(); 2279 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2280 ModSrc = SrcFAbs->getOperand(1).getReg(); 2281 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2282 ModSrc = SrcFAbs->getOperand(1).getReg(); 2283 return ModSrc; 2284 } 2285 2286 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2287 MachineRegisterInfo &MRI, 2288 MachineIRBuilder &B) const { 2289 2290 const LLT S1 = LLT::scalar(1); 2291 const LLT S64 = LLT::scalar(64); 2292 Register Dst = MI.getOperand(0).getReg(); 2293 Register OrigSrc = MI.getOperand(1).getReg(); 2294 unsigned Flags = MI.getFlags(); 2295 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2296 "this should not have been custom lowered"); 2297 2298 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2299 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2300 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2301 // V_FRACT bug is: 2302 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2303 // 2304 // Convert floor(x) to (x - fract(x)) 2305 2306 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2307 .addUse(OrigSrc) 2308 .setMIFlags(Flags); 2309 2310 // Give source modifier matching some assistance before obscuring a foldable 2311 // pattern. 2312 2313 // TODO: We can avoid the neg on the fract? The input sign to fract 2314 // shouldn't matter? 2315 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2316 2317 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2318 2319 Register Min = MRI.createGenericVirtualRegister(S64); 2320 2321 // We don't need to concern ourselves with the snan handling difference, so 2322 // use the one which will directly select. 2323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2324 if (MFI->getMode().IEEE) 2325 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2326 else 2327 B.buildFMinNum(Min, Fract, Const, Flags); 2328 2329 Register CorrectedFract = Min; 2330 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2331 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2332 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2333 } 2334 2335 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2336 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2337 2338 MI.eraseFromParent(); 2339 return true; 2340 } 2341 2342 // Turn an illegal packed v2s16 build vector into bit operations. 2343 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2344 bool AMDGPULegalizerInfo::legalizeBuildVector( 2345 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2346 Register Dst = MI.getOperand(0).getReg(); 2347 const LLT S32 = LLT::scalar(32); 2348 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2349 2350 Register Src0 = MI.getOperand(1).getReg(); 2351 Register Src1 = MI.getOperand(2).getReg(); 2352 assert(MRI.getType(Src0) == LLT::scalar(16)); 2353 2354 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2355 B.buildBitcast(Dst, Merge); 2356 2357 MI.eraseFromParent(); 2358 return true; 2359 } 2360 2361 // Return the use branch instruction, otherwise null if the usage is invalid. 2362 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2363 MachineRegisterInfo &MRI, 2364 MachineInstr *&Br, 2365 MachineBasicBlock *&UncondBrTarget) { 2366 Register CondDef = MI.getOperand(0).getReg(); 2367 if (!MRI.hasOneNonDBGUse(CondDef)) 2368 return nullptr; 2369 2370 MachineBasicBlock *Parent = MI.getParent(); 2371 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2372 if (UseMI.getParent() != Parent || 2373 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2374 return nullptr; 2375 2376 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2377 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2378 if (Next == Parent->end()) { 2379 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2380 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2381 return nullptr; 2382 UncondBrTarget = &*NextMBB; 2383 } else { 2384 if (Next->getOpcode() != AMDGPU::G_BR) 2385 return nullptr; 2386 Br = &*Next; 2387 UncondBrTarget = Br->getOperand(0).getMBB(); 2388 } 2389 2390 return &UseMI; 2391 } 2392 2393 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2394 MachineRegisterInfo &MRI, 2395 Register LiveIn, 2396 Register PhyReg) const { 2397 assert(PhyReg.isPhysical() && "Physical register expected"); 2398 2399 // Insert the live-in copy, if required, by defining destination virtual 2400 // register. 2401 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2402 if (!MRI.getVRegDef(LiveIn)) { 2403 // FIXME: Should have scoped insert pt 2404 MachineBasicBlock &OrigInsBB = B.getMBB(); 2405 auto OrigInsPt = B.getInsertPt(); 2406 2407 MachineBasicBlock &EntryMBB = B.getMF().front(); 2408 EntryMBB.addLiveIn(PhyReg); 2409 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2410 B.buildCopy(LiveIn, PhyReg); 2411 2412 B.setInsertPt(OrigInsBB, OrigInsPt); 2413 } 2414 2415 return LiveIn; 2416 } 2417 2418 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2419 MachineRegisterInfo &MRI, 2420 Register PhyReg, LLT Ty, 2421 bool InsertLiveInCopy) const { 2422 assert(PhyReg.isPhysical() && "Physical register expected"); 2423 2424 // Get or create virtual live-in regester 2425 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2426 if (!LiveIn) { 2427 LiveIn = MRI.createGenericVirtualRegister(Ty); 2428 MRI.addLiveIn(PhyReg, LiveIn); 2429 } 2430 2431 // When the actual true copy required is from virtual register to physical 2432 // register (to be inserted later), live-in copy insertion from physical 2433 // to register virtual register is not required 2434 if (!InsertLiveInCopy) 2435 return LiveIn; 2436 2437 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2438 } 2439 2440 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2441 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2442 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2443 const ArgDescriptor *Arg; 2444 const TargetRegisterClass *RC; 2445 LLT ArgTy; 2446 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); 2447 if (!Arg) { 2448 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2449 return nullptr; 2450 } 2451 return Arg; 2452 } 2453 2454 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2455 const ArgDescriptor *Arg) const { 2456 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2457 return false; // TODO: Handle these 2458 2459 Register SrcReg = Arg->getRegister(); 2460 assert(SrcReg.isPhysical() && "Physical register expected"); 2461 assert(DstReg.isVirtual() && "Virtual register expected"); 2462 2463 MachineRegisterInfo &MRI = *B.getMRI(); 2464 2465 LLT Ty = MRI.getType(DstReg); 2466 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2467 2468 if (Arg->isMasked()) { 2469 // TODO: Should we try to emit this once in the entry block? 
2470 const LLT S32 = LLT::scalar(32); 2471 const unsigned Mask = Arg->getMask(); 2472 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2473 2474 Register AndMaskSrc = LiveIn; 2475 2476 if (Shift != 0) { 2477 auto ShiftAmt = B.buildConstant(S32, Shift); 2478 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2479 } 2480 2481 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2482 } else { 2483 B.buildCopy(DstReg, LiveIn); 2484 } 2485 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2490 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2491 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2492 2493 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2494 if (!Arg) 2495 return false; 2496 2497 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2498 return false; 2499 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2505 MachineRegisterInfo &MRI, 2506 MachineIRBuilder &B) const { 2507 Register Dst = MI.getOperand(0).getReg(); 2508 LLT DstTy = MRI.getType(Dst); 2509 LLT S16 = LLT::scalar(16); 2510 LLT S32 = LLT::scalar(32); 2511 LLT S64 = LLT::scalar(64); 2512 2513 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2514 return true; 2515 2516 if (DstTy == S16) 2517 return legalizeFDIV16(MI, MRI, B); 2518 if (DstTy == S32) 2519 return legalizeFDIV32(MI, MRI, B); 2520 if (DstTy == S64) 2521 return legalizeFDIV64(MI, MRI, B); 2522 2523 return false; 2524 } 2525 2526 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2527 const LLT S32 = LLT::scalar(32); 2528 2529 auto Cvt0 = B.buildUITOFP(S32, Src); 2530 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2531 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2532 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2533 return B.buildFPTOUI(S32, Mul).getReg(0); 2534 } 2535 2536 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2537 Register DstReg, 2538 Register Num, 2539 Register Den, 2540 bool IsDiv) const { 2541 const LLT S1 = LLT::scalar(1); 2542 const LLT S32 = LLT::scalar(32); 2543 2544 // RCP = URECIP(Den) = 2^32 / Den + e 2545 // e is rounding error. 2546 auto RCP = buildDivRCP(B, Den); 2547 2548 // RCP_LO = mul(RCP, Den) 2549 auto RCP_LO = B.buildMul(S32, RCP, Den); 2550 2551 // RCP_HI = mulhu (RCP, Den) */ 2552 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2553 2554 // NEG_RCP_LO = -RCP_LO 2555 auto Zero = B.buildConstant(S32, 0); 2556 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2557 2558 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2559 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2560 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2561 2562 // Calculate the rounding error from the URECIP instruction 2563 // E = mulhu(ABS_RCP_LO, RCP) 2564 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2565 2566 // RCP_A_E = RCP + E 2567 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2568 2569 // RCP_S_E = RCP - E 2570 auto RCP_S_E = B.buildSub(S32, RCP, E); 2571 2572 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E) 2573 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2574 2575 // Quotient = mulhu(Tmp0, Num)stmp 2576 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2577 2578 // Num_S_Remainder = Quotient * Den 2579 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2580 2581 // Remainder = Num - Num_S_Remainder 2582 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2583 2584 // Remainder_GE_Den = Remainder >= Den 2585 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2586 2587 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2588 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2589 Num, Num_S_Remainder); 2590 2591 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2592 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2593 2594 // Calculate Division result: 2595 2596 // Quotient_A_One = Quotient + 1 2597 auto One = B.buildConstant(S32, 1); 2598 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2599 2600 // Quotient_S_One = Quotient - 1 2601 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2602 2603 // Div = (Tmp1 ? Quotient_A_One : Quotient) 2604 auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient); 2605 2606 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2607 if (IsDiv) { 2608 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2609 } else { 2610 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2611 2612 // Calculate Rem result: 2613 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2614 2615 // Remainder_A_Den = Remainder + Den 2616 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2617 2618 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2619 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2620 2621 // Rem = (Remainder_GE_Zero ? 
Rem : Remainder_A_Den) 2622 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2623 } 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2627 MachineRegisterInfo &MRI, 2628 MachineIRBuilder &B) const { 2629 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 Register Num = MI.getOperand(1).getReg(); 2632 Register Den = MI.getOperand(2).getReg(); 2633 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2634 MI.eraseFromParent(); 2635 return true; 2636 } 2637 2638 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2639 // 2640 // Return lo, hi of result 2641 // 2642 // %cvt.lo = G_UITOFP Val.lo 2643 // %cvt.hi = G_UITOFP Val.hi 2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2647 // %mul2 = G_FMUL %mul1, 2**(-32) 2648 // %trunc = G_INTRINSIC_TRUNC %mul2 2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2652 Register Val) { 2653 const LLT S32 = LLT::scalar(32); 2654 auto Unmerge = B.buildUnmerge(S32, Val); 2655 2656 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2657 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2658 2659 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2660 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2661 2662 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2663 auto Mul1 = 2664 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2665 2666 // 2**(-32) 2667 auto Mul2 = 2668 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2669 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2670 2671 // -(2**32) 2672 auto Mad2 = B.buildFMAD(S32, Trunc, 2673 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2674 2675 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2676 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2677 2678 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2679 } 2680 2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2682 Register DstReg, 2683 Register Numer, 2684 Register Denom, 2685 bool IsDiv) const { 2686 const LLT S32 = LLT::scalar(32); 2687 const LLT S64 = LLT::scalar(64); 2688 const LLT S1 = LLT::scalar(1); 2689 Register RcpLo, RcpHi; 2690 2691 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2692 2693 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2694 2695 auto Zero64 = B.buildConstant(S64, 0); 2696 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2697 2698 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2699 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2700 2701 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2702 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2703 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2704 2705 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2706 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2707 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2708 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2709 2710 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2711 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2712 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2713 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2714 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2715 2716 auto Zero32 = B.buildConstant(S32, 0); 2717 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2718 auto Add2_HiC = 
2719 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2720 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2721 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2722 2723 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2724 Register NumerLo = UnmergeNumer.getReg(0); 2725 Register NumerHi = UnmergeNumer.getReg(1); 2726 2727 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2728 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2729 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2730 Register Mul3_Lo = UnmergeMul3.getReg(0); 2731 Register Mul3_Hi = UnmergeMul3.getReg(1); 2732 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2733 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2734 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2735 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2736 2737 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2738 Register DenomLo = UnmergeDenom.getReg(0); 2739 Register DenomHi = UnmergeDenom.getReg(1); 2740 2741 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2742 auto C1 = B.buildSExt(S32, CmpHi); 2743 2744 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2745 auto C2 = B.buildSExt(S32, CmpLo); 2746 2747 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2748 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2749 2750 // TODO: Here and below portions of the code can be enclosed into if/endif. 2751 // Currently control flow is unconditional and we have 4 selects after 2752 // potential endif to substitute PHIs. 2753 2754 // if C3 != 0 ... 2755 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2756 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2757 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2758 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2759 2760 auto One64 = B.buildConstant(S64, 1); 2761 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2762 2763 auto C4 = 2764 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2765 auto C5 = 2766 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2767 auto C6 = B.buildSelect( 2768 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2769 2770 // if (C6 != 0) 2771 auto Add4 = B.buildAdd(S64, Add3, One64); 2772 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2773 2774 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2775 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2776 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2777 2778 // endif C6 2779 // endif C3 2780 2781 if (IsDiv) { 2782 auto Sel1 = B.buildSelect( 2783 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2784 B.buildSelect(DstReg, 2785 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2786 } else { 2787 auto Sel2 = B.buildSelect( 2788 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2789 B.buildSelect(DstReg, 2790 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2791 } 2792 } 2793 2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2795 MachineRegisterInfo &MRI, 2796 MachineIRBuilder &B) const { 2797 const LLT S64 = LLT::scalar(64); 2798 const LLT S32 = LLT::scalar(32); 2799 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2800 Register DstReg = MI.getOperand(0).getReg(); 2801 Register Num = MI.getOperand(1).getReg(); 2802 Register Den = MI.getOperand(2).getReg(); 2803 LLT Ty = 
MRI.getType(DstReg); 2804 2805 if (Ty == S32) 2806 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2807 else if (Ty == S64) 2808 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2809 else 2810 return false; 2811 2812 MI.eraseFromParent(); 2813 return true; 2814 2815 } 2816 2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 const LLT S64 = LLT::scalar(64); 2821 const LLT S32 = LLT::scalar(32); 2822 2823 Register DstReg = MI.getOperand(0).getReg(); 2824 const LLT Ty = MRI.getType(DstReg); 2825 if (Ty != S32 && Ty != S64) 2826 return false; 2827 2828 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2829 2830 Register LHS = MI.getOperand(1).getReg(); 2831 Register RHS = MI.getOperand(2).getReg(); 2832 2833 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2834 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2835 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2836 2837 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2838 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2839 2840 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2841 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2842 2843 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2844 if (Ty == S32) 2845 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2846 else 2847 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2848 2849 Register Sign; 2850 if (IsDiv) 2851 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2852 else 2853 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2854 2855 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2856 B.buildSub(DstReg, UDivRem, Sign); 2857 2858 MI.eraseFromParent(); 2859 return true; 2860 } 2861 2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2863 MachineRegisterInfo &MRI, 2864 MachineIRBuilder &B) const { 2865 Register Res = MI.getOperand(0).getReg(); 2866 Register LHS = MI.getOperand(1).getReg(); 2867 Register RHS = MI.getOperand(2).getReg(); 2868 2869 uint16_t Flags = MI.getFlags(); 2870 2871 LLT ResTy = MRI.getType(Res); 2872 LLT S32 = LLT::scalar(32); 2873 LLT S64 = LLT::scalar(64); 2874 2875 const MachineFunction &MF = B.getMF(); 2876 bool Unsafe = 2877 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2878 2879 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2880 return false; 2881 2882 if (!Unsafe && ResTy == S32 && 2883 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2884 return false; 2885 2886 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2887 // 1 / x -> RCP(x) 2888 if (CLHS->isExactlyValue(1.0)) { 2889 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2890 .addUse(RHS) 2891 .setMIFlags(Flags); 2892 2893 MI.eraseFromParent(); 2894 return true; 2895 } 2896 2897 // -1 / x -> RCP( FNEG(x) ) 2898 if (CLHS->isExactlyValue(-1.0)) { 2899 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2900 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2901 .addUse(FNeg.getReg(0)) 2902 .setMIFlags(Flags); 2903 2904 MI.eraseFromParent(); 2905 return true; 2906 } 2907 } 2908 2909 // x / y -> x * (1.0 / y) 2910 if (Unsafe) { 2911 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2912 .addUse(RHS) 2913 .setMIFlags(Flags); 2914 B.buildFMul(Res, LHS, RCP, Flags); 2915 2916 MI.eraseFromParent(); 2917 return true; 2918 } 2919 2920 return false; 2921 } 2922 2923 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2924 MachineRegisterInfo &MRI, 2925 MachineIRBuilder &B) const { 
2926 Register Res = MI.getOperand(0).getReg(); 2927 Register LHS = MI.getOperand(1).getReg(); 2928 Register RHS = MI.getOperand(2).getReg(); 2929 2930 uint16_t Flags = MI.getFlags(); 2931 2932 LLT S16 = LLT::scalar(16); 2933 LLT S32 = LLT::scalar(32); 2934 2935 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2936 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2937 2938 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2939 .addUse(RHSExt.getReg(0)) 2940 .setMIFlags(Flags); 2941 2942 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2943 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2944 2945 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2946 .addUse(RDst.getReg(0)) 2947 .addUse(RHS) 2948 .addUse(LHS) 2949 .setMIFlags(Flags); 2950 2951 MI.eraseFromParent(); 2952 return true; 2953 } 2954 2955 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2956 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2957 static void toggleSPDenormMode(bool Enable, 2958 MachineIRBuilder &B, 2959 const GCNSubtarget &ST, 2960 AMDGPU::SIModeRegisterDefaults Mode) { 2961 // Set SP denorm mode to this value. 2962 unsigned SPDenormMode = 2963 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2964 2965 if (ST.hasDenormModeInst()) { 2966 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2967 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2968 2969 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2970 B.buildInstr(AMDGPU::S_DENORM_MODE) 2971 .addImm(NewDenormModeValue); 2972 2973 } else { 2974 // Select FP32 bit field in mode register. 2975 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2976 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2977 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2978 2979 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2980 .addImm(SPDenormMode) 2981 .addImm(SPDenormModeBitField); 2982 } 2983 } 2984 2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2986 MachineRegisterInfo &MRI, 2987 MachineIRBuilder &B) const { 2988 Register Res = MI.getOperand(0).getReg(); 2989 Register LHS = MI.getOperand(1).getReg(); 2990 Register RHS = MI.getOperand(2).getReg(); 2991 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2992 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2993 2994 uint16_t Flags = MI.getFlags(); 2995 2996 LLT S32 = LLT::scalar(32); 2997 LLT S1 = LLT::scalar(1); 2998 2999 auto One = B.buildFConstant(S32, 1.0f); 3000 3001 auto DenominatorScaled = 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3003 .addUse(LHS) 3004 .addUse(RHS) 3005 .addImm(0) 3006 .setMIFlags(Flags); 3007 auto NumeratorScaled = 3008 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3009 .addUse(LHS) 3010 .addUse(RHS) 3011 .addImm(1) 3012 .setMIFlags(Flags); 3013 3014 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3015 .addUse(DenominatorScaled.getReg(0)) 3016 .setMIFlags(Flags); 3017 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3018 3019 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3020 // aren't modeled as reading it. 
3021 if (!Mode.allFP32Denormals()) 3022 toggleSPDenormMode(true, B, ST, Mode); 3023 3024 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3025 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3026 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3027 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3028 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3029 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3030 3031 if (!Mode.allFP32Denormals()) 3032 toggleSPDenormMode(false, B, ST, Mode); 3033 3034 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3035 .addUse(Fma4.getReg(0)) 3036 .addUse(Fma1.getReg(0)) 3037 .addUse(Fma3.getReg(0)) 3038 .addUse(NumeratorScaled.getReg(1)) 3039 .setMIFlags(Flags); 3040 3041 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3042 .addUse(Fmas.getReg(0)) 3043 .addUse(RHS) 3044 .addUse(LHS) 3045 .setMIFlags(Flags); 3046 3047 MI.eraseFromParent(); 3048 return true; 3049 } 3050 3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3052 MachineRegisterInfo &MRI, 3053 MachineIRBuilder &B) const { 3054 Register Res = MI.getOperand(0).getReg(); 3055 Register LHS = MI.getOperand(1).getReg(); 3056 Register RHS = MI.getOperand(2).getReg(); 3057 3058 uint16_t Flags = MI.getFlags(); 3059 3060 LLT S64 = LLT::scalar(64); 3061 LLT S1 = LLT::scalar(1); 3062 3063 auto One = B.buildFConstant(S64, 1.0); 3064 3065 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3066 .addUse(LHS) 3067 .addUse(RHS) 3068 .addImm(0) 3069 .setMIFlags(Flags); 3070 3071 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3072 3073 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3074 .addUse(DivScale0.getReg(0)) 3075 .setMIFlags(Flags); 3076 3077 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3078 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3079 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3080 3081 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3082 .addUse(LHS) 3083 .addUse(RHS) 3084 .addImm(1) 3085 .setMIFlags(Flags); 3086 3087 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3088 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3089 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3090 3091 Register Scale; 3092 if (!ST.hasUsableDivScaleConditionOutput()) { 3093 // Workaround a hardware bug on SI where the condition output from div_scale 3094 // is not usable. 
3095 3096 LLT S32 = LLT::scalar(32); 3097 3098 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3099 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3100 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3101 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3102 3103 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3104 Scale1Unmerge.getReg(1)); 3105 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3106 Scale0Unmerge.getReg(1)); 3107 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3108 } else { 3109 Scale = DivScale1.getReg(1); 3110 } 3111 3112 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3113 .addUse(Fma4.getReg(0)) 3114 .addUse(Fma3.getReg(0)) 3115 .addUse(Mul.getReg(0)) 3116 .addUse(Scale) 3117 .setMIFlags(Flags); 3118 3119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3120 .addUse(Fmas.getReg(0)) 3121 .addUse(RHS) 3122 .addUse(LHS) 3123 .setMIFlags(Flags); 3124 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B) const { 3132 Register Res = MI.getOperand(0).getReg(); 3133 Register LHS = MI.getOperand(2).getReg(); 3134 Register RHS = MI.getOperand(3).getReg(); 3135 uint16_t Flags = MI.getFlags(); 3136 3137 LLT S32 = LLT::scalar(32); 3138 LLT S1 = LLT::scalar(1); 3139 3140 auto Abs = B.buildFAbs(S32, RHS, Flags); 3141 const APFloat C0Val(1.0f); 3142 3143 auto C0 = B.buildConstant(S32, 0x6f800000); 3144 auto C1 = B.buildConstant(S32, 0x2f800000); 3145 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3146 3147 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3148 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3149 3150 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3151 3152 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3153 .addUse(Mul0.getReg(0)) 3154 .setMIFlags(Flags); 3155 3156 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3157 3158 B.buildFMul(Res, Sel, Mul1, Flags); 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3165 MachineRegisterInfo &MRI, 3166 MachineIRBuilder &B) const { 3167 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3168 if (!MFI->isEntryFunction()) { 3169 return legalizePreloadedArgIntrin(MI, MRI, B, 3170 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3171 } 3172 3173 uint64_t Offset = 3174 ST.getTargetLowering()->getImplicitParameterOffset( 3175 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3176 Register DstReg = MI.getOperand(0).getReg(); 3177 LLT DstTy = MRI.getType(DstReg); 3178 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3179 3180 const ArgDescriptor *Arg; 3181 const TargetRegisterClass *RC; 3182 LLT ArgTy; 3183 std::tie(Arg, RC, ArgTy) = 3184 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3185 if (!Arg) 3186 return false; 3187 3188 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3189 if (!loadInputValue(KernargPtrReg, B, Arg)) 3190 return false; 3191 3192 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3193 MI.eraseFromParent(); 3194 return true; 3195 } 3196 3197 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3198 MachineRegisterInfo &MRI, 3199 MachineIRBuilder &B, 3200 unsigned AddrSpace) const { 3201 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3202 auto Hi32 = 
B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3203 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3209 // offset (the offset that is included in bounds checking and swizzling, to be 3210 // split between the instruction's voffset and immoffset fields) and soffset 3211 // (the offset that is excluded from bounds checking and swizzling, to go in 3212 // the instruction's soffset field). This function takes the first kind of 3213 // offset and figures out how to split it between voffset and immoffset. 3214 std::tuple<Register, unsigned, unsigned> 3215 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3216 Register OrigOffset) const { 3217 const unsigned MaxImm = 4095; 3218 Register BaseReg; 3219 unsigned TotalConstOffset; 3220 MachineInstr *OffsetDef; 3221 const LLT S32 = LLT::scalar(32); 3222 3223 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3224 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3225 3226 unsigned ImmOffset = TotalConstOffset; 3227 3228 // If the immediate value is too big for the immoffset field, put the value 3229 // and -4096 into the immoffset field so that the value that is copied/added 3230 // for the voffset field is a multiple of 4096, and it stands more chance 3231 // of being CSEd with the copy/add for another similar load/store. 3232 // However, do not do that rounding down to a multiple of 4096 if that is a 3233 // negative number, as it appears to be illegal to have a negative offset 3234 // in the vgpr, even if adding the immediate offset makes it positive. 3235 unsigned Overflow = ImmOffset & ~MaxImm; 3236 ImmOffset -= Overflow; 3237 if ((int32_t)Overflow < 0) { 3238 Overflow += ImmOffset; 3239 ImmOffset = 0; 3240 } 3241 3242 if (Overflow != 0) { 3243 if (!BaseReg) { 3244 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3245 } else { 3246 auto OverflowVal = B.buildConstant(S32, Overflow); 3247 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3248 } 3249 } 3250 3251 if (!BaseReg) 3252 BaseReg = B.buildConstant(S32, 0).getReg(0); 3253 3254 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3255 } 3256 3257 /// Handle register layout difference for f16 images for some subtargets. 3258 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3259 MachineRegisterInfo &MRI, 3260 Register Reg) const { 3261 if (!ST.hasUnpackedD16VMem()) 3262 return Reg; 3263 3264 const LLT S16 = LLT::scalar(16); 3265 const LLT S32 = LLT::scalar(32); 3266 LLT StoreVT = MRI.getType(Reg); 3267 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3268 3269 auto Unmerge = B.buildUnmerge(S16, Reg); 3270 3271 SmallVector<Register, 4> WideRegs; 3272 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3273 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3274 3275 int NumElts = StoreVT.getNumElements(); 3276 3277 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3278 } 3279 3280 Register AMDGPULegalizerInfo::fixStoreSourceType( 3281 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3282 MachineRegisterInfo *MRI = B.getMRI(); 3283 LLT Ty = MRI->getType(VData); 3284 3285 const LLT S16 = LLT::scalar(16); 3286 3287 // Fixup illegal register types for i8 stores. 
3288 if (Ty == LLT::scalar(8) || Ty == S16) { 3289 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3290 return AnyExt; 3291 } 3292 3293 if (Ty.isVector()) { 3294 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3295 if (IsFormat) 3296 return handleD16VData(B, *MRI, VData); 3297 } 3298 } 3299 3300 return VData; 3301 } 3302 3303 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3304 MachineRegisterInfo &MRI, 3305 MachineIRBuilder &B, 3306 bool IsTyped, 3307 bool IsFormat) const { 3308 Register VData = MI.getOperand(1).getReg(); 3309 LLT Ty = MRI.getType(VData); 3310 LLT EltTy = Ty.getScalarType(); 3311 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3312 const LLT S32 = LLT::scalar(32); 3313 3314 VData = fixStoreSourceType(B, VData, IsFormat); 3315 Register RSrc = MI.getOperand(2).getReg(); 3316 3317 MachineMemOperand *MMO = *MI.memoperands_begin(); 3318 const int MemSize = MMO->getSize(); 3319 3320 unsigned ImmOffset; 3321 unsigned TotalOffset; 3322 3323 // The typed intrinsics add an immediate after the registers. 3324 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3325 3326 // The struct intrinsic variants add one additional operand over raw. 3327 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3328 Register VIndex; 3329 int OpOffset = 0; 3330 if (HasVIndex) { 3331 VIndex = MI.getOperand(3).getReg(); 3332 OpOffset = 1; 3333 } 3334 3335 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3336 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3337 3338 unsigned Format = 0; 3339 if (IsTyped) { 3340 Format = MI.getOperand(5 + OpOffset).getImm(); 3341 ++OpOffset; 3342 } 3343 3344 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3345 3346 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3347 if (TotalOffset != 0) 3348 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3349 3350 unsigned Opc; 3351 if (IsTyped) { 3352 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3353 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3354 } else if (IsFormat) { 3355 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3356 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3357 } else { 3358 switch (MemSize) { 3359 case 1: 3360 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3361 break; 3362 case 2: 3363 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3364 break; 3365 default: 3366 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3367 break; 3368 } 3369 } 3370 3371 if (!VIndex) 3372 VIndex = B.buildConstant(S32, 0).getReg(0); 3373 3374 auto MIB = B.buildInstr(Opc) 3375 .addUse(VData) // vdata 3376 .addUse(RSrc) // rsrc 3377 .addUse(VIndex) // vindex 3378 .addUse(VOffset) // voffset 3379 .addUse(SOffset) // soffset 3380 .addImm(ImmOffset); // offset(imm) 3381 3382 if (IsTyped) 3383 MIB.addImm(Format); 3384 3385 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3386 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3387 .addMemOperand(MMO); 3388 3389 MI.eraseFromParent(); 3390 return true; 3391 } 3392 3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3394 MachineRegisterInfo &MRI, 3395 MachineIRBuilder &B, 3396 bool IsFormat, 3397 bool IsTyped) const { 3398 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3399 MachineMemOperand *MMO = *MI.memoperands_begin(); 3400 const int MemSize = MMO->getSize(); 3401 const LLT S32 = LLT::scalar(32); 3402 3403 Register Dst = MI.getOperand(0).getReg(); 3404 Register RSrc = MI.getOperand(2).getReg(); 3405 3406 // The typed intrinsics add an immediate after the registers. 3407 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3408 3409 // The struct intrinsic variants add one additional operand over raw. 3410 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3411 Register VIndex; 3412 int OpOffset = 0; 3413 if (HasVIndex) { 3414 VIndex = MI.getOperand(3).getReg(); 3415 OpOffset = 1; 3416 } 3417 3418 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3419 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3420 3421 unsigned Format = 0; 3422 if (IsTyped) { 3423 Format = MI.getOperand(5 + OpOffset).getImm(); 3424 ++OpOffset; 3425 } 3426 3427 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3428 unsigned ImmOffset; 3429 unsigned TotalOffset; 3430 3431 LLT Ty = MRI.getType(Dst); 3432 LLT EltTy = Ty.getScalarType(); 3433 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3434 const bool Unpacked = ST.hasUnpackedD16VMem(); 3435 3436 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3437 if (TotalOffset != 0) 3438 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3439 3440 unsigned Opc; 3441 3442 if (IsTyped) { 3443 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3444 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3445 } else if (IsFormat) { 3446 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3447 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3448 } else { 3449 switch (MemSize) { 3450 case 1: 3451 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3452 break; 3453 case 2: 3454 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3455 break; 3456 default: 3457 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3458 break; 3459 } 3460 } 3461 3462 Register LoadDstReg; 3463 3464 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3465 LLT UnpackedTy = Ty.changeElementSize(32); 3466 3467 if (IsExtLoad) 3468 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3469 else if (Unpacked && IsD16 && Ty.isVector()) 3470 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3471 else 3472 LoadDstReg = Dst; 3473 3474 if (!VIndex) 3475 VIndex = B.buildConstant(S32, 0).getReg(0); 3476 3477 auto MIB = B.buildInstr(Opc) 3478 .addDef(LoadDstReg) // vdata 3479 .addUse(RSrc) // rsrc 3480 .addUse(VIndex) // vindex 3481 .addUse(VOffset) // voffset 3482 .addUse(SOffset) // soffset 3483 .addImm(ImmOffset); // offset(imm) 3484 3485 if (IsTyped) 3486 MIB.addImm(Format); 3487 3488 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3489 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3490 .addMemOperand(MMO); 3491 3492 if (LoadDstReg != Dst) { 3493 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3494 3495 // Widen result for extending loads was widened. 
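// (For an extending load the result was widened to s32 above, so truncate it
// back to the original destination type. For packed d16 vectors on unpacked
// subtargets, unmerge the s32 elements, truncate each to s16, and re-merge.)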
3496 if (IsExtLoad) 3497 B.buildTrunc(Dst, LoadDstReg); 3498 else { 3499 // Repack to original 16-bit vector result 3500 // FIXME: G_TRUNC should work, but legalization currently fails 3501 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3502 SmallVector<Register, 4> Repack; 3503 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3504 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3505 B.buildMerge(Dst, Repack); 3506 } 3507 } 3508 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3514 MachineIRBuilder &B, 3515 bool IsInc) const { 3516 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3517 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3518 B.buildInstr(Opc) 3519 .addDef(MI.getOperand(0).getReg()) 3520 .addUse(MI.getOperand(2).getReg()) 3521 .addUse(MI.getOperand(3).getReg()) 3522 .cloneMemRefs(MI); 3523 MI.eraseFromParent(); 3524 return true; 3525 } 3526 3527 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3528 switch (IntrID) { 3529 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3530 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3531 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3532 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3533 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3534 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3535 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3536 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3537 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3538 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3539 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3540 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3541 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3542 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3544 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3545 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3546 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3547 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3548 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3550 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3551 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3552 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3553 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3554 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3556 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3557 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3558 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3559 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3560 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3561 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3562 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3563 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3565 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3566 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3567 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3568 default: 3569 llvm_unreachable("unhandled atomic opcode"); 3570 } 3571 } 3572 3573 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3574 MachineIRBuilder &B, 3575 Intrinsic::ID IID) const { 3576 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3577 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3578 3579 Register Dst = MI.getOperand(0).getReg(); 3580 Register VData = MI.getOperand(2).getReg(); 3581 3582 Register CmpVal; 
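// cmpswap carries the compare value as an extra source ahead of rsrc, so the
// remaining operand indices are tracked through OpOffset.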
3583 int OpOffset = 0; 3584 3585 if (IsCmpSwap) { 3586 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3587 ++OpOffset; 3588 } 3589 3590 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3591 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3592 3593 // The struct intrinsic variants add one additional operand over raw. 3594 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3595 Register VIndex; 3596 if (HasVIndex) { 3597 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3598 ++OpOffset; 3599 } 3600 3601 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3602 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3603 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3604 3605 MachineMemOperand *MMO = *MI.memoperands_begin(); 3606 3607 unsigned ImmOffset; 3608 unsigned TotalOffset; 3609 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3610 if (TotalOffset != 0) 3611 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3612 3613 if (!VIndex) 3614 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3615 3616 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3617 .addDef(Dst) 3618 .addUse(VData); // vdata 3619 3620 if (IsCmpSwap) 3621 MIB.addReg(CmpVal); 3622 3623 MIB.addUse(RSrc) // rsrc 3624 .addUse(VIndex) // vindex 3625 .addUse(VOffset) // voffset 3626 .addUse(SOffset) // soffset 3627 .addImm(ImmOffset) // offset(imm) 3628 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3629 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3630 .addMemOperand(MMO); 3631 3632 MI.eraseFromParent(); 3633 return true; 3634 } 3635 3636 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3637 /// vector with s16 typed elements. 3638 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3639 SmallVectorImpl<Register> &PackedAddrs, 3640 int AddrIdx, int DimIdx, int EndIdx, 3641 int NumGradients) { 3642 const LLT S16 = LLT::scalar(16); 3643 const LLT V2S16 = LLT::vector(2, 16); 3644 3645 for (int I = AddrIdx; I < EndIdx; ++I) { 3646 MachineOperand &SrcOp = MI.getOperand(I); 3647 if (!SrcOp.isReg()) 3648 continue; // _L to _LZ may have eliminated this. 3649 3650 Register AddrReg = SrcOp.getReg(); 3651 3652 if (I < DimIdx) { 3653 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3654 PackedAddrs.push_back(AddrReg); 3655 } else { 3656 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3657 // derivatives dx/dh and dx/dv are packed with undef. 3658 if (((I + 1) >= EndIdx) || 3659 ((NumGradients / 2) % 2 == 1 && 3660 (I == DimIdx + (NumGradients / 2) - 1 || 3661 I == DimIdx + NumGradients - 1)) || 3662 // Check for _L to _LZ optimization 3663 !MI.getOperand(I + 1).isReg()) { 3664 PackedAddrs.push_back( 3665 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3666 .getReg(0)); 3667 } else { 3668 PackedAddrs.push_back( 3669 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3670 .getReg(0)); 3671 ++I; 3672 } 3673 } 3674 } 3675 } 3676 3677 /// Convert from separate vaddr components to a single vector address register, 3678 /// and replace the remaining operands with $noreg. 
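/// The leftover address operands are cleared to $noreg rather than erased so
/// that the intrinsic's fixed operand positions remain valid.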
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3680                                      int DimIdx, int NumVAddrs) {
3681   const LLT S32 = LLT::scalar(32);
3682 
3683   SmallVector<Register, 8> AddrRegs;
3684   for (int I = 0; I != NumVAddrs; ++I) {
3685     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3686     if (SrcOp.isReg()) {
3687       AddrRegs.push_back(SrcOp.getReg());
3688       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3689     }
3690   }
3691 
3692   int NumAddrRegs = AddrRegs.size();
3693   if (NumAddrRegs != 1) {
3694     // Round up to 8 elements for v5-v7
3695     // FIXME: Missing intermediate sized register classes and instructions.
3696     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3697       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3698       auto Undef = B.buildUndef(S32);
3699       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3700       NumAddrRegs = RoundedNumRegs;
3701     }
3702 
3703     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3704     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3705   }
3706 
3707   for (int I = 1; I != NumVAddrs; ++I) {
3708     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3709     if (SrcOp.isReg())
3710       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3711   }
3712 }
3713 
3714 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3715 ///
3716 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3717 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3718 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3719 /// registers.
3720 ///
3721 /// We don't want to directly select image instructions just yet, but also want
3722 /// to expose all register repacking to the legalizer/combiners. We also don't
3723 /// want a selected instruction entering RegBankSelect. In order to avoid
3724 /// defining a multitude of intermediate image instructions, directly hack on
3725 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3726 /// now unnecessary arguments with $noreg.
3727 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3728     MachineInstr &MI, MachineIRBuilder &B,
3729     GISelChangeObserver &Observer,
3730     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3731 
3732   const int NumDefs = MI.getNumExplicitDefs();
3733   bool IsTFE = NumDefs == 2;
3734   // We are only processing the operands of d16 image operations on subtargets
3735   // that use the unpacked register layout, or need to repack the TFE result.
3736 
3737   // TODO: Do we need to guard against already legalized intrinsics?
3738   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3739       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3740 
3741   MachineRegisterInfo *MRI = B.getMRI();
3742   const LLT S32 = LLT::scalar(32);
3743   const LLT S16 = LLT::scalar(16);
3744   const LLT V2S16 = LLT::vector(2, 16);
3745 
3746   // Index of first address argument
3747   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3748 
3749   int NumVAddrs, NumGradients;
3750   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3751   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3752       getDMaskIdx(BaseOpcode, NumDefs);
3753   unsigned DMask = 0;
3754 
3755   // Check for 16 bit addresses and pack if true.
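  // Gradients (if any) start at DimIdx, immediately after the extra arguments,
  // and the coordinate operands follow at DimIdx + NumGradients; sampling the
  // types at those two positions distinguishes 16-bit gradients (G16) from
  // 16-bit addresses (A16).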
3756 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3757 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3758 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3759 const bool IsG16 = GradTy == S16; 3760 const bool IsA16 = AddrTy == S16; 3761 3762 int DMaskLanes = 0; 3763 if (!BaseOpcode->Atomic) { 3764 DMask = MI.getOperand(DMaskIdx).getImm(); 3765 if (BaseOpcode->Gather4) { 3766 DMaskLanes = 4; 3767 } else if (DMask != 0) { 3768 DMaskLanes = countPopulation(DMask); 3769 } else if (!IsTFE && !BaseOpcode->Store) { 3770 // If dmask is 0, this is a no-op load. This can be eliminated. 3771 B.buildUndef(MI.getOperand(0)); 3772 MI.eraseFromParent(); 3773 return true; 3774 } 3775 } 3776 3777 Observer.changingInstr(MI); 3778 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3779 3780 unsigned NewOpcode = NumDefs == 0 ? 3781 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3782 3783 // Track that we legalized this 3784 MI.setDesc(B.getTII().get(NewOpcode)); 3785 3786 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3787 // dmask to be at least 1 otherwise the instruction will fail 3788 if (IsTFE && DMask == 0) { 3789 DMask = 0x1; 3790 DMaskLanes = 1; 3791 MI.getOperand(DMaskIdx).setImm(DMask); 3792 } 3793 3794 if (BaseOpcode->Atomic) { 3795 Register VData0 = MI.getOperand(2).getReg(); 3796 LLT Ty = MRI->getType(VData0); 3797 3798 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3799 if (Ty.isVector()) 3800 return false; 3801 3802 if (BaseOpcode->AtomicX2) { 3803 Register VData1 = MI.getOperand(3).getReg(); 3804 // The two values are packed in one register. 3805 LLT PackedTy = LLT::vector(2, Ty); 3806 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3807 MI.getOperand(2).setReg(Concat.getReg(0)); 3808 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3809 } 3810 } 3811 3812 int CorrectedNumVAddrs = NumVAddrs; 3813 3814 // Optimize _L to _LZ when _L is zero 3815 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3816 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3817 const ConstantFP *ConstantLod; 3818 const int LodIdx = AddrIdx + NumVAddrs - 1; 3819 3820 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3821 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3822 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3823 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3824 LZMappingInfo->LZ, ImageDimIntr->Dim); 3825 3826 // The starting indexes should remain in the same place. 3827 --NumVAddrs; 3828 --CorrectedNumVAddrs; 3829 3830 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3831 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3832 MI.RemoveOperand(LodIdx); 3833 } 3834 } 3835 } 3836 3837 // Optimize _mip away, when 'lod' is zero 3838 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3839 int64_t ConstantLod; 3840 const int LodIdx = AddrIdx + NumVAddrs - 1; 3841 3842 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3843 if (ConstantLod == 0) { 3844 // TODO: Change intrinsic opcode and remove operand instead or replacing 3845 // it with 0, as the _L to _LZ handling is done above. 3846 MI.getOperand(LodIdx).ChangeToImmediate(0); 3847 --CorrectedNumVAddrs; 3848 } 3849 } 3850 } 3851 3852 // Rewrite the addressing register layout before doing anything else. 
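  // Packing is only done when the subtarget has the matching feature: A16 also
  // requires the gradients to be 16-bit, and G16 alone requires the g16
  // feature. Otherwise legalization fails.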
3853 if (IsA16 || IsG16) { 3854 if (IsA16) { 3855 // Target must support the feature and gradients need to be 16 bit too 3856 if (!ST.hasA16() || !IsG16) 3857 return false; 3858 } else if (!ST.hasG16()) 3859 return false; 3860 3861 if (NumVAddrs > 1) { 3862 SmallVector<Register, 4> PackedRegs; 3863 // Don't compress addresses for G16 3864 const int PackEndIdx = 3865 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3866 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3867 PackEndIdx, NumGradients); 3868 3869 if (!IsA16) { 3870 // Add uncompressed address 3871 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3872 int AddrReg = MI.getOperand(I).getReg(); 3873 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3874 PackedRegs.push_back(AddrReg); 3875 } 3876 } 3877 3878 // See also below in the non-a16 branch 3879 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3880 3881 if (!UseNSA && PackedRegs.size() > 1) { 3882 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3883 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3884 PackedRegs[0] = Concat.getReg(0); 3885 PackedRegs.resize(1); 3886 } 3887 3888 const int NumPacked = PackedRegs.size(); 3889 for (int I = 0; I != NumVAddrs; ++I) { 3890 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3891 if (!SrcOp.isReg()) { 3892 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3893 continue; 3894 } 3895 3896 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3897 3898 if (I < NumPacked) 3899 SrcOp.setReg(PackedRegs[I]); 3900 else 3901 SrcOp.setReg(AMDGPU::NoRegister); 3902 } 3903 } 3904 } else { 3905 // If the register allocator cannot place the address registers contiguously 3906 // without introducing moves, then using the non-sequential address encoding 3907 // is always preferable, since it saves VALU instructions and is usually a 3908 // wash in terms of code size or even better. 3909 // 3910 // However, we currently have no way of hinting to the register allocator 3911 // that MIMG addresses should be placed contiguously when it is possible to 3912 // do so, so force non-NSA for the common 2-address case as a heuristic. 3913 // 3914 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3915 // allocation when possible. 3916 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3917 3918 if (!UseNSA && NumVAddrs > 1) 3919 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3920 } 3921 3922 int Flags = 0; 3923 if (IsA16) 3924 Flags |= 1; 3925 if (IsG16) 3926 Flags |= 2; 3927 MI.addOperand(MachineOperand::CreateImm(Flags)); 3928 3929 if (BaseOpcode->Store) { // No TFE for stores? 3930 // TODO: Handle dmask trim 3931 Register VData = MI.getOperand(1).getReg(); 3932 LLT Ty = MRI->getType(VData); 3933 if (!Ty.isVector() || Ty.getElementType() != S16) 3934 return true; 3935 3936 Register RepackedReg = handleD16VData(B, *MRI, VData); 3937 if (RepackedReg != VData) { 3938 MI.getOperand(1).setReg(RepackedReg); 3939 } 3940 3941 return true; 3942 } 3943 3944 Register DstReg = MI.getOperand(0).getReg(); 3945 LLT Ty = MRI->getType(DstReg); 3946 const LLT EltTy = Ty.getScalarType(); 3947 const bool IsD16 = Ty.getScalarType() == S16; 3948 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3949 
3950   // Confirm that the return type is large enough for the dmask specified
3951   if (NumElts < DMaskLanes)
3952     return false;
3953 
3954   if (NumElts > 4 || DMaskLanes > 4)
3955     return false;
3956 
3957   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3958   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3959 
3960   // The raw dword aligned data component of the load. The only legal cases
3961   // where this matters should be when using the packed D16 format, for
3962   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3963   LLT RoundedTy;
3964 
3965   // S32 vector to cover all data, plus the TFE result element.
3966   LLT TFETy;
3967 
3968   // Register type to use for each loaded component. Will be S32 or V2S16.
3969   LLT RegTy;
3970 
3971   if (IsD16 && ST.hasUnpackedD16VMem()) {
3972     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3973     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3974     RegTy = S32;
3975   } else {
3976     unsigned EltSize = EltTy.getSizeInBits();
3977     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3978     unsigned RoundedSize = 32 * RoundedElts;
3979     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3980     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3981     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3982   }
3983 
3984   // The return type does not need adjustment.
3985   // TODO: Should we change s16 case to s32 or <2 x s16>?
3986   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3987     return true;
3988 
3989   Register Dst1Reg;
3990 
3991   // Insert after the instruction.
3992   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3993 
3994   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3995   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3996   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3997   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3998 
3999   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4000 
4001   MI.getOperand(0).setReg(NewResultReg);
4002 
4003   // In the IR, TFE is supposed to be used with a 2 element struct return
4004   // type. The instruction really returns these two values in one contiguous
4005   // register, with one additional dword beyond the loaded data. Rewrite the
4006   // return type to use a single register result.
4007 
4008   if (IsTFE) {
4009     Dst1Reg = MI.getOperand(1).getReg();
4010     if (MRI->getType(Dst1Reg) != S32)
4011       return false;
4012 
4013     // TODO: Make sure the TFE operand bit is set.
4014     MI.RemoveOperand(1);
4015 
4016     // Handle the easy case that requires no repack instructions.
4017     if (Ty == S32) {
4018       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4019       return true;
4020     }
4021   }
4022 
4023   // Now figure out how to copy the new result register back into the old
4024   // result.
4025   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4026 
4027   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4028 
4029   if (ResultNumRegs == 1) {
4030     assert(!IsTFE);
4031     ResultRegs[0] = NewResultReg;
4032   } else {
4033     // We have to repack into a new vector of some kind.
4034     for (int I = 0; I != NumDataRegs; ++I)
4035       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4036     B.buildUnmerge(ResultRegs, NewResultReg);
4037 
4038     // Drop the final TFE element to get the data part. The TFE result is
4039     // directly written to the right place already.
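    // (ResultRegs was seeded with Dst1Reg, so for TFE the unmerge above has
    // already defined the status dword directly in its final destination.)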
4040 if (IsTFE) 4041 ResultRegs.resize(NumDataRegs); 4042 } 4043 4044 // For an s16 scalar result, we form an s32 result with a truncate regardless 4045 // of packed vs. unpacked. 4046 if (IsD16 && !Ty.isVector()) { 4047 B.buildTrunc(DstReg, ResultRegs[0]); 4048 return true; 4049 } 4050 4051 // Avoid a build/concat_vector of 1 entry. 4052 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4053 B.buildBitcast(DstReg, ResultRegs[0]); 4054 return true; 4055 } 4056 4057 assert(Ty.isVector()); 4058 4059 if (IsD16) { 4060 // For packed D16 results with TFE enabled, all the data components are 4061 // S32. Cast back to the expected type. 4062 // 4063 // TODO: We don't really need to use load s32 elements. We would only need one 4064 // cast for the TFE result if a multiple of v2s16 was used. 4065 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4066 for (Register &Reg : ResultRegs) 4067 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4068 } else if (ST.hasUnpackedD16VMem()) { 4069 for (Register &Reg : ResultRegs) 4070 Reg = B.buildTrunc(S16, Reg).getReg(0); 4071 } 4072 } 4073 4074 auto padWithUndef = [&](LLT Ty, int NumElts) { 4075 if (NumElts == 0) 4076 return; 4077 Register Undef = B.buildUndef(Ty).getReg(0); 4078 for (int I = 0; I != NumElts; ++I) 4079 ResultRegs.push_back(Undef); 4080 }; 4081 4082 // Pad out any elements eliminated due to the dmask. 4083 LLT ResTy = MRI->getType(ResultRegs[0]); 4084 if (!ResTy.isVector()) { 4085 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4086 B.buildBuildVector(DstReg, ResultRegs); 4087 return true; 4088 } 4089 4090 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4091 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4092 4093 // Deal with the one annoying legal case. 4094 const LLT V3S16 = LLT::vector(3, 16); 4095 if (Ty == V3S16) { 4096 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4097 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4098 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4099 return true; 4100 } 4101 4102 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4103 B.buildConcatVectors(DstReg, ResultRegs); 4104 return true; 4105 } 4106 4107 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4108 MachineInstr &MI, MachineIRBuilder &B, 4109 GISelChangeObserver &Observer) const { 4110 Register Dst = MI.getOperand(0).getReg(); 4111 LLT Ty = B.getMRI()->getType(Dst); 4112 unsigned Size = Ty.getSizeInBits(); 4113 MachineFunction &MF = B.getMF(); 4114 4115 Observer.changingInstr(MI); 4116 4117 // FIXME: We don't really need this intermediate instruction. The intrinsic 4118 // should be fixed to have a memory operand. Since it's readnone, we're not 4119 // allowed to add one. 4120 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4121 MI.RemoveOperand(1); // Remove intrinsic ID 4122 4123 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4124 // TODO: Should this use datalayout alignment? 4125 const unsigned MemSize = (Size + 7) / 8; 4126 const Align MemAlign(4); 4127 MachineMemOperand *MMO = MF.getMachineMemOperand( 4128 MachinePointerInfo(), 4129 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4130 MachineMemOperand::MOInvariant, 4131 MemSize, MemAlign); 4132 MI.addMemOperand(MF, MMO); 4133 4134 // There are no 96-bit result scalar loads, but widening to 128-bit should 4135 // always be legal. 
We may need to restore this to a 96-bit result if it turns
4136 // out this needs to be converted to a vector load during RegBankSelect.
4137   if (!isPowerOf2_32(Size)) {
4138     LegalizerHelper Helper(MF, *this, Observer, B);
4139 
4140     if (Ty.isVector())
4141       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4142     else
4143       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4144   }
4145 
4146   Observer.changedInstr(MI);
4147   return true;
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4151                                                 MachineRegisterInfo &MRI,
4152                                                 MachineIRBuilder &B) const {
4153   // On the non-HSA path, or with the trap handler disabled, insert s_endpgm.
4154   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4155       !ST.isTrapHandlerEnabled()) {
4156     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4157   } else {
4158     // Pass queue pointer to trap handler as input, and insert trap instruction
4159     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4160     const ArgDescriptor *Arg =
4161         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4162     if (!Arg)
4163       return false;
4164     MachineRegisterInfo &MRI = *B.getMRI();
4165     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4166     Register LiveIn = getLiveInRegister(
4167         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4168         /*InsertLiveInCopy=*/false);
4169     if (!loadInputValue(LiveIn, B, Arg))
4170       return false;
4171     B.buildCopy(SGPR01, LiveIn);
4172     B.buildInstr(AMDGPU::S_TRAP)
4173         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4174         .addReg(SGPR01, RegState::Implicit);
4175   }
4176 
4177   MI.eraseFromParent();
4178   return true;
4179 }
4180 
4181 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4182     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4183   // On the non-HSA path, or with the trap handler disabled, report a warning
4184   // accordingly.
4185   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4186       !ST.isTrapHandlerEnabled()) {
4187     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4188                                      "debugtrap handler not supported",
4189                                      MI.getDebugLoc(), DS_Warning);
4190     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4191     Ctx.diagnose(NoTrap);
4192   } else {
4193     // Insert debug-trap instruction
4194     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4195   }
4196 
4197   MI.eraseFromParent();
4198   return true;
4199 }
4200 
4201 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4202                                             MachineInstr &MI) const {
4203   MachineIRBuilder &B = Helper.MIRBuilder;
4204   MachineRegisterInfo &MRI = *B.getMRI();
4205 
4206   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
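  // The amdgcn.if/else/loop intrinsics must be consumed by a G_BRCOND;
  // verifyCFIntrinsic finds that branch (and any trailing G_BR) so the pair
  // can be rewritten into SI_IF / SI_ELSE / SI_LOOP with swapped branch
  // targets.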
4207 auto IntrID = MI.getIntrinsicID(); 4208 switch (IntrID) { 4209 case Intrinsic::amdgcn_if: 4210 case Intrinsic::amdgcn_else: { 4211 MachineInstr *Br = nullptr; 4212 MachineBasicBlock *UncondBrTarget = nullptr; 4213 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4214 const SIRegisterInfo *TRI 4215 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4216 4217 Register Def = MI.getOperand(1).getReg(); 4218 Register Use = MI.getOperand(3).getReg(); 4219 4220 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4221 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4222 if (IntrID == Intrinsic::amdgcn_if) { 4223 B.buildInstr(AMDGPU::SI_IF) 4224 .addDef(Def) 4225 .addUse(Use) 4226 .addMBB(UncondBrTarget); 4227 } else { 4228 B.buildInstr(AMDGPU::SI_ELSE) 4229 .addDef(Def) 4230 .addUse(Use) 4231 .addMBB(UncondBrTarget) 4232 .addImm(0); 4233 } 4234 4235 if (Br) { 4236 Br->getOperand(0).setMBB(CondBrTarget); 4237 } else { 4238 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4239 // since we're swapping branch targets it needs to be reinserted. 4240 // FIXME: IRTranslator should probably not do this 4241 B.buildBr(*CondBrTarget); 4242 } 4243 4244 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4245 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4246 MI.eraseFromParent(); 4247 BrCond->eraseFromParent(); 4248 return true; 4249 } 4250 4251 return false; 4252 } 4253 case Intrinsic::amdgcn_loop: { 4254 MachineInstr *Br = nullptr; 4255 MachineBasicBlock *UncondBrTarget = nullptr; 4256 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4257 const SIRegisterInfo *TRI 4258 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4259 4260 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4261 Register Reg = MI.getOperand(2).getReg(); 4262 4263 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4264 B.buildInstr(AMDGPU::SI_LOOP) 4265 .addUse(Reg) 4266 .addMBB(UncondBrTarget); 4267 4268 if (Br) 4269 Br->getOperand(0).setMBB(CondBrTarget); 4270 else 4271 B.buildBr(*CondBrTarget); 4272 4273 MI.eraseFromParent(); 4274 BrCond->eraseFromParent(); 4275 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4276 return true; 4277 } 4278 4279 return false; 4280 } 4281 case Intrinsic::amdgcn_kernarg_segment_ptr: 4282 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4283 // This only makes sense to call in a kernel, so just lower to null. 
4284 B.buildConstant(MI.getOperand(0).getReg(), 0); 4285 MI.eraseFromParent(); 4286 return true; 4287 } 4288 4289 return legalizePreloadedArgIntrin( 4290 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4291 case Intrinsic::amdgcn_implicitarg_ptr: 4292 return legalizeImplicitArgPtr(MI, MRI, B); 4293 case Intrinsic::amdgcn_workitem_id_x: 4294 return legalizePreloadedArgIntrin(MI, MRI, B, 4295 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4296 case Intrinsic::amdgcn_workitem_id_y: 4297 return legalizePreloadedArgIntrin(MI, MRI, B, 4298 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4299 case Intrinsic::amdgcn_workitem_id_z: 4300 return legalizePreloadedArgIntrin(MI, MRI, B, 4301 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4302 case Intrinsic::amdgcn_workgroup_id_x: 4303 return legalizePreloadedArgIntrin(MI, MRI, B, 4304 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4305 case Intrinsic::amdgcn_workgroup_id_y: 4306 return legalizePreloadedArgIntrin(MI, MRI, B, 4307 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4308 case Intrinsic::amdgcn_workgroup_id_z: 4309 return legalizePreloadedArgIntrin(MI, MRI, B, 4310 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4311 case Intrinsic::amdgcn_dispatch_ptr: 4312 return legalizePreloadedArgIntrin(MI, MRI, B, 4313 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4314 case Intrinsic::amdgcn_queue_ptr: 4315 return legalizePreloadedArgIntrin(MI, MRI, B, 4316 AMDGPUFunctionArgInfo::QUEUE_PTR); 4317 case Intrinsic::amdgcn_implicit_buffer_ptr: 4318 return legalizePreloadedArgIntrin( 4319 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4320 case Intrinsic::amdgcn_dispatch_id: 4321 return legalizePreloadedArgIntrin(MI, MRI, B, 4322 AMDGPUFunctionArgInfo::DISPATCH_ID); 4323 case Intrinsic::amdgcn_fdiv_fast: 4324 return legalizeFDIVFastIntrin(MI, MRI, B); 4325 case Intrinsic::amdgcn_is_shared: 4326 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4327 case Intrinsic::amdgcn_is_private: 4328 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4329 case Intrinsic::amdgcn_wavefrontsize: { 4330 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4331 MI.eraseFromParent(); 4332 return true; 4333 } 4334 case Intrinsic::amdgcn_s_buffer_load: 4335 return legalizeSBufferLoad(MI, B, Helper.Observer); 4336 case Intrinsic::amdgcn_raw_buffer_store: 4337 case Intrinsic::amdgcn_struct_buffer_store: 4338 return legalizeBufferStore(MI, MRI, B, false, false); 4339 case Intrinsic::amdgcn_raw_buffer_store_format: 4340 case Intrinsic::amdgcn_struct_buffer_store_format: 4341 return legalizeBufferStore(MI, MRI, B, false, true); 4342 case Intrinsic::amdgcn_raw_tbuffer_store: 4343 case Intrinsic::amdgcn_struct_tbuffer_store: 4344 return legalizeBufferStore(MI, MRI, B, true, true); 4345 case Intrinsic::amdgcn_raw_buffer_load: 4346 case Intrinsic::amdgcn_struct_buffer_load: 4347 return legalizeBufferLoad(MI, MRI, B, false, false); 4348 case Intrinsic::amdgcn_raw_buffer_load_format: 4349 case Intrinsic::amdgcn_struct_buffer_load_format: 4350 return legalizeBufferLoad(MI, MRI, B, true, false); 4351 case Intrinsic::amdgcn_raw_tbuffer_load: 4352 case Intrinsic::amdgcn_struct_tbuffer_load: 4353 return legalizeBufferLoad(MI, MRI, B, true, true); 4354 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4355 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4356 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4357 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4358 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4359 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4360 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4361 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4362 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4363 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4364 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4365 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4366 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4367 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4368 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4369 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4370 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4371 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4372 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4373 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4374 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4375 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4376 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4377 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4378 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4379 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4380 return legalizeBufferAtomic(MI, B, IntrID); 4381 case Intrinsic::amdgcn_atomic_inc: 4382 return legalizeAtomicIncDec(MI, B, true); 4383 case Intrinsic::amdgcn_atomic_dec: 4384 return legalizeAtomicIncDec(MI, B, false); 4385 case Intrinsic::trap: 4386 return legalizeTrapIntrinsic(MI, MRI, B); 4387 case Intrinsic::debugtrap: 4388 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4389 default: { 4390 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4391 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4392 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4393 return true; 4394 } 4395 } 4396 4397 return true; 4398 } 4399