1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUTargetMachine.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/ADT/ScopeExit.h" 21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24 #include "llvm/CodeGen/TargetOpcodes.h" 25 #include "llvm/CodeGen/ValueTypes.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Debug.h" 30 31 #define DEBUG_TYPE "amdgpu-legalinfo" 32 33 using namespace llvm; 34 using namespace LegalizeActions; 35 using namespace LegalizeMutations; 36 using namespace LegalityPredicates; 37 using namespace MIPatternMatch; 38 39 // Hack until load/store selection patterns support any tuple of legal types. 40 static cl::opt<bool> EnableNewLegality( 41 "amdgpu-global-isel-new-legality", 42 cl::desc("Use GlobalISel desired legality, rather than try to use" 43 "rules compatible with selection patterns"), 44 cl::init(false), 45 cl::ReallyHidden); 46 47 static constexpr unsigned MaxRegisterSize = 1024; 48 49 // Round the number of elements to the next power of two elements 50 static LLT getPow2VectorType(LLT Ty) { 51 unsigned NElts = Ty.getNumElements(); 52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53 return Ty.changeNumElements(Pow2NElts); 54 } 55 56 // Round the number of bits to the next power of two bits 57 static LLT getPow2ScalarType(LLT Ty) { 58 unsigned Bits = Ty.getSizeInBits(); 59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 60 return LLT::scalar(Pow2Bits); 61 } 62 63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 64 return [=](const LegalityQuery &Query) { 65 const LLT Ty = Query.Types[TypeIdx]; 66 return Ty.isVector() && 67 Ty.getNumElements() % 2 != 0 && 68 Ty.getElementType().getSizeInBits() < 32 && 69 Ty.getSizeInBits() % 32 != 0; 70 }; 71 } 72 73 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 74 return [=](const LegalityQuery &Query) { 75 const LLT Ty = Query.Types[TypeIdx]; 76 const LLT EltTy = Ty.getScalarType(); 77 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 78 }; 79 } 80 81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 82 return [=](const LegalityQuery &Query) { 83 const LLT Ty = Query.Types[TypeIdx]; 84 const LLT EltTy = Ty.getElementType(); 85 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 86 }; 87 } 88 89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 90 return [=](const LegalityQuery &Query) { 91 const LLT Ty = Query.Types[TypeIdx]; 92 const LLT EltTy = Ty.getElementType(); 93 unsigned Size = Ty.getSizeInBits(); 94 unsigned Pieces = (Size + 63) / 64; 95 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 96 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 97 }; 98 } 99 100 // Increase the number of vector elements to reach the next multiple of 32-bit 101 // type. 102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 106 const LLT EltTy = Ty.getElementType(); 107 const int Size = Ty.getSizeInBits(); 108 const int EltSize = EltTy.getSizeInBits(); 109 const int NextMul32 = (Size + 31) / 32; 110 111 assert(EltSize < 32); 112 113 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 114 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 115 }; 116 } 117 118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 unsigned Size = Ty.getSizeInBits(); 122 123 LLT CoercedTy; 124 if (Size <= 32) { 125 // <2 x s8> -> s16 126 // <4 x s8> -> s32 127 CoercedTy = LLT::scalar(Size); 128 } else 129 CoercedTy = LLT::scalarOrVector(Size / 32, 32); 130 131 return std::make_pair(TypeIdx, CoercedTy); 132 }; 133 } 134 135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 136 return [=](const LegalityQuery &Query) { 137 const LLT QueryTy = Query.Types[TypeIdx]; 138 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 139 }; 140 } 141 142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 143 return [=](const LegalityQuery &Query) { 144 const LLT QueryTy = Query.Types[TypeIdx]; 145 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 146 }; 147 } 148 149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 150 return [=](const LegalityQuery &Query) { 151 const LLT QueryTy = Query.Types[TypeIdx]; 152 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 153 }; 154 } 155 156 static bool isRegisterSize(unsigned Size) { 157 return Size % 32 == 0 && Size <= MaxRegisterSize; 158 } 159 160 static bool isRegisterVectorElementType(LLT EltTy) { 161 const int EltSize = EltTy.getSizeInBits(); 162 return EltSize == 16 || EltSize % 32 == 0; 163 } 164 165 static bool isRegisterVectorType(LLT Ty) { 166 const int EltSize = Ty.getElementType().getSizeInBits(); 167 return EltSize == 32 || EltSize == 64 || 168 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 169 EltSize == 128 || EltSize == 256; 170 } 171 172 static bool isRegisterType(LLT Ty) { 173 if (!isRegisterSize(Ty.getSizeInBits())) 174 return false; 175 176 if (Ty.isVector()) 177 return isRegisterVectorType(Ty); 178 179 return true; 180 } 181 182 // Any combination of 32 or 64-bit elements up the maximum register size, and 183 // multiples of v2s16. 184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 426 .legalFor({S32, S16, V2S16}) 427 .clampScalar(0, S16, S32) 428 .clampMaxNumElements(0, S16, 2) 429 .scalarize(0) 430 .widenScalarToNextPow2(0, 32); 431 } else if (ST.has16BitInsts()) { 432 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 433 .legalFor({S32, S16}) 434 .clampScalar(0, S16, S32) 435 .scalarize(0) 436 .widenScalarToNextPow2(0, 32); 437 } else { 438 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 439 .legalFor({S32}) 440 .clampScalar(0, S32, S32) 441 .scalarize(0); 442 } 443 444 // FIXME: Not really legal. Placeholder for custom lowering. 445 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 446 .customFor({S32, S64}) 447 .clampScalar(0, S32, S64) 448 .widenScalarToNextPow2(0, 32) 449 .scalarize(0); 450 451 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 452 .legalFor({S32}) 453 .clampScalar(0, S32, S32) 454 .scalarize(0); 455 456 // Report legal for any types we can handle anywhere. For the cases only legal 457 // on the SALU, RegBankSelect will be able to re-legalize. 458 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 459 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 460 .clampScalar(0, S32, S64) 461 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 462 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 463 .widenScalarToNextPow2(0) 464 .scalarize(0); 465 466 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 467 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 468 .legalFor({{S32, S1}, {S32, S32}}) 469 .minScalar(0, S32) 470 // TODO: .scalarize(0) 471 .lower(); 472 473 getActionDefinitionsBuilder(G_BITCAST) 474 // Don't worry about the size constraint. 475 .legalIf(all(isRegisterType(0), isRegisterType(1))) 476 .lower(); 477 478 479 getActionDefinitionsBuilder(G_CONSTANT) 480 .legalFor({S1, S32, S64, S16, GlobalPtr, 481 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 482 .clampScalar(0, S32, S64) 483 .widenScalarToNextPow2(0) 484 .legalIf(isPointer(0)); 485 486 getActionDefinitionsBuilder(G_FCONSTANT) 487 .legalFor({S32, S64, S16}) 488 .clampScalar(0, S16, S64); 489 490 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 491 .legalIf(isRegisterType(0)) 492 // s1 and s16 are special cases because they have legal operations on 493 // them, but don't really occupy registers in the normal way. 494 .legalFor({S1, S16}) 495 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 496 .clampScalarOrElt(0, S32, MaxScalar) 497 .widenScalarToNextPow2(0, 32) 498 .clampMaxNumElements(0, S32, 16); 499 500 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 501 502 // If the amount is divergent, we have to do a wave reduction to get the 503 // maximum value, so this is expanded during RegBankSelect. 504 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 505 .legalFor({{PrivatePtr, S32}}); 506 507 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 508 .unsupportedFor({PrivatePtr}) 509 .custom(); 510 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 511 512 auto &FPOpActions = getActionDefinitionsBuilder( 513 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 514 .legalFor({S32, S64}); 515 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 516 .customFor({S32, S64}); 517 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 518 .customFor({S32, S64}); 519 520 if (ST.has16BitInsts()) { 521 if (ST.hasVOP3PInsts()) 522 FPOpActions.legalFor({S16, V2S16}); 523 else 524 FPOpActions.legalFor({S16}); 525 526 TrigActions.customFor({S16}); 527 FDIVActions.customFor({S16}); 528 } 529 530 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 531 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 532 533 if (ST.hasVOP3PInsts()) { 534 MinNumMaxNum.customFor(FPTypesPK16) 535 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 536 .clampMaxNumElements(0, S16, 2) 537 .clampScalar(0, S16, S64) 538 .scalarize(0); 539 } else if (ST.has16BitInsts()) { 540 MinNumMaxNum.customFor(FPTypes16) 541 .clampScalar(0, S16, S64) 542 .scalarize(0); 543 } else { 544 MinNumMaxNum.customFor(FPTypesBase) 545 .clampScalar(0, S32, S64) 546 .scalarize(0); 547 } 548 549 if (ST.hasVOP3PInsts()) 550 FPOpActions.clampMaxNumElements(0, S16, 2); 551 552 FPOpActions 553 .scalarize(0) 554 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 555 556 TrigActions 557 .scalarize(0) 558 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 559 560 FDIVActions 561 .scalarize(0) 562 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 563 564 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 565 .legalFor(FPTypesPK16) 566 .clampMaxNumElements(0, S16, 2) 567 .scalarize(0) 568 .clampScalar(0, S16, S64); 569 570 if (ST.has16BitInsts()) { 571 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 572 .legalFor({S32, S64, S16}) 573 .scalarize(0) 574 .clampScalar(0, S16, S64); 575 } else { 576 getActionDefinitionsBuilder(G_FSQRT) 577 .legalFor({S32, S64}) 578 .scalarize(0) 579 .clampScalar(0, S32, S64); 580 581 if (ST.hasFractBug()) { 582 getActionDefinitionsBuilder(G_FFLOOR) 583 .customFor({S64}) 584 .legalFor({S32, S64}) 585 .scalarize(0) 586 .clampScalar(0, S32, S64); 587 } else { 588 getActionDefinitionsBuilder(G_FFLOOR) 589 .legalFor({S32, S64}) 590 .scalarize(0) 591 .clampScalar(0, S32, S64); 592 } 593 } 594 595 getActionDefinitionsBuilder(G_FPTRUNC) 596 .legalFor({{S32, S64}, {S16, S32}}) 597 .scalarize(0) 598 .lower(); 599 600 getActionDefinitionsBuilder(G_FPEXT) 601 .legalFor({{S64, S32}, {S32, S16}}) 602 .lowerFor({{S64, S16}}) // FIXME: Implement 603 .scalarize(0); 604 605 getActionDefinitionsBuilder(G_FSUB) 606 // Use actual fsub instruction 607 .legalFor({S32}) 608 // Must use fadd + fneg 609 .lowerFor({S64, S16, V2S16}) 610 .scalarize(0) 611 .clampScalar(0, S32, S64); 612 613 // Whether this is legal depends on the floating point mode for the function. 614 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 615 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 616 FMad.customFor({S32, S16}); 617 else if (ST.hasMadMacF32Insts()) 618 FMad.customFor({S32}); 619 else if (ST.hasMadF16()) 620 FMad.customFor({S16}); 621 FMad.scalarize(0) 622 .lower(); 623 624 // TODO: Do we need to clamp maximum bitwidth? 625 getActionDefinitionsBuilder(G_TRUNC) 626 .legalIf(isScalar(0)) 627 .legalFor({{V2S16, V2S32}}) 628 .clampMaxNumElements(0, S16, 2) 629 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 630 // situations (like an invalid implicit use), we don't want to infinite loop 631 // in the legalizer. 632 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 633 .alwaysLegal(); 634 635 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 636 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 637 {S32, S1}, {S64, S1}, {S16, S1}}) 638 .scalarize(0) 639 .clampScalar(0, S32, S64) 640 .widenScalarToNextPow2(1, 32); 641 642 // TODO: Split s1->s64 during regbankselect for VALU. 643 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 644 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 645 .lowerFor({{S32, S64}}) 646 .lowerIf(typeIs(1, S1)) 647 .customFor({{S64, S64}}); 648 if (ST.has16BitInsts()) 649 IToFP.legalFor({{S16, S16}}); 650 IToFP.clampScalar(1, S32, S64) 651 .minScalar(0, S32) 652 .scalarize(0) 653 .widenScalarToNextPow2(1); 654 655 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 656 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 657 .customFor({{S64, S64}}); 658 if (ST.has16BitInsts()) 659 FPToI.legalFor({{S16, S16}}); 660 else 661 FPToI.minScalar(1, S32); 662 663 FPToI.minScalar(0, S32) 664 .scalarize(0) 665 .lower(); 666 667 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 668 .scalarize(0) 669 .lower(); 670 671 if (ST.has16BitInsts()) { 672 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 673 .legalFor({S16, S32, S64}) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 677 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 678 .legalFor({S32, S64}) 679 .clampScalar(0, S32, S64) 680 .scalarize(0); 681 } else { 682 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 683 .legalFor({S32}) 684 .customFor({S64}) 685 .clampScalar(0, S32, S64) 686 .scalarize(0); 687 } 688 689 // FIXME: Clamp offset operand. 690 getActionDefinitionsBuilder(G_PTR_ADD) 691 .legalIf(isPointer(0)) 692 .scalarize(0); 693 694 getActionDefinitionsBuilder(G_PTRMASK) 695 .legalIf(typeInSet(1, {S64, S32})) 696 .minScalar(1, S32) 697 .maxScalarIf(sizeIs(0, 32), 1, S32) 698 .maxScalarIf(sizeIs(0, 64), 1, S64) 699 .scalarize(0); 700 701 auto &CmpBuilder = 702 getActionDefinitionsBuilder(G_ICMP) 703 // The compare output type differs based on the register bank of the output, 704 // so make both s1 and s32 legal. 705 // 706 // Scalar compares producing output in scc will be promoted to s32, as that 707 // is the allocatable register type that will be needed for the copy from 708 // scc. This will be promoted during RegBankSelect, and we assume something 709 // before that won't try to use s32 result types. 710 // 711 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 712 // bank. 713 .legalForCartesianProduct( 714 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 715 .legalForCartesianProduct( 716 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 717 if (ST.has16BitInsts()) { 718 CmpBuilder.legalFor({{S1, S16}}); 719 } 720 721 CmpBuilder 722 .widenScalarToNextPow2(1) 723 .clampScalar(1, S32, S64) 724 .scalarize(0) 725 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 726 727 getActionDefinitionsBuilder(G_FCMP) 728 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 729 .widenScalarToNextPow2(1) 730 .clampScalar(1, S32, S64) 731 .scalarize(0); 732 733 // FIXME: fpow has a selection pattern that should move to custom lowering. 734 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 735 if (ST.has16BitInsts()) 736 Exp2Ops.legalFor({S32, S16}); 737 else 738 Exp2Ops.legalFor({S32}); 739 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 740 Exp2Ops.scalarize(0); 741 742 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 743 if (ST.has16BitInsts()) 744 ExpOps.customFor({{S32}, {S16}}); 745 else 746 ExpOps.customFor({S32}); 747 ExpOps.clampScalar(0, MinScalarFPTy, S32) 748 .scalarize(0); 749 750 // The 64-bit versions produce 32-bit results, but only on the SALU. 751 getActionDefinitionsBuilder(G_CTPOP) 752 .legalFor({{S32, S32}, {S32, S64}}) 753 .clampScalar(0, S32, S32) 754 .clampScalar(1, S32, S64) 755 .scalarize(0) 756 .widenScalarToNextPow2(0, 32) 757 .widenScalarToNextPow2(1, 32); 758 759 // The hardware instructions return a different result on 0 than the generic 760 // instructions expect. The hardware produces -1, but these produce the 761 // bitwidth. 762 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 763 .scalarize(0) 764 .clampScalar(0, S32, S32) 765 .clampScalar(1, S32, S64) 766 .widenScalarToNextPow2(0, 32) 767 .widenScalarToNextPow2(1, 32) 768 .lower(); 769 770 // The 64-bit versions produce 32-bit results, but only on the SALU. 771 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 772 .legalFor({{S32, S32}, {S32, S64}}) 773 .clampScalar(0, S32, S32) 774 .clampScalar(1, S32, S64) 775 .scalarize(0) 776 .widenScalarToNextPow2(0, 32) 777 .widenScalarToNextPow2(1, 32); 778 779 getActionDefinitionsBuilder(G_BITREVERSE) 780 .legalFor({S32}) 781 .clampScalar(0, S32, S32) 782 .scalarize(0); 783 784 if (ST.has16BitInsts()) { 785 getActionDefinitionsBuilder(G_BSWAP) 786 .legalFor({S16, S32, V2S16}) 787 .clampMaxNumElements(0, S16, 2) 788 // FIXME: Fixing non-power-of-2 before clamp is workaround for 789 // narrowScalar limitation. 790 .widenScalarToNextPow2(0) 791 .clampScalar(0, S16, S32) 792 .scalarize(0); 793 794 if (ST.hasVOP3PInsts()) { 795 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 796 .legalFor({S32, S16, V2S16}) 797 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 798 .clampMaxNumElements(0, S16, 2) 799 .minScalar(0, S16) 800 .widenScalarToNextPow2(0) 801 .scalarize(0) 802 .lower(); 803 } else { 804 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 805 .legalFor({S32, S16}) 806 .widenScalarToNextPow2(0) 807 .minScalar(0, S16) 808 .scalarize(0) 809 .lower(); 810 } 811 } else { 812 // TODO: Should have same legality without v_perm_b32 813 getActionDefinitionsBuilder(G_BSWAP) 814 .legalFor({S32}) 815 .lowerIf(scalarNarrowerThan(0, 32)) 816 // FIXME: Fixing non-power-of-2 before clamp is workaround for 817 // narrowScalar limitation. 818 .widenScalarToNextPow2(0) 819 .maxScalar(0, S32) 820 .scalarize(0) 821 .lower(); 822 823 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 824 .legalFor({S32}) 825 .minScalar(0, S32) 826 .widenScalarToNextPow2(0) 827 .scalarize(0) 828 .lower(); 829 } 830 831 getActionDefinitionsBuilder(G_INTTOPTR) 832 // List the common cases 833 .legalForCartesianProduct(AddrSpaces64, {S64}) 834 .legalForCartesianProduct(AddrSpaces32, {S32}) 835 .scalarize(0) 836 // Accept any address space as long as the size matches 837 .legalIf(sameSize(0, 1)) 838 .widenScalarIf(smallerThan(1, 0), 839 [](const LegalityQuery &Query) { 840 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 841 }) 842 .narrowScalarIf(largerThan(1, 0), 843 [](const LegalityQuery &Query) { 844 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 845 }); 846 847 getActionDefinitionsBuilder(G_PTRTOINT) 848 // List the common cases 849 .legalForCartesianProduct(AddrSpaces64, {S64}) 850 .legalForCartesianProduct(AddrSpaces32, {S32}) 851 .scalarize(0) 852 // Accept any address space as long as the size matches 853 .legalIf(sameSize(0, 1)) 854 .widenScalarIf(smallerThan(0, 1), 855 [](const LegalityQuery &Query) { 856 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 857 }) 858 .narrowScalarIf( 859 largerThan(0, 1), 860 [](const LegalityQuery &Query) { 861 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 862 }); 863 864 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 865 .scalarize(0) 866 .custom(); 867 868 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 869 bool IsLoad) -> bool { 870 const LLT DstTy = Query.Types[0]; 871 872 // Split vector extloads. 873 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 874 unsigned Align = Query.MMODescrs[0].AlignInBits; 875 876 if (MemSize < DstTy.getSizeInBits()) 877 MemSize = std::max(MemSize, Align); 878 879 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 880 return true; 881 882 const LLT PtrTy = Query.Types[1]; 883 unsigned AS = PtrTy.getAddressSpace(); 884 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 885 return true; 886 887 // Catch weird sized loads that don't evenly divide into the access sizes 888 // TODO: May be able to widen depending on alignment etc. 889 unsigned NumRegs = (MemSize + 31) / 32; 890 if (NumRegs == 3) { 891 if (!ST.hasDwordx3LoadStores()) 892 return true; 893 } else { 894 // If the alignment allows, these should have been widened. 895 if (!isPowerOf2_32(NumRegs)) 896 return true; 897 } 898 899 if (Align < MemSize) { 900 const SITargetLowering *TLI = ST.getTargetLowering(); 901 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 902 } 903 904 return false; 905 }; 906 907 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 908 unsigned Opc) -> bool { 909 unsigned Size = Query.Types[0].getSizeInBits(); 910 if (isPowerOf2_32(Size)) 911 return false; 912 913 if (Size == 96 && ST.hasDwordx3LoadStores()) 914 return false; 915 916 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 917 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 918 return false; 919 920 unsigned Align = Query.MMODescrs[0].AlignInBits; 921 unsigned RoundedSize = NextPowerOf2(Size); 922 return (Align >= RoundedSize); 923 }; 924 925 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 926 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 927 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 928 929 // TODO: Refine based on subtargets which support unaligned access or 128-bit 930 // LDS 931 // TODO: Unsupported flat for SI. 932 933 for (unsigned Op : {G_LOAD, G_STORE}) { 934 const bool IsStore = Op == G_STORE; 935 936 auto &Actions = getActionDefinitionsBuilder(Op); 937 // Explicitly list some common cases. 938 // TODO: Does this help compile time at all? 939 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 940 {V2S32, GlobalPtr, 64, GlobalAlign32}, 941 {V4S32, GlobalPtr, 128, GlobalAlign32}, 942 {S64, GlobalPtr, 64, GlobalAlign32}, 943 {V2S64, GlobalPtr, 128, GlobalAlign32}, 944 {V2S16, GlobalPtr, 32, GlobalAlign32}, 945 {S32, GlobalPtr, 8, GlobalAlign8}, 946 {S32, GlobalPtr, 16, GlobalAlign16}, 947 948 {S32, LocalPtr, 32, 32}, 949 {S64, LocalPtr, 64, 32}, 950 {V2S32, LocalPtr, 64, 32}, 951 {S32, LocalPtr, 8, 8}, 952 {S32, LocalPtr, 16, 16}, 953 {V2S16, LocalPtr, 32, 32}, 954 955 {S32, PrivatePtr, 32, 32}, 956 {S32, PrivatePtr, 8, 8}, 957 {S32, PrivatePtr, 16, 16}, 958 {V2S16, PrivatePtr, 32, 32}, 959 960 {S32, ConstantPtr, 32, GlobalAlign32}, 961 {V2S32, ConstantPtr, 64, GlobalAlign32}, 962 {V4S32, ConstantPtr, 128, GlobalAlign32}, 963 {S64, ConstantPtr, 64, GlobalAlign32}, 964 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 965 Actions.legalIf( 966 [=](const LegalityQuery &Query) -> bool { 967 return isLoadStoreLegal(ST, Query, Op); 968 }); 969 970 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 971 // 64-bits. 972 // 973 // TODO: Should generalize bitcast action into coerce, which will also cover 974 // inserting addrspacecasts. 975 Actions.customIf(typeIs(1, Constant32Ptr)); 976 977 // Turn any illegal element vectors into something easier to deal 978 // with. These will ultimately produce 32-bit scalar shifts to extract the 979 // parts anyway. 980 // 981 // For odd 16-bit element vectors, prefer to split those into pieces with 982 // 16-bit vector parts. 983 Actions.bitcastIf( 984 [=](const LegalityQuery &Query) -> bool { 985 const LLT Ty = Query.Types[0]; 986 const unsigned Size = Ty.getSizeInBits(); 987 988 if (Size != Query.MMODescrs[0].SizeInBits) 989 return Size <= 32 && Ty.isVector(); 990 991 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 992 return true; 993 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 994 !isRegisterVectorElementType(Ty.getElementType()); 995 }, bitcastToRegisterType(0)); 996 997 Actions 998 .customIf(typeIs(1, Constant32Ptr)) 999 // Widen suitably aligned loads by loading extra elements. 1000 .moreElementsIf([=](const LegalityQuery &Query) { 1001 const LLT Ty = Query.Types[0]; 1002 return Op == G_LOAD && Ty.isVector() && 1003 shouldWidenLoadResult(Query, Op); 1004 }, moreElementsToNextPow2(0)) 1005 .widenScalarIf([=](const LegalityQuery &Query) { 1006 const LLT Ty = Query.Types[0]; 1007 return Op == G_LOAD && !Ty.isVector() && 1008 shouldWidenLoadResult(Query, Op); 1009 }, widenScalarOrEltToNextPow2(0)) 1010 .narrowScalarIf( 1011 [=](const LegalityQuery &Query) -> bool { 1012 return !Query.Types[0].isVector() && 1013 needToSplitMemOp(Query, Op == G_LOAD); 1014 }, 1015 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1016 const LLT DstTy = Query.Types[0]; 1017 const LLT PtrTy = Query.Types[1]; 1018 1019 const unsigned DstSize = DstTy.getSizeInBits(); 1020 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1021 1022 // Split extloads. 1023 if (DstSize > MemSize) 1024 return std::make_pair(0, LLT::scalar(MemSize)); 1025 1026 if (!isPowerOf2_32(DstSize)) { 1027 // We're probably decomposing an odd sized store. Try to split 1028 // to the widest type. TODO: Account for alignment. As-is it 1029 // should be OK, since the new parts will be further legalized. 1030 unsigned FloorSize = PowerOf2Floor(DstSize); 1031 return std::make_pair(0, LLT::scalar(FloorSize)); 1032 } 1033 1034 if (DstSize > 32 && (DstSize % 32 != 0)) { 1035 // FIXME: Need a way to specify non-extload of larger size if 1036 // suitably aligned. 1037 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1038 } 1039 1040 unsigned MaxSize = maxSizeForAddrSpace(ST, 1041 PtrTy.getAddressSpace(), 1042 Op == G_LOAD); 1043 if (MemSize > MaxSize) 1044 return std::make_pair(0, LLT::scalar(MaxSize)); 1045 1046 unsigned Align = Query.MMODescrs[0].AlignInBits; 1047 return std::make_pair(0, LLT::scalar(Align)); 1048 }) 1049 .fewerElementsIf( 1050 [=](const LegalityQuery &Query) -> bool { 1051 return Query.Types[0].isVector() && 1052 needToSplitMemOp(Query, Op == G_LOAD); 1053 }, 1054 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1055 const LLT DstTy = Query.Types[0]; 1056 const LLT PtrTy = Query.Types[1]; 1057 1058 LLT EltTy = DstTy.getElementType(); 1059 unsigned MaxSize = maxSizeForAddrSpace(ST, 1060 PtrTy.getAddressSpace(), 1061 Op == G_LOAD); 1062 1063 // FIXME: Handle widened to power of 2 results better. This ends 1064 // up scalarizing. 1065 // FIXME: 3 element stores scalarized on SI 1066 1067 // Split if it's too large for the address space. 1068 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1069 unsigned NumElts = DstTy.getNumElements(); 1070 unsigned EltSize = EltTy.getSizeInBits(); 1071 1072 if (MaxSize % EltSize == 0) { 1073 return std::make_pair( 1074 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1075 } 1076 1077 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1078 1079 // FIXME: Refine when odd breakdowns handled 1080 // The scalars will need to be re-legalized. 1081 if (NumPieces == 1 || NumPieces >= NumElts || 1082 NumElts % NumPieces != 0) 1083 return std::make_pair(0, EltTy); 1084 1085 return std::make_pair(0, 1086 LLT::vector(NumElts / NumPieces, EltTy)); 1087 } 1088 1089 // FIXME: We could probably handle weird extending loads better. 1090 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1091 if (DstTy.getSizeInBits() > MemSize) 1092 return std::make_pair(0, EltTy); 1093 1094 unsigned EltSize = EltTy.getSizeInBits(); 1095 unsigned DstSize = DstTy.getSizeInBits(); 1096 if (!isPowerOf2_32(DstSize)) { 1097 // We're probably decomposing an odd sized store. Try to split 1098 // to the widest type. TODO: Account for alignment. As-is it 1099 // should be OK, since the new parts will be further legalized. 1100 unsigned FloorSize = PowerOf2Floor(DstSize); 1101 return std::make_pair( 1102 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1103 } 1104 1105 // Need to split because of alignment. 1106 unsigned Align = Query.MMODescrs[0].AlignInBits; 1107 if (EltSize > Align && 1108 (EltSize / Align < DstTy.getNumElements())) { 1109 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1110 } 1111 1112 // May need relegalization for the scalars. 1113 return std::make_pair(0, EltTy); 1114 }) 1115 .minScalar(0, S32); 1116 1117 if (IsStore) 1118 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1119 1120 // TODO: Need a bitcast lower option? 1121 Actions 1122 .widenScalarToNextPow2(0) 1123 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1124 } 1125 1126 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1127 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1128 {S32, GlobalPtr, 16, 2 * 8}, 1129 {S32, LocalPtr, 8, 8}, 1130 {S32, LocalPtr, 16, 16}, 1131 {S32, PrivatePtr, 8, 8}, 1132 {S32, PrivatePtr, 16, 16}, 1133 {S32, ConstantPtr, 8, 8}, 1134 {S32, ConstantPtr, 16, 2 * 8}}); 1135 if (ST.hasFlatAddressSpace()) { 1136 ExtLoads.legalForTypesWithMemDesc( 1137 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1138 } 1139 1140 ExtLoads.clampScalar(0, S32, S32) 1141 .widenScalarToNextPow2(0) 1142 .unsupportedIfMemSizeNotPow2() 1143 .lower(); 1144 1145 auto &Atomics = getActionDefinitionsBuilder( 1146 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1147 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1148 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1149 G_ATOMICRMW_UMIN}) 1150 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1151 {S64, GlobalPtr}, {S64, LocalPtr}}); 1152 if (ST.hasFlatAddressSpace()) { 1153 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1154 } 1155 1156 if (ST.hasLDSFPAtomics()) { 1157 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1158 .legalFor({{S32, LocalPtr}}); 1159 } 1160 1161 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1162 // demarshalling 1163 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1164 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1165 {S32, FlatPtr}, {S64, FlatPtr}}) 1166 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1167 {S32, RegionPtr}, {S64, RegionPtr}}); 1168 // TODO: Pointer types, any 32-bit or 64-bit vector 1169 1170 // Condition should be s32 for scalar, s1 for vector. 1171 getActionDefinitionsBuilder(G_SELECT) 1172 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1173 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1174 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1175 .clampScalar(0, S16, S64) 1176 .scalarize(1) 1177 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1178 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1179 .clampMaxNumElements(0, S32, 2) 1180 .clampMaxNumElements(0, LocalPtr, 2) 1181 .clampMaxNumElements(0, PrivatePtr, 2) 1182 .scalarize(0) 1183 .widenScalarToNextPow2(0) 1184 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1185 1186 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1187 // be more flexible with the shift amount type. 1188 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1189 .legalFor({{S32, S32}, {S64, S32}}); 1190 if (ST.has16BitInsts()) { 1191 if (ST.hasVOP3PInsts()) { 1192 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1193 .clampMaxNumElements(0, S16, 2); 1194 } else 1195 Shifts.legalFor({{S16, S16}}); 1196 1197 // TODO: Support 16-bit shift amounts for all types 1198 Shifts.widenScalarIf( 1199 [=](const LegalityQuery &Query) { 1200 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1201 // 32-bit amount. 1202 const LLT ValTy = Query.Types[0]; 1203 const LLT AmountTy = Query.Types[1]; 1204 return ValTy.getSizeInBits() <= 16 && 1205 AmountTy.getSizeInBits() < 16; 1206 }, changeTo(1, S16)); 1207 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1208 Shifts.clampScalar(1, S32, S32); 1209 Shifts.clampScalar(0, S16, S64); 1210 Shifts.widenScalarToNextPow2(0, 16); 1211 } else { 1212 // Make sure we legalize the shift amount type first, as the general 1213 // expansion for the shifted type will produce much worse code if it hasn't 1214 // been truncated already. 1215 Shifts.clampScalar(1, S32, S32); 1216 Shifts.clampScalar(0, S32, S64); 1217 Shifts.widenScalarToNextPow2(0, 32); 1218 } 1219 Shifts.scalarize(0); 1220 1221 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1222 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1223 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1224 unsigned IdxTypeIdx = 2; 1225 1226 getActionDefinitionsBuilder(Op) 1227 .customIf([=](const LegalityQuery &Query) { 1228 const LLT EltTy = Query.Types[EltTypeIdx]; 1229 const LLT VecTy = Query.Types[VecTypeIdx]; 1230 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1231 return (EltTy.getSizeInBits() == 16 || 1232 EltTy.getSizeInBits() % 32 == 0) && 1233 VecTy.getSizeInBits() % 32 == 0 && 1234 VecTy.getSizeInBits() <= MaxRegisterSize && 1235 IdxTy.getSizeInBits() == 32; 1236 }) 1237 .clampScalar(EltTypeIdx, S32, S64) 1238 .clampScalar(VecTypeIdx, S32, S64) 1239 .clampScalar(IdxTypeIdx, S32, S32); 1240 } 1241 1242 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1243 .unsupportedIf([=](const LegalityQuery &Query) { 1244 const LLT &EltTy = Query.Types[1].getElementType(); 1245 return Query.Types[0] != EltTy; 1246 }); 1247 1248 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1249 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1250 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1251 1252 // FIXME: Doesn't handle extract of illegal sizes. 1253 getActionDefinitionsBuilder(Op) 1254 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1255 // FIXME: Multiples of 16 should not be legal. 1256 .legalIf([=](const LegalityQuery &Query) { 1257 const LLT BigTy = Query.Types[BigTyIdx]; 1258 const LLT LitTy = Query.Types[LitTyIdx]; 1259 return (BigTy.getSizeInBits() % 32 == 0) && 1260 (LitTy.getSizeInBits() % 16 == 0); 1261 }) 1262 .widenScalarIf( 1263 [=](const LegalityQuery &Query) { 1264 const LLT BigTy = Query.Types[BigTyIdx]; 1265 return (BigTy.getScalarSizeInBits() < 16); 1266 }, 1267 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1268 .widenScalarIf( 1269 [=](const LegalityQuery &Query) { 1270 const LLT LitTy = Query.Types[LitTyIdx]; 1271 return (LitTy.getScalarSizeInBits() < 16); 1272 }, 1273 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1274 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1275 .widenScalarToNextPow2(BigTyIdx, 32); 1276 1277 } 1278 1279 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1280 .legalForCartesianProduct(AllS32Vectors, {S32}) 1281 .legalForCartesianProduct(AllS64Vectors, {S64}) 1282 .clampNumElements(0, V16S32, V32S32) 1283 .clampNumElements(0, V2S64, V16S64) 1284 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1285 1286 if (ST.hasScalarPackInsts()) { 1287 BuildVector 1288 // FIXME: Should probably widen s1 vectors straight to s32 1289 .minScalarOrElt(0, S16) 1290 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1291 .minScalar(1, S32); 1292 1293 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1294 .legalFor({V2S16, S32}) 1295 .lower(); 1296 BuildVector.minScalarOrElt(0, S32); 1297 } else { 1298 BuildVector.customFor({V2S16, S16}); 1299 BuildVector.minScalarOrElt(0, S32); 1300 1301 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1302 .customFor({V2S16, S32}) 1303 .lower(); 1304 } 1305 1306 BuildVector.legalIf(isRegisterType(0)); 1307 1308 // FIXME: Clamp maximum size 1309 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1310 .legalIf(isRegisterType(0)); 1311 1312 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1313 // pre-legalize. 1314 if (ST.hasVOP3PInsts()) { 1315 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1316 .customFor({V2S16, V2S16}) 1317 .lower(); 1318 } else 1319 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1320 1321 // Merge/Unmerge 1322 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1323 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1324 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1325 1326 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1327 const LLT Ty = Query.Types[TypeIdx]; 1328 if (Ty.isVector()) { 1329 const LLT &EltTy = Ty.getElementType(); 1330 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1331 return true; 1332 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1333 return true; 1334 } 1335 return false; 1336 }; 1337 1338 auto &Builder = getActionDefinitionsBuilder(Op) 1339 .lowerFor({{S16, V2S16}}) 1340 .lowerIf([=](const LegalityQuery &Query) { 1341 const LLT BigTy = Query.Types[BigTyIdx]; 1342 return BigTy.getSizeInBits() == 32; 1343 }) 1344 // Try to widen to s16 first for small types. 1345 // TODO: Only do this on targets with legal s16 shifts 1346 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1347 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1348 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1349 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1350 elementTypeIs(1, S16)), 1351 changeTo(1, V2S16)) 1352 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1353 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1354 // valid. 1355 .clampScalar(LitTyIdx, S32, S512) 1356 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1357 // Break up vectors with weird elements into scalars 1358 .fewerElementsIf( 1359 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1360 scalarize(0)) 1361 .fewerElementsIf( 1362 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1363 scalarize(1)) 1364 .clampScalar(BigTyIdx, S32, MaxScalar); 1365 1366 if (Op == G_MERGE_VALUES) { 1367 Builder.widenScalarIf( 1368 // TODO: Use 16-bit shifts if legal for 8-bit values? 1369 [=](const LegalityQuery &Query) { 1370 const LLT Ty = Query.Types[LitTyIdx]; 1371 return Ty.getSizeInBits() < 32; 1372 }, 1373 changeTo(LitTyIdx, S32)); 1374 } 1375 1376 Builder.widenScalarIf( 1377 [=](const LegalityQuery &Query) { 1378 const LLT Ty = Query.Types[BigTyIdx]; 1379 return !isPowerOf2_32(Ty.getSizeInBits()) && 1380 Ty.getSizeInBits() % 16 != 0; 1381 }, 1382 [=](const LegalityQuery &Query) { 1383 // Pick the next power of 2, or a multiple of 64 over 128. 1384 // Whichever is smaller. 1385 const LLT &Ty = Query.Types[BigTyIdx]; 1386 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1387 if (NewSizeInBits >= 256) { 1388 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1389 if (RoundedTo < NewSizeInBits) 1390 NewSizeInBits = RoundedTo; 1391 } 1392 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1393 }) 1394 .legalIf([=](const LegalityQuery &Query) { 1395 const LLT &BigTy = Query.Types[BigTyIdx]; 1396 const LLT &LitTy = Query.Types[LitTyIdx]; 1397 1398 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1399 return false; 1400 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1401 return false; 1402 1403 return BigTy.getSizeInBits() % 16 == 0 && 1404 LitTy.getSizeInBits() % 16 == 0 && 1405 BigTy.getSizeInBits() <= MaxRegisterSize; 1406 }) 1407 // Any vectors left are the wrong size. Scalarize them. 1408 .scalarize(0) 1409 .scalarize(1); 1410 } 1411 1412 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1413 // RegBankSelect. 1414 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1415 .legalFor({{S32}, {S64}}); 1416 1417 if (ST.hasVOP3PInsts()) { 1418 SextInReg.lowerFor({{V2S16}}) 1419 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1420 // get more vector shift opportunities, since we'll get those when 1421 // expanded. 1422 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1423 } else if (ST.has16BitInsts()) { 1424 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1425 } else { 1426 // Prefer to promote to s32 before lowering if we don't have 16-bit 1427 // shifts. This avoid a lot of intermediate truncate and extend operations. 1428 SextInReg.lowerFor({{S32}, {S64}}); 1429 } 1430 1431 // FIXME: Placeholder rule. Really depends on whether the clamp modifier is 1432 // available, and is selectively legal for s16, s32, v2s16. 1433 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) 1434 .scalarize(0) 1435 .clampScalar(0, S16, S32); 1436 1437 SextInReg 1438 .scalarize(0) 1439 .clampScalar(0, S32, S64) 1440 .lower(); 1441 1442 getActionDefinitionsBuilder(G_FSHR) 1443 .legalFor({{S32, S32}}) 1444 .scalarize(0) 1445 .lower(); 1446 1447 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1448 .legalFor({S64}); 1449 1450 getActionDefinitionsBuilder({ 1451 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1452 G_FCOPYSIGN, 1453 1454 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1455 G_READ_REGISTER, 1456 G_WRITE_REGISTER, 1457 1458 G_SADDO, G_SSUBO, 1459 1460 // TODO: Implement 1461 G_FMINIMUM, G_FMAXIMUM, 1462 G_FSHL 1463 }).lower(); 1464 1465 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1466 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1467 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1468 .unsupported(); 1469 1470 computeTables(); 1471 verify(*ST.getInstrInfo()); 1472 } 1473 1474 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1475 MachineInstr &MI) const { 1476 MachineIRBuilder &B = Helper.MIRBuilder; 1477 MachineRegisterInfo &MRI = *B.getMRI(); 1478 GISelChangeObserver &Observer = Helper.Observer; 1479 1480 switch (MI.getOpcode()) { 1481 case TargetOpcode::G_ADDRSPACE_CAST: 1482 return legalizeAddrSpaceCast(MI, MRI, B); 1483 case TargetOpcode::G_FRINT: 1484 return legalizeFrint(MI, MRI, B); 1485 case TargetOpcode::G_FCEIL: 1486 return legalizeFceil(MI, MRI, B); 1487 case TargetOpcode::G_INTRINSIC_TRUNC: 1488 return legalizeIntrinsicTrunc(MI, MRI, B); 1489 case TargetOpcode::G_SITOFP: 1490 return legalizeITOFP(MI, MRI, B, true); 1491 case TargetOpcode::G_UITOFP: 1492 return legalizeITOFP(MI, MRI, B, false); 1493 case TargetOpcode::G_FPTOSI: 1494 return legalizeFPTOI(MI, MRI, B, true); 1495 case TargetOpcode::G_FPTOUI: 1496 return legalizeFPTOI(MI, MRI, B, false); 1497 case TargetOpcode::G_FMINNUM: 1498 case TargetOpcode::G_FMAXNUM: 1499 case TargetOpcode::G_FMINNUM_IEEE: 1500 case TargetOpcode::G_FMAXNUM_IEEE: 1501 return legalizeMinNumMaxNum(Helper, MI); 1502 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1503 return legalizeExtractVectorElt(MI, MRI, B); 1504 case TargetOpcode::G_INSERT_VECTOR_ELT: 1505 return legalizeInsertVectorElt(MI, MRI, B); 1506 case TargetOpcode::G_SHUFFLE_VECTOR: 1507 return legalizeShuffleVector(MI, MRI, B); 1508 case TargetOpcode::G_FSIN: 1509 case TargetOpcode::G_FCOS: 1510 return legalizeSinCos(MI, MRI, B); 1511 case TargetOpcode::G_GLOBAL_VALUE: 1512 return legalizeGlobalValue(MI, MRI, B); 1513 case TargetOpcode::G_LOAD: 1514 return legalizeLoad(MI, MRI, B, Observer); 1515 case TargetOpcode::G_FMAD: 1516 return legalizeFMad(MI, MRI, B); 1517 case TargetOpcode::G_FDIV: 1518 return legalizeFDIV(MI, MRI, B); 1519 case TargetOpcode::G_UDIV: 1520 case TargetOpcode::G_UREM: 1521 return legalizeUDIV_UREM(MI, MRI, B); 1522 case TargetOpcode::G_SDIV: 1523 case TargetOpcode::G_SREM: 1524 return legalizeSDIV_SREM(MI, MRI, B); 1525 case TargetOpcode::G_ATOMIC_CMPXCHG: 1526 return legalizeAtomicCmpXChg(MI, MRI, B); 1527 case TargetOpcode::G_FLOG: 1528 return legalizeFlog(MI, B, numbers::ln2f); 1529 case TargetOpcode::G_FLOG10: 1530 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1531 case TargetOpcode::G_FEXP: 1532 return legalizeFExp(MI, B); 1533 case TargetOpcode::G_FPOW: 1534 return legalizeFPow(MI, B); 1535 case TargetOpcode::G_FFLOOR: 1536 return legalizeFFloor(MI, MRI, B); 1537 case TargetOpcode::G_BUILD_VECTOR: 1538 return legalizeBuildVector(MI, MRI, B); 1539 default: 1540 return false; 1541 } 1542 1543 llvm_unreachable("expected switch to return"); 1544 } 1545 1546 Register AMDGPULegalizerInfo::getSegmentAperture( 1547 unsigned AS, 1548 MachineRegisterInfo &MRI, 1549 MachineIRBuilder &B) const { 1550 MachineFunction &MF = B.getMF(); 1551 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1552 const LLT S32 = LLT::scalar(32); 1553 1554 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1555 1556 if (ST.hasApertureRegs()) { 1557 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1558 // getreg. 1559 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1560 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1561 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1562 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1563 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1564 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1565 unsigned Encoding = 1566 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1567 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1568 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1569 1570 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1571 1572 B.buildInstr(AMDGPU::S_GETREG_B32) 1573 .addDef(GetReg) 1574 .addImm(Encoding); 1575 MRI.setType(GetReg, S32); 1576 1577 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1578 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1579 } 1580 1581 Register QueuePtr = MRI.createGenericVirtualRegister( 1582 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1583 1584 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1585 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1586 return Register(); 1587 1588 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1589 // private_segment_aperture_base_hi. 1590 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1591 1592 // TODO: can we be smarter about machine pointer info? 1593 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1594 MachineMemOperand *MMO = MF.getMachineMemOperand( 1595 PtrInfo, 1596 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1597 MachineMemOperand::MOInvariant, 1598 4, commonAlignment(Align(64), StructOffset)); 1599 1600 Register LoadAddr; 1601 1602 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1603 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1604 } 1605 1606 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1607 MachineInstr &MI, MachineRegisterInfo &MRI, 1608 MachineIRBuilder &B) const { 1609 MachineFunction &MF = B.getMF(); 1610 1611 const LLT S32 = LLT::scalar(32); 1612 Register Dst = MI.getOperand(0).getReg(); 1613 Register Src = MI.getOperand(1).getReg(); 1614 1615 LLT DstTy = MRI.getType(Dst); 1616 LLT SrcTy = MRI.getType(Src); 1617 unsigned DestAS = DstTy.getAddressSpace(); 1618 unsigned SrcAS = SrcTy.getAddressSpace(); 1619 1620 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1621 // vector element. 1622 assert(!DstTy.isVector()); 1623 1624 const AMDGPUTargetMachine &TM 1625 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1626 1627 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1628 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1629 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1630 return true; 1631 } 1632 1633 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1634 // Truncate. 1635 B.buildExtract(Dst, Src, 0); 1636 MI.eraseFromParent(); 1637 return true; 1638 } 1639 1640 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1641 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1642 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1643 1644 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1645 // another. Merge operands are required to be the same type, but creating an 1646 // extra ptrtoint would be kind of pointless. 1647 auto HighAddr = B.buildConstant( 1648 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1649 B.buildMerge(Dst, {Src, HighAddr}); 1650 MI.eraseFromParent(); 1651 return true; 1652 } 1653 1654 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1655 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1656 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1657 unsigned NullVal = TM.getNullPointerValue(DestAS); 1658 1659 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1660 auto FlatNull = B.buildConstant(SrcTy, 0); 1661 1662 // Extract low 32-bits of the pointer. 1663 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1664 1665 auto CmpRes = 1666 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1667 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1668 1669 MI.eraseFromParent(); 1670 return true; 1671 } 1672 1673 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1674 return false; 1675 1676 if (!ST.hasFlatAddressSpace()) 1677 return false; 1678 1679 auto SegmentNull = 1680 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1681 auto FlatNull = 1682 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1683 1684 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1685 if (!ApertureReg.isValid()) 1686 return false; 1687 1688 auto CmpRes = 1689 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1690 1691 // Coerce the type of the low half of the result so we can use merge_values. 1692 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1693 1694 // TODO: Should we allow mismatched types but matching sizes in merges to 1695 // avoid the ptrtoint? 1696 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1697 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1698 1699 MI.eraseFromParent(); 1700 return true; 1701 } 1702 1703 bool AMDGPULegalizerInfo::legalizeFrint( 1704 MachineInstr &MI, MachineRegisterInfo &MRI, 1705 MachineIRBuilder &B) const { 1706 Register Src = MI.getOperand(1).getReg(); 1707 LLT Ty = MRI.getType(Src); 1708 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1709 1710 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1711 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1712 1713 auto C1 = B.buildFConstant(Ty, C1Val); 1714 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1715 1716 // TODO: Should this propagate fast-math-flags? 1717 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1718 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1719 1720 auto C2 = B.buildFConstant(Ty, C2Val); 1721 auto Fabs = B.buildFAbs(Ty, Src); 1722 1723 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1724 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1725 return true; 1726 } 1727 1728 bool AMDGPULegalizerInfo::legalizeFceil( 1729 MachineInstr &MI, MachineRegisterInfo &MRI, 1730 MachineIRBuilder &B) const { 1731 1732 const LLT S1 = LLT::scalar(1); 1733 const LLT S64 = LLT::scalar(64); 1734 1735 Register Src = MI.getOperand(1).getReg(); 1736 assert(MRI.getType(Src) == S64); 1737 1738 // result = trunc(src) 1739 // if (src > 0.0 && src != result) 1740 // result += 1.0 1741 1742 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1743 1744 const auto Zero = B.buildFConstant(S64, 0.0); 1745 const auto One = B.buildFConstant(S64, 1.0); 1746 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1747 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1748 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1749 auto Add = B.buildSelect(S64, And, One, Zero); 1750 1751 // TODO: Should this propagate fast-math-flags? 1752 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1753 return true; 1754 } 1755 1756 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1757 MachineIRBuilder &B) { 1758 const unsigned FractBits = 52; 1759 const unsigned ExpBits = 11; 1760 LLT S32 = LLT::scalar(32); 1761 1762 auto Const0 = B.buildConstant(S32, FractBits - 32); 1763 auto Const1 = B.buildConstant(S32, ExpBits); 1764 1765 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1766 .addUse(Const0.getReg(0)) 1767 .addUse(Const1.getReg(0)); 1768 1769 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1770 } 1771 1772 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1773 MachineInstr &MI, MachineRegisterInfo &MRI, 1774 MachineIRBuilder &B) const { 1775 const LLT S1 = LLT::scalar(1); 1776 const LLT S32 = LLT::scalar(32); 1777 const LLT S64 = LLT::scalar(64); 1778 1779 Register Src = MI.getOperand(1).getReg(); 1780 assert(MRI.getType(Src) == S64); 1781 1782 // TODO: Should this use extract since the low half is unused? 1783 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1784 Register Hi = Unmerge.getReg(1); 1785 1786 // Extract the upper half, since this is where we will find the sign and 1787 // exponent. 1788 auto Exp = extractF64Exponent(Hi, B); 1789 1790 const unsigned FractBits = 52; 1791 1792 // Extract the sign bit. 1793 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1794 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1795 1796 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1797 1798 const auto Zero32 = B.buildConstant(S32, 0); 1799 1800 // Extend back to 64-bits. 1801 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1802 1803 auto Shr = B.buildAShr(S64, FractMask, Exp); 1804 auto Not = B.buildNot(S64, Shr); 1805 auto Tmp0 = B.buildAnd(S64, Src, Not); 1806 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1807 1808 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1809 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1810 1811 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1812 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1813 return true; 1814 } 1815 1816 bool AMDGPULegalizerInfo::legalizeITOFP( 1817 MachineInstr &MI, MachineRegisterInfo &MRI, 1818 MachineIRBuilder &B, bool Signed) const { 1819 1820 Register Dst = MI.getOperand(0).getReg(); 1821 Register Src = MI.getOperand(1).getReg(); 1822 1823 const LLT S64 = LLT::scalar(64); 1824 const LLT S32 = LLT::scalar(32); 1825 1826 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1827 1828 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1829 1830 auto CvtHi = Signed ? 1831 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1832 B.buildUITOFP(S64, Unmerge.getReg(1)); 1833 1834 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1835 1836 auto ThirtyTwo = B.buildConstant(S32, 32); 1837 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1838 .addUse(CvtHi.getReg(0)) 1839 .addUse(ThirtyTwo.getReg(0)); 1840 1841 // TODO: Should this propagate fast-math-flags? 1842 B.buildFAdd(Dst, LdExp, CvtLo); 1843 MI.eraseFromParent(); 1844 return true; 1845 } 1846 1847 // TODO: Copied from DAG implementation. Verify logic and document how this 1848 // actually works. 1849 bool AMDGPULegalizerInfo::legalizeFPTOI( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B, bool Signed) const { 1852 1853 Register Dst = MI.getOperand(0).getReg(); 1854 Register Src = MI.getOperand(1).getReg(); 1855 1856 const LLT S64 = LLT::scalar(64); 1857 const LLT S32 = LLT::scalar(32); 1858 1859 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1860 1861 unsigned Flags = MI.getFlags(); 1862 1863 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1864 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1865 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1866 1867 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1868 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1869 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1870 1871 auto Hi = Signed ? 1872 B.buildFPTOSI(S32, FloorMul) : 1873 B.buildFPTOUI(S32, FloorMul); 1874 auto Lo = B.buildFPTOUI(S32, Fma); 1875 1876 B.buildMerge(Dst, { Lo, Hi }); 1877 MI.eraseFromParent(); 1878 1879 return true; 1880 } 1881 1882 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1883 MachineInstr &MI) const { 1884 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1885 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1886 1887 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1888 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1889 1890 // With ieee_mode disabled, the instructions have the correct behavior 1891 // already for G_FMINNUM/G_FMAXNUM 1892 if (!MFI->getMode().IEEE) 1893 return !IsIEEEOp; 1894 1895 if (IsIEEEOp) 1896 return true; 1897 1898 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1899 } 1900 1901 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1902 MachineInstr &MI, MachineRegisterInfo &MRI, 1903 MachineIRBuilder &B) const { 1904 // TODO: Should move some of this into LegalizerHelper. 1905 1906 // TODO: Promote dynamic indexing of s16 to s32 1907 1908 // FIXME: Artifact combiner probably should have replaced the truncated 1909 // constant before this, so we shouldn't need 1910 // getConstantVRegValWithLookThrough. 1911 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1912 MI.getOperand(2).getReg(), MRI); 1913 if (!IdxVal) // Dynamic case will be selected to register indexing. 1914 return true; 1915 1916 Register Dst = MI.getOperand(0).getReg(); 1917 Register Vec = MI.getOperand(1).getReg(); 1918 1919 LLT VecTy = MRI.getType(Vec); 1920 LLT EltTy = VecTy.getElementType(); 1921 assert(EltTy == MRI.getType(Dst)); 1922 1923 if (IdxVal->Value < VecTy.getNumElements()) 1924 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1925 else 1926 B.buildUndef(Dst); 1927 1928 MI.eraseFromParent(); 1929 return true; 1930 } 1931 1932 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1933 MachineInstr &MI, MachineRegisterInfo &MRI, 1934 MachineIRBuilder &B) const { 1935 // TODO: Should move some of this into LegalizerHelper. 1936 1937 // TODO: Promote dynamic indexing of s16 to s32 1938 1939 // FIXME: Artifact combiner probably should have replaced the truncated 1940 // constant before this, so we shouldn't need 1941 // getConstantVRegValWithLookThrough. 1942 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1943 MI.getOperand(3).getReg(), MRI); 1944 if (!IdxVal) // Dynamic case will be selected to register indexing. 1945 return true; 1946 1947 Register Dst = MI.getOperand(0).getReg(); 1948 Register Vec = MI.getOperand(1).getReg(); 1949 Register Ins = MI.getOperand(2).getReg(); 1950 1951 LLT VecTy = MRI.getType(Vec); 1952 LLT EltTy = VecTy.getElementType(); 1953 assert(EltTy == MRI.getType(Ins)); 1954 1955 if (IdxVal->Value < VecTy.getNumElements()) 1956 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1957 else 1958 B.buildUndef(Dst); 1959 1960 MI.eraseFromParent(); 1961 return true; 1962 } 1963 1964 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1965 MachineInstr &MI, MachineRegisterInfo &MRI, 1966 MachineIRBuilder &B) const { 1967 const LLT V2S16 = LLT::vector(2, 16); 1968 1969 Register Dst = MI.getOperand(0).getReg(); 1970 Register Src0 = MI.getOperand(1).getReg(); 1971 LLT DstTy = MRI.getType(Dst); 1972 LLT SrcTy = MRI.getType(Src0); 1973 1974 if (SrcTy == V2S16 && DstTy == V2S16 && 1975 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1976 return true; 1977 1978 MachineIRBuilder HelperBuilder(MI); 1979 GISelObserverWrapper DummyObserver; 1980 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1981 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1982 } 1983 1984 bool AMDGPULegalizerInfo::legalizeSinCos( 1985 MachineInstr &MI, MachineRegisterInfo &MRI, 1986 MachineIRBuilder &B) const { 1987 1988 Register DstReg = MI.getOperand(0).getReg(); 1989 Register SrcReg = MI.getOperand(1).getReg(); 1990 LLT Ty = MRI.getType(DstReg); 1991 unsigned Flags = MI.getFlags(); 1992 1993 Register TrigVal; 1994 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1995 if (ST.hasTrigReducedRange()) { 1996 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1997 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1998 .addUse(MulVal.getReg(0)) 1999 .setMIFlags(Flags).getReg(0); 2000 } else 2001 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2002 2003 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2004 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2005 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2006 .addUse(TrigVal) 2007 .setMIFlags(Flags); 2008 MI.eraseFromParent(); 2009 return true; 2010 } 2011 2012 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2013 MachineIRBuilder &B, 2014 const GlobalValue *GV, 2015 int64_t Offset, 2016 unsigned GAFlags) const { 2017 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2018 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2019 // to the following code sequence: 2020 // 2021 // For constant address space: 2022 // s_getpc_b64 s[0:1] 2023 // s_add_u32 s0, s0, $symbol 2024 // s_addc_u32 s1, s1, 0 2025 // 2026 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2027 // a fixup or relocation is emitted to replace $symbol with a literal 2028 // constant, which is a pc-relative offset from the encoding of the $symbol 2029 // operand to the global variable. 2030 // 2031 // For global address space: 2032 // s_getpc_b64 s[0:1] 2033 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2034 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2035 // 2036 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2037 // fixups or relocations are emitted to replace $symbol@*@lo and 2038 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2039 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2040 // operand to the global variable. 2041 // 2042 // What we want here is an offset from the value returned by s_getpc 2043 // (which is the address of the s_add_u32 instruction) to the global 2044 // variable, but since the encoding of $symbol starts 4 bytes after the start 2045 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2046 // small. This requires us to add 4 to the global variable offset in order to 2047 // compute the correct address. 2048 2049 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2050 2051 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2052 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2053 2054 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2055 .addDef(PCReg); 2056 2057 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2058 if (GAFlags == SIInstrInfo::MO_NONE) 2059 MIB.addImm(0); 2060 else 2061 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2062 2063 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2064 2065 if (PtrTy.getSizeInBits() == 32) 2066 B.buildExtract(DstReg, PCReg, 0); 2067 return true; 2068 } 2069 2070 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2071 MachineInstr &MI, MachineRegisterInfo &MRI, 2072 MachineIRBuilder &B) const { 2073 Register DstReg = MI.getOperand(0).getReg(); 2074 LLT Ty = MRI.getType(DstReg); 2075 unsigned AS = Ty.getAddressSpace(); 2076 2077 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2078 MachineFunction &MF = B.getMF(); 2079 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2080 2081 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2082 if (!MFI->isEntryFunction()) { 2083 const Function &Fn = MF.getFunction(); 2084 DiagnosticInfoUnsupported BadLDSDecl( 2085 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2086 DS_Warning); 2087 Fn.getContext().diagnose(BadLDSDecl); 2088 2089 // We currently don't have a way to correctly allocate LDS objects that 2090 // aren't directly associated with a kernel. We do force inlining of 2091 // functions that use local objects. However, if these dead functions are 2092 // not eliminated, we don't want a compile time error. Just emit a warning 2093 // and a trap, since there should be no callable path here. 2094 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2095 B.buildUndef(DstReg); 2096 MI.eraseFromParent(); 2097 return true; 2098 } 2099 2100 // TODO: We could emit code to handle the initialization somewhere. 2101 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2102 const SITargetLowering *TLI = ST.getTargetLowering(); 2103 if (!TLI->shouldUseLDSConstAddress(GV)) { 2104 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2105 return true; // Leave in place; 2106 } 2107 2108 B.buildConstant( 2109 DstReg, 2110 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2111 MI.eraseFromParent(); 2112 return true; 2113 } 2114 2115 const Function &Fn = MF.getFunction(); 2116 DiagnosticInfoUnsupported BadInit( 2117 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2118 Fn.getContext().diagnose(BadInit); 2119 return true; 2120 } 2121 2122 const SITargetLowering *TLI = ST.getTargetLowering(); 2123 2124 if (TLI->shouldEmitFixup(GV)) { 2125 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2126 MI.eraseFromParent(); 2127 return true; 2128 } 2129 2130 if (TLI->shouldEmitPCReloc(GV)) { 2131 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2132 MI.eraseFromParent(); 2133 return true; 2134 } 2135 2136 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2137 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2138 2139 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2140 MachinePointerInfo::getGOT(MF), 2141 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2142 MachineMemOperand::MOInvariant, 2143 8 /*Size*/, Align(8)); 2144 2145 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2146 2147 if (Ty.getSizeInBits() == 32) { 2148 // Truncate if this is a 32-bit constant adrdess. 2149 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2150 B.buildExtract(DstReg, Load, 0); 2151 } else 2152 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2153 2154 MI.eraseFromParent(); 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeLoad( 2159 MachineInstr &MI, MachineRegisterInfo &MRI, 2160 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2161 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2162 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2163 Observer.changingInstr(MI); 2164 MI.getOperand(1).setReg(Cast.getReg(0)); 2165 Observer.changedInstr(MI); 2166 return true; 2167 } 2168 2169 bool AMDGPULegalizerInfo::legalizeFMad( 2170 MachineInstr &MI, MachineRegisterInfo &MRI, 2171 MachineIRBuilder &B) const { 2172 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2173 assert(Ty.isScalar()); 2174 2175 MachineFunction &MF = B.getMF(); 2176 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2177 2178 // TODO: Always legal with future ftz flag. 2179 // FIXME: Do we need just output? 2180 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2181 return true; 2182 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2183 return true; 2184 2185 MachineIRBuilder HelperBuilder(MI); 2186 GISelObserverWrapper DummyObserver; 2187 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2188 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2189 } 2190 2191 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2192 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2193 Register DstReg = MI.getOperand(0).getReg(); 2194 Register PtrReg = MI.getOperand(1).getReg(); 2195 Register CmpVal = MI.getOperand(2).getReg(); 2196 Register NewVal = MI.getOperand(3).getReg(); 2197 2198 assert(SITargetLowering::isFlatGlobalAddrSpace( 2199 MRI.getType(PtrReg).getAddressSpace()) && 2200 "this should not have been custom lowered"); 2201 2202 LLT ValTy = MRI.getType(CmpVal); 2203 LLT VecTy = LLT::vector(2, ValTy); 2204 2205 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2206 2207 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2208 .addDef(DstReg) 2209 .addUse(PtrReg) 2210 .addUse(PackedVal) 2211 .setMemRefs(MI.memoperands()); 2212 2213 MI.eraseFromParent(); 2214 return true; 2215 } 2216 2217 bool AMDGPULegalizerInfo::legalizeFlog( 2218 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2219 Register Dst = MI.getOperand(0).getReg(); 2220 Register Src = MI.getOperand(1).getReg(); 2221 LLT Ty = B.getMRI()->getType(Dst); 2222 unsigned Flags = MI.getFlags(); 2223 2224 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2225 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2226 2227 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2228 MI.eraseFromParent(); 2229 return true; 2230 } 2231 2232 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2233 MachineIRBuilder &B) const { 2234 Register Dst = MI.getOperand(0).getReg(); 2235 Register Src = MI.getOperand(1).getReg(); 2236 unsigned Flags = MI.getFlags(); 2237 LLT Ty = B.getMRI()->getType(Dst); 2238 2239 auto K = B.buildFConstant(Ty, numbers::log2e); 2240 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2241 B.buildFExp2(Dst, Mul, Flags); 2242 MI.eraseFromParent(); 2243 return true; 2244 } 2245 2246 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2247 MachineIRBuilder &B) const { 2248 Register Dst = MI.getOperand(0).getReg(); 2249 Register Src0 = MI.getOperand(1).getReg(); 2250 Register Src1 = MI.getOperand(2).getReg(); 2251 unsigned Flags = MI.getFlags(); 2252 LLT Ty = B.getMRI()->getType(Dst); 2253 const LLT S16 = LLT::scalar(16); 2254 const LLT S32 = LLT::scalar(32); 2255 2256 if (Ty == S32) { 2257 auto Log = B.buildFLog2(S32, Src0, Flags); 2258 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2259 .addUse(Log.getReg(0)) 2260 .addUse(Src1) 2261 .setMIFlags(Flags); 2262 B.buildFExp2(Dst, Mul, Flags); 2263 } else if (Ty == S16) { 2264 // There's no f16 fmul_legacy, so we need to convert for it. 2265 auto Log = B.buildFLog2(S16, Src0, Flags); 2266 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2267 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2268 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2269 .addUse(Ext0.getReg(0)) 2270 .addUse(Ext1.getReg(0)) 2271 .setMIFlags(Flags); 2272 2273 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2274 } else 2275 return false; 2276 2277 MI.eraseFromParent(); 2278 return true; 2279 } 2280 2281 // Find a source register, ignoring any possible source modifiers. 2282 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2283 Register ModSrc = OrigSrc; 2284 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2285 ModSrc = SrcFNeg->getOperand(1).getReg(); 2286 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2287 ModSrc = SrcFAbs->getOperand(1).getReg(); 2288 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2289 ModSrc = SrcFAbs->getOperand(1).getReg(); 2290 return ModSrc; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2294 MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 2297 const LLT S1 = LLT::scalar(1); 2298 const LLT S64 = LLT::scalar(64); 2299 Register Dst = MI.getOperand(0).getReg(); 2300 Register OrigSrc = MI.getOperand(1).getReg(); 2301 unsigned Flags = MI.getFlags(); 2302 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2303 "this should not have been custom lowered"); 2304 2305 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2306 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2307 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2308 // V_FRACT bug is: 2309 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2310 // 2311 // Convert floor(x) to (x - fract(x)) 2312 2313 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2314 .addUse(OrigSrc) 2315 .setMIFlags(Flags); 2316 2317 // Give source modifier matching some assistance before obscuring a foldable 2318 // pattern. 2319 2320 // TODO: We can avoid the neg on the fract? The input sign to fract 2321 // shouldn't matter? 2322 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2323 2324 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2325 2326 Register Min = MRI.createGenericVirtualRegister(S64); 2327 2328 // We don't need to concern ourselves with the snan handling difference, so 2329 // use the one which will directly select. 2330 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2331 if (MFI->getMode().IEEE) 2332 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2333 else 2334 B.buildFMinNum(Min, Fract, Const, Flags); 2335 2336 Register CorrectedFract = Min; 2337 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2338 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2339 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2340 } 2341 2342 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2343 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2344 2345 MI.eraseFromParent(); 2346 return true; 2347 } 2348 2349 // Turn an illegal packed v2s16 build vector into bit operations. 2350 // TODO: This should probably be a bitcast action in LegalizerHelper. 2351 bool AMDGPULegalizerInfo::legalizeBuildVector( 2352 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2353 Register Dst = MI.getOperand(0).getReg(); 2354 const LLT S32 = LLT::scalar(32); 2355 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2356 2357 Register Src0 = MI.getOperand(1).getReg(); 2358 Register Src1 = MI.getOperand(2).getReg(); 2359 assert(MRI.getType(Src0) == LLT::scalar(16)); 2360 2361 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2362 B.buildBitcast(Dst, Merge); 2363 2364 MI.eraseFromParent(); 2365 return true; 2366 } 2367 2368 // Return the use branch instruction, otherwise null if the usage is invalid. 2369 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2370 MachineRegisterInfo &MRI, 2371 MachineInstr *&Br, 2372 MachineBasicBlock *&UncondBrTarget) { 2373 Register CondDef = MI.getOperand(0).getReg(); 2374 if (!MRI.hasOneNonDBGUse(CondDef)) 2375 return nullptr; 2376 2377 MachineBasicBlock *Parent = MI.getParent(); 2378 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2379 if (UseMI.getParent() != Parent || 2380 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2381 return nullptr; 2382 2383 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2384 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2385 if (Next == Parent->end()) { 2386 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2387 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2388 return nullptr; 2389 UncondBrTarget = &*NextMBB; 2390 } else { 2391 if (Next->getOpcode() != AMDGPU::G_BR) 2392 return nullptr; 2393 Br = &*Next; 2394 UncondBrTarget = Br->getOperand(0).getMBB(); 2395 } 2396 2397 return &UseMI; 2398 } 2399 2400 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2401 MachineRegisterInfo &MRI, 2402 Register LiveIn, 2403 Register PhyReg) const { 2404 assert(PhyReg.isPhysical() && "Physical register expected"); 2405 2406 // Insert the live-in copy, if required, by defining destination virtual 2407 // register. 2408 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2409 if (!MRI.getVRegDef(LiveIn)) { 2410 // FIXME: Should have scoped insert pt 2411 MachineBasicBlock &OrigInsBB = B.getMBB(); 2412 auto OrigInsPt = B.getInsertPt(); 2413 2414 MachineBasicBlock &EntryMBB = B.getMF().front(); 2415 EntryMBB.addLiveIn(PhyReg); 2416 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2417 B.buildCopy(LiveIn, PhyReg); 2418 2419 B.setInsertPt(OrigInsBB, OrigInsPt); 2420 } 2421 2422 return LiveIn; 2423 } 2424 2425 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2426 MachineRegisterInfo &MRI, 2427 Register PhyReg, LLT Ty, 2428 bool InsertLiveInCopy) const { 2429 assert(PhyReg.isPhysical() && "Physical register expected"); 2430 2431 // Get or create virtual live-in regester 2432 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2433 if (!LiveIn) { 2434 LiveIn = MRI.createGenericVirtualRegister(Ty); 2435 MRI.addLiveIn(PhyReg, LiveIn); 2436 } 2437 2438 // When the actual true copy required is from virtual register to physical 2439 // register (to be inserted later), live-in copy insertion from physical 2440 // to register virtual register is not required 2441 if (!InsertLiveInCopy) 2442 return LiveIn; 2443 2444 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2445 } 2446 2447 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2448 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2449 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2450 const ArgDescriptor *Arg; 2451 const TargetRegisterClass *RC; 2452 LLT ArgTy; 2453 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); 2454 if (!Arg) { 2455 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2456 return nullptr; 2457 } 2458 return Arg; 2459 } 2460 2461 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2462 const ArgDescriptor *Arg) const { 2463 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2464 return false; // TODO: Handle these 2465 2466 Register SrcReg = Arg->getRegister(); 2467 assert(SrcReg.isPhysical() && "Physical register expected"); 2468 assert(DstReg.isVirtual() && "Virtual register expected"); 2469 2470 MachineRegisterInfo &MRI = *B.getMRI(); 2471 2472 LLT Ty = MRI.getType(DstReg); 2473 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2474 2475 if (Arg->isMasked()) { 2476 // TODO: Should we try to emit this once in the entry block? 2477 const LLT S32 = LLT::scalar(32); 2478 const unsigned Mask = Arg->getMask(); 2479 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2480 2481 Register AndMaskSrc = LiveIn; 2482 2483 if (Shift != 0) { 2484 auto ShiftAmt = B.buildConstant(S32, Shift); 2485 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2486 } 2487 2488 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2489 } else { 2490 B.buildCopy(DstReg, LiveIn); 2491 } 2492 2493 return true; 2494 } 2495 2496 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2497 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2498 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2499 2500 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2501 if (!Arg) 2502 return false; 2503 2504 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2505 return false; 2506 2507 MI.eraseFromParent(); 2508 return true; 2509 } 2510 2511 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2512 MachineRegisterInfo &MRI, 2513 MachineIRBuilder &B) const { 2514 Register Dst = MI.getOperand(0).getReg(); 2515 LLT DstTy = MRI.getType(Dst); 2516 LLT S16 = LLT::scalar(16); 2517 LLT S32 = LLT::scalar(32); 2518 LLT S64 = LLT::scalar(64); 2519 2520 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2521 return true; 2522 2523 if (DstTy == S16) 2524 return legalizeFDIV16(MI, MRI, B); 2525 if (DstTy == S32) 2526 return legalizeFDIV32(MI, MRI, B); 2527 if (DstTy == S64) 2528 return legalizeFDIV64(MI, MRI, B); 2529 2530 return false; 2531 } 2532 2533 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2534 Register DstReg, 2535 Register X, 2536 Register Y, 2537 bool IsDiv) const { 2538 const LLT S1 = LLT::scalar(1); 2539 const LLT S32 = LLT::scalar(32); 2540 2541 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2542 // algorithm used here. 2543 2544 // Initial estimate of inv(y). 2545 auto FloatY = B.buildUITOFP(S32, Y); 2546 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2547 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2548 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2549 auto Z = B.buildFPTOUI(S32, ScaledY); 2550 2551 // One round of UNR. 2552 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2553 auto NegYZ = B.buildMul(S32, NegY, Z); 2554 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2555 2556 // Quotient/remainder estimate. 2557 auto Q = B.buildUMulH(S32, X, Z); 2558 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2559 2560 // First quotient/remainder refinement. 2561 auto One = B.buildConstant(S32, 1); 2562 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2563 if (IsDiv) 2564 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2565 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2566 2567 // Second quotient/remainder refinement. 2568 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2569 if (IsDiv) 2570 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2571 else 2572 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2573 } 2574 2575 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2576 MachineRegisterInfo &MRI, 2577 MachineIRBuilder &B) const { 2578 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2579 Register DstReg = MI.getOperand(0).getReg(); 2580 Register Num = MI.getOperand(1).getReg(); 2581 Register Den = MI.getOperand(2).getReg(); 2582 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2583 MI.eraseFromParent(); 2584 return true; 2585 } 2586 2587 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2588 // 2589 // Return lo, hi of result 2590 // 2591 // %cvt.lo = G_UITOFP Val.lo 2592 // %cvt.hi = G_UITOFP Val.hi 2593 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2594 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2595 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2596 // %mul2 = G_FMUL %mul1, 2**(-32) 2597 // %trunc = G_INTRINSIC_TRUNC %mul2 2598 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2599 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2600 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2601 Register Val) { 2602 const LLT S32 = LLT::scalar(32); 2603 auto Unmerge = B.buildUnmerge(S32, Val); 2604 2605 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2606 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2607 2608 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2609 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2610 2611 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2612 auto Mul1 = 2613 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2614 2615 // 2**(-32) 2616 auto Mul2 = 2617 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2618 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2619 2620 // -(2**32) 2621 auto Mad2 = B.buildFMAD(S32, Trunc, 2622 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2623 2624 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2625 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2626 2627 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2628 } 2629 2630 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2631 Register DstReg, 2632 Register Numer, 2633 Register Denom, 2634 bool IsDiv) const { 2635 const LLT S32 = LLT::scalar(32); 2636 const LLT S64 = LLT::scalar(64); 2637 const LLT S1 = LLT::scalar(1); 2638 Register RcpLo, RcpHi; 2639 2640 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2641 2642 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2643 2644 auto Zero64 = B.buildConstant(S64, 0); 2645 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2646 2647 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2648 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2649 2650 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2651 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2652 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2653 2654 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2655 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2656 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2657 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2658 2659 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2660 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2661 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2662 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2663 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2664 2665 auto Zero32 = B.buildConstant(S32, 0); 2666 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2667 auto Add2_HiC = 2668 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2669 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2670 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2671 2672 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2673 Register NumerLo = UnmergeNumer.getReg(0); 2674 Register NumerHi = UnmergeNumer.getReg(1); 2675 2676 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2677 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2678 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2679 Register Mul3_Lo = UnmergeMul3.getReg(0); 2680 Register Mul3_Hi = UnmergeMul3.getReg(1); 2681 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2682 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2683 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2684 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2685 2686 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2687 Register DenomLo = UnmergeDenom.getReg(0); 2688 Register DenomHi = UnmergeDenom.getReg(1); 2689 2690 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2691 auto C1 = B.buildSExt(S32, CmpHi); 2692 2693 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2694 auto C2 = B.buildSExt(S32, CmpLo); 2695 2696 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2697 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2698 2699 // TODO: Here and below portions of the code can be enclosed into if/endif. 2700 // Currently control flow is unconditional and we have 4 selects after 2701 // potential endif to substitute PHIs. 2702 2703 // if C3 != 0 ... 2704 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2705 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2706 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2707 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2708 2709 auto One64 = B.buildConstant(S64, 1); 2710 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2711 2712 auto C4 = 2713 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2714 auto C5 = 2715 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2716 auto C6 = B.buildSelect( 2717 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2718 2719 // if (C6 != 0) 2720 auto Add4 = B.buildAdd(S64, Add3, One64); 2721 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2722 2723 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2724 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2725 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2726 2727 // endif C6 2728 // endif C3 2729 2730 if (IsDiv) { 2731 auto Sel1 = B.buildSelect( 2732 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2733 B.buildSelect(DstReg, 2734 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2735 } else { 2736 auto Sel2 = B.buildSelect( 2737 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2738 B.buildSelect(DstReg, 2739 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2740 } 2741 } 2742 2743 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2744 MachineRegisterInfo &MRI, 2745 MachineIRBuilder &B) const { 2746 const LLT S64 = LLT::scalar(64); 2747 const LLT S32 = LLT::scalar(32); 2748 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2749 Register DstReg = MI.getOperand(0).getReg(); 2750 Register Num = MI.getOperand(1).getReg(); 2751 Register Den = MI.getOperand(2).getReg(); 2752 LLT Ty = MRI.getType(DstReg); 2753 2754 if (Ty == S32) 2755 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2756 else if (Ty == S64) 2757 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2758 else 2759 return false; 2760 2761 MI.eraseFromParent(); 2762 return true; 2763 2764 } 2765 2766 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2767 MachineRegisterInfo &MRI, 2768 MachineIRBuilder &B) const { 2769 const LLT S64 = LLT::scalar(64); 2770 const LLT S32 = LLT::scalar(32); 2771 2772 Register DstReg = MI.getOperand(0).getReg(); 2773 const LLT Ty = MRI.getType(DstReg); 2774 if (Ty != S32 && Ty != S64) 2775 return false; 2776 2777 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2778 2779 Register LHS = MI.getOperand(1).getReg(); 2780 Register RHS = MI.getOperand(2).getReg(); 2781 2782 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2783 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2784 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2785 2786 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2787 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2788 2789 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2790 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2791 2792 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2793 if (Ty == S32) 2794 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2795 else 2796 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2797 2798 Register Sign; 2799 if (IsDiv) 2800 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2801 else 2802 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2803 2804 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2805 B.buildSub(DstReg, UDivRem, Sign); 2806 2807 MI.eraseFromParent(); 2808 return true; 2809 } 2810 2811 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2812 MachineRegisterInfo &MRI, 2813 MachineIRBuilder &B) const { 2814 Register Res = MI.getOperand(0).getReg(); 2815 Register LHS = MI.getOperand(1).getReg(); 2816 Register RHS = MI.getOperand(2).getReg(); 2817 2818 uint16_t Flags = MI.getFlags(); 2819 2820 LLT ResTy = MRI.getType(Res); 2821 LLT S32 = LLT::scalar(32); 2822 LLT S64 = LLT::scalar(64); 2823 2824 const MachineFunction &MF = B.getMF(); 2825 bool Unsafe = 2826 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2827 2828 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2829 return false; 2830 2831 if (!Unsafe && ResTy == S32 && 2832 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2833 return false; 2834 2835 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2836 // 1 / x -> RCP(x) 2837 if (CLHS->isExactlyValue(1.0)) { 2838 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2839 .addUse(RHS) 2840 .setMIFlags(Flags); 2841 2842 MI.eraseFromParent(); 2843 return true; 2844 } 2845 2846 // -1 / x -> RCP( FNEG(x) ) 2847 if (CLHS->isExactlyValue(-1.0)) { 2848 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2849 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2850 .addUse(FNeg.getReg(0)) 2851 .setMIFlags(Flags); 2852 2853 MI.eraseFromParent(); 2854 return true; 2855 } 2856 } 2857 2858 // x / y -> x * (1.0 / y) 2859 if (Unsafe) { 2860 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2861 .addUse(RHS) 2862 .setMIFlags(Flags); 2863 B.buildFMul(Res, LHS, RCP, Flags); 2864 2865 MI.eraseFromParent(); 2866 return true; 2867 } 2868 2869 return false; 2870 } 2871 2872 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2873 MachineRegisterInfo &MRI, 2874 MachineIRBuilder &B) const { 2875 Register Res = MI.getOperand(0).getReg(); 2876 Register LHS = MI.getOperand(1).getReg(); 2877 Register RHS = MI.getOperand(2).getReg(); 2878 2879 uint16_t Flags = MI.getFlags(); 2880 2881 LLT S16 = LLT::scalar(16); 2882 LLT S32 = LLT::scalar(32); 2883 2884 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2885 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2886 2887 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2888 .addUse(RHSExt.getReg(0)) 2889 .setMIFlags(Flags); 2890 2891 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2892 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2893 2894 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2895 .addUse(RDst.getReg(0)) 2896 .addUse(RHS) 2897 .addUse(LHS) 2898 .setMIFlags(Flags); 2899 2900 MI.eraseFromParent(); 2901 return true; 2902 } 2903 2904 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2905 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2906 static void toggleSPDenormMode(bool Enable, 2907 MachineIRBuilder &B, 2908 const GCNSubtarget &ST, 2909 AMDGPU::SIModeRegisterDefaults Mode) { 2910 // Set SP denorm mode to this value. 2911 unsigned SPDenormMode = 2912 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2913 2914 if (ST.hasDenormModeInst()) { 2915 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2916 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2917 2918 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2919 B.buildInstr(AMDGPU::S_DENORM_MODE) 2920 .addImm(NewDenormModeValue); 2921 2922 } else { 2923 // Select FP32 bit field in mode register. 2924 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2925 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2926 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2927 2928 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2929 .addImm(SPDenormMode) 2930 .addImm(SPDenormModeBitField); 2931 } 2932 } 2933 2934 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2935 MachineRegisterInfo &MRI, 2936 MachineIRBuilder &B) const { 2937 Register Res = MI.getOperand(0).getReg(); 2938 Register LHS = MI.getOperand(1).getReg(); 2939 Register RHS = MI.getOperand(2).getReg(); 2940 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2941 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2942 2943 uint16_t Flags = MI.getFlags(); 2944 2945 LLT S32 = LLT::scalar(32); 2946 LLT S1 = LLT::scalar(1); 2947 2948 auto One = B.buildFConstant(S32, 1.0f); 2949 2950 auto DenominatorScaled = 2951 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2952 .addUse(LHS) 2953 .addUse(RHS) 2954 .addImm(0) 2955 .setMIFlags(Flags); 2956 auto NumeratorScaled = 2957 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2958 .addUse(LHS) 2959 .addUse(RHS) 2960 .addImm(1) 2961 .setMIFlags(Flags); 2962 2963 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2964 .addUse(DenominatorScaled.getReg(0)) 2965 .setMIFlags(Flags); 2966 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2967 2968 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2969 // aren't modeled as reading it. 2970 if (!Mode.allFP32Denormals()) 2971 toggleSPDenormMode(true, B, ST, Mode); 2972 2973 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2974 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2975 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2976 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2977 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2978 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2979 2980 if (!Mode.allFP32Denormals()) 2981 toggleSPDenormMode(false, B, ST, Mode); 2982 2983 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2984 .addUse(Fma4.getReg(0)) 2985 .addUse(Fma1.getReg(0)) 2986 .addUse(Fma3.getReg(0)) 2987 .addUse(NumeratorScaled.getReg(1)) 2988 .setMIFlags(Flags); 2989 2990 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2991 .addUse(Fmas.getReg(0)) 2992 .addUse(RHS) 2993 .addUse(LHS) 2994 .setMIFlags(Flags); 2995 2996 MI.eraseFromParent(); 2997 return true; 2998 } 2999 3000 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3001 MachineRegisterInfo &MRI, 3002 MachineIRBuilder &B) const { 3003 Register Res = MI.getOperand(0).getReg(); 3004 Register LHS = MI.getOperand(1).getReg(); 3005 Register RHS = MI.getOperand(2).getReg(); 3006 3007 uint16_t Flags = MI.getFlags(); 3008 3009 LLT S64 = LLT::scalar(64); 3010 LLT S1 = LLT::scalar(1); 3011 3012 auto One = B.buildFConstant(S64, 1.0); 3013 3014 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3015 .addUse(LHS) 3016 .addUse(RHS) 3017 .addImm(0) 3018 .setMIFlags(Flags); 3019 3020 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3021 3022 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3023 .addUse(DivScale0.getReg(0)) 3024 .setMIFlags(Flags); 3025 3026 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3027 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3028 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3029 3030 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3031 .addUse(LHS) 3032 .addUse(RHS) 3033 .addImm(1) 3034 .setMIFlags(Flags); 3035 3036 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3037 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3038 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3039 3040 Register Scale; 3041 if (!ST.hasUsableDivScaleConditionOutput()) { 3042 // Workaround a hardware bug on SI where the condition output from div_scale 3043 // is not usable. 3044 3045 LLT S32 = LLT::scalar(32); 3046 3047 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3048 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3049 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3050 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3051 3052 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3053 Scale1Unmerge.getReg(1)); 3054 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3055 Scale0Unmerge.getReg(1)); 3056 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3057 } else { 3058 Scale = DivScale1.getReg(1); 3059 } 3060 3061 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3062 .addUse(Fma4.getReg(0)) 3063 .addUse(Fma3.getReg(0)) 3064 .addUse(Mul.getReg(0)) 3065 .addUse(Scale) 3066 .setMIFlags(Flags); 3067 3068 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3069 .addUse(Fmas.getReg(0)) 3070 .addUse(RHS) 3071 .addUse(LHS) 3072 .setMIFlags(Flags); 3073 3074 MI.eraseFromParent(); 3075 return true; 3076 } 3077 3078 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3079 MachineRegisterInfo &MRI, 3080 MachineIRBuilder &B) const { 3081 Register Res = MI.getOperand(0).getReg(); 3082 Register LHS = MI.getOperand(2).getReg(); 3083 Register RHS = MI.getOperand(3).getReg(); 3084 uint16_t Flags = MI.getFlags(); 3085 3086 LLT S32 = LLT::scalar(32); 3087 LLT S1 = LLT::scalar(1); 3088 3089 auto Abs = B.buildFAbs(S32, RHS, Flags); 3090 const APFloat C0Val(1.0f); 3091 3092 auto C0 = B.buildConstant(S32, 0x6f800000); 3093 auto C1 = B.buildConstant(S32, 0x2f800000); 3094 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3095 3096 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3097 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3098 3099 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3100 3101 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3102 .addUse(Mul0.getReg(0)) 3103 .setMIFlags(Flags); 3104 3105 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3106 3107 B.buildFMul(Res, Sel, Mul1, Flags); 3108 3109 MI.eraseFromParent(); 3110 return true; 3111 } 3112 3113 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3114 MachineRegisterInfo &MRI, 3115 MachineIRBuilder &B) const { 3116 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3117 if (!MFI->isEntryFunction()) { 3118 return legalizePreloadedArgIntrin(MI, MRI, B, 3119 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3120 } 3121 3122 uint64_t Offset = 3123 ST.getTargetLowering()->getImplicitParameterOffset( 3124 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3125 Register DstReg = MI.getOperand(0).getReg(); 3126 LLT DstTy = MRI.getType(DstReg); 3127 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3128 3129 const ArgDescriptor *Arg; 3130 const TargetRegisterClass *RC; 3131 LLT ArgTy; 3132 std::tie(Arg, RC, ArgTy) = 3133 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3134 if (!Arg) 3135 return false; 3136 3137 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3138 if (!loadInputValue(KernargPtrReg, B, Arg)) 3139 return false; 3140 3141 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3142 MI.eraseFromParent(); 3143 return true; 3144 } 3145 3146 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3147 MachineRegisterInfo &MRI, 3148 MachineIRBuilder &B, 3149 unsigned AddrSpace) const { 3150 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3151 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3152 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3153 MI.eraseFromParent(); 3154 return true; 3155 } 3156 3157 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3158 // offset (the offset that is included in bounds checking and swizzling, to be 3159 // split between the instruction's voffset and immoffset fields) and soffset 3160 // (the offset that is excluded from bounds checking and swizzling, to go in 3161 // the instruction's soffset field). This function takes the first kind of 3162 // offset and figures out how to split it between voffset and immoffset. 3163 std::tuple<Register, unsigned, unsigned> 3164 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3165 Register OrigOffset) const { 3166 const unsigned MaxImm = 4095; 3167 Register BaseReg; 3168 unsigned TotalConstOffset; 3169 MachineInstr *OffsetDef; 3170 const LLT S32 = LLT::scalar(32); 3171 3172 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3173 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3174 3175 unsigned ImmOffset = TotalConstOffset; 3176 3177 // If the immediate value is too big for the immoffset field, put the value 3178 // and -4096 into the immoffset field so that the value that is copied/added 3179 // for the voffset field is a multiple of 4096, and it stands more chance 3180 // of being CSEd with the copy/add for another similar load/store. 3181 // However, do not do that rounding down to a multiple of 4096 if that is a 3182 // negative number, as it appears to be illegal to have a negative offset 3183 // in the vgpr, even if adding the immediate offset makes it positive. 3184 unsigned Overflow = ImmOffset & ~MaxImm; 3185 ImmOffset -= Overflow; 3186 if ((int32_t)Overflow < 0) { 3187 Overflow += ImmOffset; 3188 ImmOffset = 0; 3189 } 3190 3191 if (Overflow != 0) { 3192 if (!BaseReg) { 3193 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3194 } else { 3195 auto OverflowVal = B.buildConstant(S32, Overflow); 3196 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3197 } 3198 } 3199 3200 if (!BaseReg) 3201 BaseReg = B.buildConstant(S32, 0).getReg(0); 3202 3203 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3204 } 3205 3206 /// Handle register layout difference for f16 images for some subtargets. 3207 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3208 MachineRegisterInfo &MRI, 3209 Register Reg) const { 3210 if (!ST.hasUnpackedD16VMem()) 3211 return Reg; 3212 3213 const LLT S16 = LLT::scalar(16); 3214 const LLT S32 = LLT::scalar(32); 3215 LLT StoreVT = MRI.getType(Reg); 3216 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3217 3218 auto Unmerge = B.buildUnmerge(S16, Reg); 3219 3220 SmallVector<Register, 4> WideRegs; 3221 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3222 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3223 3224 int NumElts = StoreVT.getNumElements(); 3225 3226 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3227 } 3228 3229 Register AMDGPULegalizerInfo::fixStoreSourceType( 3230 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3231 MachineRegisterInfo *MRI = B.getMRI(); 3232 LLT Ty = MRI->getType(VData); 3233 3234 const LLT S16 = LLT::scalar(16); 3235 3236 // Fixup illegal register types for i8 stores. 3237 if (Ty == LLT::scalar(8) || Ty == S16) { 3238 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3239 return AnyExt; 3240 } 3241 3242 if (Ty.isVector()) { 3243 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3244 if (IsFormat) 3245 return handleD16VData(B, *MRI, VData); 3246 } 3247 } 3248 3249 return VData; 3250 } 3251 3252 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3253 MachineRegisterInfo &MRI, 3254 MachineIRBuilder &B, 3255 bool IsTyped, 3256 bool IsFormat) const { 3257 Register VData = MI.getOperand(1).getReg(); 3258 LLT Ty = MRI.getType(VData); 3259 LLT EltTy = Ty.getScalarType(); 3260 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3261 const LLT S32 = LLT::scalar(32); 3262 3263 VData = fixStoreSourceType(B, VData, IsFormat); 3264 Register RSrc = MI.getOperand(2).getReg(); 3265 3266 MachineMemOperand *MMO = *MI.memoperands_begin(); 3267 const int MemSize = MMO->getSize(); 3268 3269 unsigned ImmOffset; 3270 unsigned TotalOffset; 3271 3272 // The typed intrinsics add an immediate after the registers. 3273 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3274 3275 // The struct intrinsic variants add one additional operand over raw. 3276 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3277 Register VIndex; 3278 int OpOffset = 0; 3279 if (HasVIndex) { 3280 VIndex = MI.getOperand(3).getReg(); 3281 OpOffset = 1; 3282 } 3283 3284 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3285 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3286 3287 unsigned Format = 0; 3288 if (IsTyped) { 3289 Format = MI.getOperand(5 + OpOffset).getImm(); 3290 ++OpOffset; 3291 } 3292 3293 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3294 3295 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3296 if (TotalOffset != 0) 3297 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3298 3299 unsigned Opc; 3300 if (IsTyped) { 3301 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3302 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3303 } else if (IsFormat) { 3304 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3305 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3306 } else { 3307 switch (MemSize) { 3308 case 1: 3309 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3310 break; 3311 case 2: 3312 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3313 break; 3314 default: 3315 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3316 break; 3317 } 3318 } 3319 3320 if (!VIndex) 3321 VIndex = B.buildConstant(S32, 0).getReg(0); 3322 3323 auto MIB = B.buildInstr(Opc) 3324 .addUse(VData) // vdata 3325 .addUse(RSrc) // rsrc 3326 .addUse(VIndex) // vindex 3327 .addUse(VOffset) // voffset 3328 .addUse(SOffset) // soffset 3329 .addImm(ImmOffset); // offset(imm) 3330 3331 if (IsTyped) 3332 MIB.addImm(Format); 3333 3334 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3335 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3336 .addMemOperand(MMO); 3337 3338 MI.eraseFromParent(); 3339 return true; 3340 } 3341 3342 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3343 MachineRegisterInfo &MRI, 3344 MachineIRBuilder &B, 3345 bool IsFormat, 3346 bool IsTyped) const { 3347 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3348 MachineMemOperand *MMO = *MI.memoperands_begin(); 3349 const int MemSize = MMO->getSize(); 3350 const LLT S32 = LLT::scalar(32); 3351 3352 Register Dst = MI.getOperand(0).getReg(); 3353 Register RSrc = MI.getOperand(2).getReg(); 3354 3355 // The typed intrinsics add an immediate after the registers. 3356 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3357 3358 // The struct intrinsic variants add one additional operand over raw. 3359 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3360 Register VIndex; 3361 int OpOffset = 0; 3362 if (HasVIndex) { 3363 VIndex = MI.getOperand(3).getReg(); 3364 OpOffset = 1; 3365 } 3366 3367 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3368 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3369 3370 unsigned Format = 0; 3371 if (IsTyped) { 3372 Format = MI.getOperand(5 + OpOffset).getImm(); 3373 ++OpOffset; 3374 } 3375 3376 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3377 unsigned ImmOffset; 3378 unsigned TotalOffset; 3379 3380 LLT Ty = MRI.getType(Dst); 3381 LLT EltTy = Ty.getScalarType(); 3382 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3383 const bool Unpacked = ST.hasUnpackedD16VMem(); 3384 3385 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3386 if (TotalOffset != 0) 3387 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3388 3389 unsigned Opc; 3390 3391 if (IsTyped) { 3392 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3393 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3394 } else if (IsFormat) { 3395 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3396 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3397 } else { 3398 switch (MemSize) { 3399 case 1: 3400 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3401 break; 3402 case 2: 3403 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3404 break; 3405 default: 3406 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3407 break; 3408 } 3409 } 3410 3411 Register LoadDstReg; 3412 3413 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3414 LLT UnpackedTy = Ty.changeElementSize(32); 3415 3416 if (IsExtLoad) 3417 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3418 else if (Unpacked && IsD16 && Ty.isVector()) 3419 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3420 else 3421 LoadDstReg = Dst; 3422 3423 if (!VIndex) 3424 VIndex = B.buildConstant(S32, 0).getReg(0); 3425 3426 auto MIB = B.buildInstr(Opc) 3427 .addDef(LoadDstReg) // vdata 3428 .addUse(RSrc) // rsrc 3429 .addUse(VIndex) // vindex 3430 .addUse(VOffset) // voffset 3431 .addUse(SOffset) // soffset 3432 .addImm(ImmOffset); // offset(imm) 3433 3434 if (IsTyped) 3435 MIB.addImm(Format); 3436 3437 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3438 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3439 .addMemOperand(MMO); 3440 3441 if (LoadDstReg != Dst) { 3442 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3443 3444 // Widen result for extending loads was widened. 3445 if (IsExtLoad) 3446 B.buildTrunc(Dst, LoadDstReg); 3447 else { 3448 // Repack to original 16-bit vector result 3449 // FIXME: G_TRUNC should work, but legalization currently fails 3450 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3451 SmallVector<Register, 4> Repack; 3452 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3453 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3454 B.buildMerge(Dst, Repack); 3455 } 3456 } 3457 3458 MI.eraseFromParent(); 3459 return true; 3460 } 3461 3462 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3463 MachineIRBuilder &B, 3464 bool IsInc) const { 3465 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3466 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3467 B.buildInstr(Opc) 3468 .addDef(MI.getOperand(0).getReg()) 3469 .addUse(MI.getOperand(2).getReg()) 3470 .addUse(MI.getOperand(3).getReg()) 3471 .cloneMemRefs(MI); 3472 MI.eraseFromParent(); 3473 return true; 3474 } 3475 3476 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3477 switch (IntrID) { 3478 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3479 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3481 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3482 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3483 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3484 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3485 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3486 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3487 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3488 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3489 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3490 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3491 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3492 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3493 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3494 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3496 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3497 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3498 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3499 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3500 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3501 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3502 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3503 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3504 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3505 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3506 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3507 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3508 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3509 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3511 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3512 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3513 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3514 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3515 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3516 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3517 default: 3518 llvm_unreachable("unhandled atomic opcode"); 3519 } 3520 } 3521 3522 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3523 MachineIRBuilder &B, 3524 Intrinsic::ID IID) const { 3525 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3526 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3527 3528 Register Dst = MI.getOperand(0).getReg(); 3529 Register VData = MI.getOperand(2).getReg(); 3530 3531 Register CmpVal; 3532 int OpOffset = 0; 3533 3534 if (IsCmpSwap) { 3535 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3536 ++OpOffset; 3537 } 3538 3539 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3540 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3541 3542 // The struct intrinsic variants add one additional operand over raw. 3543 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3544 Register VIndex; 3545 if (HasVIndex) { 3546 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3547 ++OpOffset; 3548 } 3549 3550 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3551 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3552 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3553 3554 MachineMemOperand *MMO = *MI.memoperands_begin(); 3555 3556 unsigned ImmOffset; 3557 unsigned TotalOffset; 3558 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3559 if (TotalOffset != 0) 3560 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3561 3562 if (!VIndex) 3563 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3564 3565 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3566 .addDef(Dst) 3567 .addUse(VData); // vdata 3568 3569 if (IsCmpSwap) 3570 MIB.addReg(CmpVal); 3571 3572 MIB.addUse(RSrc) // rsrc 3573 .addUse(VIndex) // vindex 3574 .addUse(VOffset) // voffset 3575 .addUse(SOffset) // soffset 3576 .addImm(ImmOffset) // offset(imm) 3577 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3578 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3579 .addMemOperand(MMO); 3580 3581 MI.eraseFromParent(); 3582 return true; 3583 } 3584 3585 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3586 /// vector with s16 typed elements. 3587 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3588 SmallVectorImpl<Register> &PackedAddrs, 3589 int AddrIdx, int DimIdx, int EndIdx, 3590 int NumGradients) { 3591 const LLT S16 = LLT::scalar(16); 3592 const LLT V2S16 = LLT::vector(2, 16); 3593 3594 for (int I = AddrIdx; I < EndIdx; ++I) { 3595 MachineOperand &SrcOp = MI.getOperand(I); 3596 if (!SrcOp.isReg()) 3597 continue; // _L to _LZ may have eliminated this. 3598 3599 Register AddrReg = SrcOp.getReg(); 3600 3601 if (I < DimIdx) { 3602 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3603 PackedAddrs.push_back(AddrReg); 3604 } else { 3605 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3606 // derivatives dx/dh and dx/dv are packed with undef. 3607 if (((I + 1) >= EndIdx) || 3608 ((NumGradients / 2) % 2 == 1 && 3609 (I == DimIdx + (NumGradients / 2) - 1 || 3610 I == DimIdx + NumGradients - 1)) || 3611 // Check for _L to _LZ optimization 3612 !MI.getOperand(I + 1).isReg()) { 3613 PackedAddrs.push_back( 3614 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3615 .getReg(0)); 3616 } else { 3617 PackedAddrs.push_back( 3618 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3619 .getReg(0)); 3620 ++I; 3621 } 3622 } 3623 } 3624 } 3625 3626 /// Convert from separate vaddr components to a single vector address register, 3627 /// and replace the remaining operands with $noreg. 3628 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3629 int DimIdx, int NumVAddrs) { 3630 const LLT S32 = LLT::scalar(32); 3631 3632 SmallVector<Register, 8> AddrRegs; 3633 for (int I = 0; I != NumVAddrs; ++I) { 3634 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3635 if (SrcOp.isReg()) { 3636 AddrRegs.push_back(SrcOp.getReg()); 3637 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3638 } 3639 } 3640 3641 int NumAddrRegs = AddrRegs.size(); 3642 if (NumAddrRegs != 1) { 3643 // Round up to 8 elements for v5-v7 3644 // FIXME: Missing intermediate sized register classes and instructions. 3645 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3646 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3647 auto Undef = B.buildUndef(S32); 3648 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3649 NumAddrRegs = RoundedNumRegs; 3650 } 3651 3652 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3653 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3654 } 3655 3656 for (int I = 1; I != NumVAddrs; ++I) { 3657 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3658 if (SrcOp.isReg()) 3659 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3660 } 3661 } 3662 3663 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3664 /// 3665 /// Depending on the subtarget, load/store with 16-bit element data need to be 3666 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3667 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3668 /// registers. 3669 /// 3670 /// We don't want to directly select image instructions just yet, but also want 3671 /// to exposes all register repacking to the legalizer/combiners. We also don't 3672 /// want a selected instrution entering RegBankSelect. In order to avoid 3673 /// defining a multitude of intermediate image instructions, directly hack on 3674 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3675 /// now unnecessary arguments with $noreg. 3676 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3677 MachineInstr &MI, MachineIRBuilder &B, 3678 GISelChangeObserver &Observer, 3679 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3680 3681 const int NumDefs = MI.getNumExplicitDefs(); 3682 bool IsTFE = NumDefs == 2; 3683 // We are only processing the operands of d16 image operations on subtargets 3684 // that use the unpacked register layout, or need to repack the TFE result. 3685 3686 // TODO: Do we need to guard against already legalized intrinsics? 3687 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3688 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3689 3690 MachineRegisterInfo *MRI = B.getMRI(); 3691 const LLT S32 = LLT::scalar(32); 3692 const LLT S16 = LLT::scalar(16); 3693 const LLT V2S16 = LLT::vector(2, 16); 3694 3695 // Index of first address argument 3696 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3697 3698 int NumVAddrs, NumGradients; 3699 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3700 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3701 getDMaskIdx(BaseOpcode, NumDefs); 3702 unsigned DMask = 0; 3703 3704 // Check for 16 bit addresses and pack if true. 3705 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3706 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3707 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3708 const bool IsG16 = GradTy == S16; 3709 const bool IsA16 = AddrTy == S16; 3710 3711 int DMaskLanes = 0; 3712 if (!BaseOpcode->Atomic) { 3713 DMask = MI.getOperand(DMaskIdx).getImm(); 3714 if (BaseOpcode->Gather4) { 3715 DMaskLanes = 4; 3716 } else if (DMask != 0) { 3717 DMaskLanes = countPopulation(DMask); 3718 } else if (!IsTFE && !BaseOpcode->Store) { 3719 // If dmask is 0, this is a no-op load. This can be eliminated. 3720 B.buildUndef(MI.getOperand(0)); 3721 MI.eraseFromParent(); 3722 return true; 3723 } 3724 } 3725 3726 Observer.changingInstr(MI); 3727 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3728 3729 unsigned NewOpcode = NumDefs == 0 ? 3730 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3731 3732 // Track that we legalized this 3733 MI.setDesc(B.getTII().get(NewOpcode)); 3734 3735 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3736 // dmask to be at least 1 otherwise the instruction will fail 3737 if (IsTFE && DMask == 0) { 3738 DMask = 0x1; 3739 DMaskLanes = 1; 3740 MI.getOperand(DMaskIdx).setImm(DMask); 3741 } 3742 3743 if (BaseOpcode->Atomic) { 3744 Register VData0 = MI.getOperand(2).getReg(); 3745 LLT Ty = MRI->getType(VData0); 3746 3747 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3748 if (Ty.isVector()) 3749 return false; 3750 3751 if (BaseOpcode->AtomicX2) { 3752 Register VData1 = MI.getOperand(3).getReg(); 3753 // The two values are packed in one register. 3754 LLT PackedTy = LLT::vector(2, Ty); 3755 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3756 MI.getOperand(2).setReg(Concat.getReg(0)); 3757 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3758 } 3759 } 3760 3761 int CorrectedNumVAddrs = NumVAddrs; 3762 3763 // Optimize _L to _LZ when _L is zero 3764 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3765 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3766 const ConstantFP *ConstantLod; 3767 const int LodIdx = AddrIdx + NumVAddrs - 1; 3768 3769 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3770 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3771 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3772 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3773 LZMappingInfo->LZ, ImageDimIntr->Dim); 3774 3775 // The starting indexes should remain in the same place. 3776 --NumVAddrs; 3777 --CorrectedNumVAddrs; 3778 3779 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3780 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3781 MI.RemoveOperand(LodIdx); 3782 } 3783 } 3784 } 3785 3786 // Optimize _mip away, when 'lod' is zero 3787 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3788 int64_t ConstantLod; 3789 const int LodIdx = AddrIdx + NumVAddrs - 1; 3790 3791 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3792 if (ConstantLod == 0) { 3793 // TODO: Change intrinsic opcode and remove operand instead or replacing 3794 // it with 0, as the _L to _LZ handling is done above. 3795 MI.getOperand(LodIdx).ChangeToImmediate(0); 3796 --CorrectedNumVAddrs; 3797 } 3798 } 3799 } 3800 3801 // Rewrite the addressing register layout before doing anything else. 3802 if (IsA16 || IsG16) { 3803 if (IsA16) { 3804 // Target must support the feature and gradients need to be 16 bit too 3805 if (!ST.hasA16() || !IsG16) 3806 return false; 3807 } else if (!ST.hasG16()) 3808 return false; 3809 3810 if (NumVAddrs > 1) { 3811 SmallVector<Register, 4> PackedRegs; 3812 // Don't compress addresses for G16 3813 const int PackEndIdx = 3814 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3815 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3816 PackEndIdx, NumGradients); 3817 3818 if (!IsA16) { 3819 // Add uncompressed address 3820 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3821 int AddrReg = MI.getOperand(I).getReg(); 3822 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3823 PackedRegs.push_back(AddrReg); 3824 } 3825 } 3826 3827 // See also below in the non-a16 branch 3828 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3829 3830 if (!UseNSA && PackedRegs.size() > 1) { 3831 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3832 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3833 PackedRegs[0] = Concat.getReg(0); 3834 PackedRegs.resize(1); 3835 } 3836 3837 const int NumPacked = PackedRegs.size(); 3838 for (int I = 0; I != NumVAddrs; ++I) { 3839 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3840 if (!SrcOp.isReg()) { 3841 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3842 continue; 3843 } 3844 3845 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3846 3847 if (I < NumPacked) 3848 SrcOp.setReg(PackedRegs[I]); 3849 else 3850 SrcOp.setReg(AMDGPU::NoRegister); 3851 } 3852 } 3853 } else { 3854 // If the register allocator cannot place the address registers contiguously 3855 // without introducing moves, then using the non-sequential address encoding 3856 // is always preferable, since it saves VALU instructions and is usually a 3857 // wash in terms of code size or even better. 3858 // 3859 // However, we currently have no way of hinting to the register allocator 3860 // that MIMG addresses should be placed contiguously when it is possible to 3861 // do so, so force non-NSA for the common 2-address case as a heuristic. 3862 // 3863 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3864 // allocation when possible. 3865 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3866 3867 if (!UseNSA && NumVAddrs > 1) 3868 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3869 } 3870 3871 int Flags = 0; 3872 if (IsA16) 3873 Flags |= 1; 3874 if (IsG16) 3875 Flags |= 2; 3876 MI.addOperand(MachineOperand::CreateImm(Flags)); 3877 3878 if (BaseOpcode->Store) { // No TFE for stores? 3879 // TODO: Handle dmask trim 3880 Register VData = MI.getOperand(1).getReg(); 3881 LLT Ty = MRI->getType(VData); 3882 if (!Ty.isVector() || Ty.getElementType() != S16) 3883 return true; 3884 3885 Register RepackedReg = handleD16VData(B, *MRI, VData); 3886 if (RepackedReg != VData) { 3887 MI.getOperand(1).setReg(RepackedReg); 3888 } 3889 3890 return true; 3891 } 3892 3893 Register DstReg = MI.getOperand(0).getReg(); 3894 LLT Ty = MRI->getType(DstReg); 3895 const LLT EltTy = Ty.getScalarType(); 3896 const bool IsD16 = Ty.getScalarType() == S16; 3897 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3898 3899 // Confirm that the return type is large enough for the dmask specified 3900 if (NumElts < DMaskLanes) 3901 return false; 3902 3903 if (NumElts > 4 || DMaskLanes > 4) 3904 return false; 3905 3906 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3907 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3908 3909 // The raw dword aligned data component of the load. The only legal cases 3910 // where this matters should be when using the packed D16 format, for 3911 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3912 LLT RoundedTy; 3913 3914 // S32 vector to to cover all data, plus TFE result element. 3915 LLT TFETy; 3916 3917 // Register type to use for each loaded component. Will be S32 or V2S16. 3918 LLT RegTy; 3919 3920 if (IsD16 && ST.hasUnpackedD16VMem()) { 3921 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3922 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3923 RegTy = S32; 3924 } else { 3925 unsigned EltSize = EltTy.getSizeInBits(); 3926 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3927 unsigned RoundedSize = 32 * RoundedElts; 3928 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3929 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3930 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3931 } 3932 3933 // The return type does not need adjustment. 3934 // TODO: Should we change s16 case to s32 or <2 x s16>? 3935 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3936 return true; 3937 3938 Register Dst1Reg; 3939 3940 // Insert after the instruction. 3941 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3942 3943 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3944 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3945 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3946 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3947 3948 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3949 3950 MI.getOperand(0).setReg(NewResultReg); 3951 3952 // In the IR, TFE is supposed to be used with a 2 element struct return 3953 // type. The intruction really returns these two values in one contiguous 3954 // register, with one additional dword beyond the loaded data. Rewrite the 3955 // return type to use a single register result. 3956 3957 if (IsTFE) { 3958 Dst1Reg = MI.getOperand(1).getReg(); 3959 if (MRI->getType(Dst1Reg) != S32) 3960 return false; 3961 3962 // TODO: Make sure the TFE operand bit is set. 3963 MI.RemoveOperand(1); 3964 3965 // Handle the easy case that requires no repack instructions. 3966 if (Ty == S32) { 3967 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3968 return true; 3969 } 3970 } 3971 3972 // Now figure out how to copy the new result register back into the old 3973 // result. 3974 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3975 3976 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3977 3978 if (ResultNumRegs == 1) { 3979 assert(!IsTFE); 3980 ResultRegs[0] = NewResultReg; 3981 } else { 3982 // We have to repack into a new vector of some kind. 3983 for (int I = 0; I != NumDataRegs; ++I) 3984 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3985 B.buildUnmerge(ResultRegs, NewResultReg); 3986 3987 // Drop the final TFE element to get the data part. The TFE result is 3988 // directly written to the right place already. 3989 if (IsTFE) 3990 ResultRegs.resize(NumDataRegs); 3991 } 3992 3993 // For an s16 scalar result, we form an s32 result with a truncate regardless 3994 // of packed vs. unpacked. 3995 if (IsD16 && !Ty.isVector()) { 3996 B.buildTrunc(DstReg, ResultRegs[0]); 3997 return true; 3998 } 3999 4000 // Avoid a build/concat_vector of 1 entry. 4001 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4002 B.buildBitcast(DstReg, ResultRegs[0]); 4003 return true; 4004 } 4005 4006 assert(Ty.isVector()); 4007 4008 if (IsD16) { 4009 // For packed D16 results with TFE enabled, all the data components are 4010 // S32. Cast back to the expected type. 4011 // 4012 // TODO: We don't really need to use load s32 elements. We would only need one 4013 // cast for the TFE result if a multiple of v2s16 was used. 4014 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4015 for (Register &Reg : ResultRegs) 4016 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4017 } else if (ST.hasUnpackedD16VMem()) { 4018 for (Register &Reg : ResultRegs) 4019 Reg = B.buildTrunc(S16, Reg).getReg(0); 4020 } 4021 } 4022 4023 auto padWithUndef = [&](LLT Ty, int NumElts) { 4024 if (NumElts == 0) 4025 return; 4026 Register Undef = B.buildUndef(Ty).getReg(0); 4027 for (int I = 0; I != NumElts; ++I) 4028 ResultRegs.push_back(Undef); 4029 }; 4030 4031 // Pad out any elements eliminated due to the dmask. 4032 LLT ResTy = MRI->getType(ResultRegs[0]); 4033 if (!ResTy.isVector()) { 4034 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4035 B.buildBuildVector(DstReg, ResultRegs); 4036 return true; 4037 } 4038 4039 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4040 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4041 4042 // Deal with the one annoying legal case. 4043 const LLT V3S16 = LLT::vector(3, 16); 4044 if (Ty == V3S16) { 4045 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4046 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4047 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4048 return true; 4049 } 4050 4051 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4052 B.buildConcatVectors(DstReg, ResultRegs); 4053 return true; 4054 } 4055 4056 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4057 MachineInstr &MI, MachineIRBuilder &B, 4058 GISelChangeObserver &Observer) const { 4059 Register Dst = MI.getOperand(0).getReg(); 4060 LLT Ty = B.getMRI()->getType(Dst); 4061 unsigned Size = Ty.getSizeInBits(); 4062 MachineFunction &MF = B.getMF(); 4063 4064 Observer.changingInstr(MI); 4065 4066 // FIXME: We don't really need this intermediate instruction. The intrinsic 4067 // should be fixed to have a memory operand. Since it's readnone, we're not 4068 // allowed to add one. 4069 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4070 MI.RemoveOperand(1); // Remove intrinsic ID 4071 4072 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4073 // TODO: Should this use datalayout alignment? 4074 const unsigned MemSize = (Size + 7) / 8; 4075 const Align MemAlign(4); 4076 MachineMemOperand *MMO = MF.getMachineMemOperand( 4077 MachinePointerInfo(), 4078 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4079 MachineMemOperand::MOInvariant, 4080 MemSize, MemAlign); 4081 MI.addMemOperand(MF, MMO); 4082 4083 // There are no 96-bit result scalar loads, but widening to 128-bit should 4084 // always be legal. We may need to restore this to a 96-bit result if it turns 4085 // out this needs to be converted to a vector load during RegBankSelect. 4086 if (!isPowerOf2_32(Size)) { 4087 LegalizerHelper Helper(MF, *this, Observer, B); 4088 4089 if (Ty.isVector()) 4090 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4091 else 4092 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4093 } 4094 4095 Observer.changedInstr(MI); 4096 return true; 4097 } 4098 4099 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4100 MachineRegisterInfo &MRI, 4101 MachineIRBuilder &B) const { 4102 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4103 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4104 !ST.isTrapHandlerEnabled()) { 4105 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4106 } else { 4107 // Pass queue pointer to trap handler as input, and insert trap instruction 4108 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4109 const ArgDescriptor *Arg = 4110 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4111 if (!Arg) 4112 return false; 4113 MachineRegisterInfo &MRI = *B.getMRI(); 4114 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4115 Register LiveIn = getLiveInRegister( 4116 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4117 /*InsertLiveInCopy=*/false); 4118 if (!loadInputValue(LiveIn, B, Arg)) 4119 return false; 4120 B.buildCopy(SGPR01, LiveIn); 4121 B.buildInstr(AMDGPU::S_TRAP) 4122 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4123 .addReg(SGPR01, RegState::Implicit); 4124 } 4125 4126 MI.eraseFromParent(); 4127 return true; 4128 } 4129 4130 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4131 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4132 // Is non-HSA path or trap-handler disabled? then, report a warning 4133 // accordingly 4134 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4135 !ST.isTrapHandlerEnabled()) { 4136 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4137 "debugtrap handler not supported", 4138 MI.getDebugLoc(), DS_Warning); 4139 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4140 Ctx.diagnose(NoTrap); 4141 } else { 4142 // Insert debug-trap instruction 4143 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4144 } 4145 4146 MI.eraseFromParent(); 4147 return true; 4148 } 4149 4150 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4151 MachineInstr &MI) const { 4152 MachineIRBuilder &B = Helper.MIRBuilder; 4153 MachineRegisterInfo &MRI = *B.getMRI(); 4154 4155 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4156 auto IntrID = MI.getIntrinsicID(); 4157 switch (IntrID) { 4158 case Intrinsic::amdgcn_if: 4159 case Intrinsic::amdgcn_else: { 4160 MachineInstr *Br = nullptr; 4161 MachineBasicBlock *UncondBrTarget = nullptr; 4162 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4163 const SIRegisterInfo *TRI 4164 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4165 4166 Register Def = MI.getOperand(1).getReg(); 4167 Register Use = MI.getOperand(3).getReg(); 4168 4169 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4170 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4171 if (IntrID == Intrinsic::amdgcn_if) { 4172 B.buildInstr(AMDGPU::SI_IF) 4173 .addDef(Def) 4174 .addUse(Use) 4175 .addMBB(UncondBrTarget); 4176 } else { 4177 B.buildInstr(AMDGPU::SI_ELSE) 4178 .addDef(Def) 4179 .addUse(Use) 4180 .addMBB(UncondBrTarget) 4181 .addImm(0); 4182 } 4183 4184 if (Br) { 4185 Br->getOperand(0).setMBB(CondBrTarget); 4186 } else { 4187 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4188 // since we're swapping branch targets it needs to be reinserted. 4189 // FIXME: IRTranslator should probably not do this 4190 B.buildBr(*CondBrTarget); 4191 } 4192 4193 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4194 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4195 MI.eraseFromParent(); 4196 BrCond->eraseFromParent(); 4197 return true; 4198 } 4199 4200 return false; 4201 } 4202 case Intrinsic::amdgcn_loop: { 4203 MachineInstr *Br = nullptr; 4204 MachineBasicBlock *UncondBrTarget = nullptr; 4205 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4206 const SIRegisterInfo *TRI 4207 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4208 4209 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4210 Register Reg = MI.getOperand(2).getReg(); 4211 4212 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4213 B.buildInstr(AMDGPU::SI_LOOP) 4214 .addUse(Reg) 4215 .addMBB(UncondBrTarget); 4216 4217 if (Br) 4218 Br->getOperand(0).setMBB(CondBrTarget); 4219 else 4220 B.buildBr(*CondBrTarget); 4221 4222 MI.eraseFromParent(); 4223 BrCond->eraseFromParent(); 4224 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4225 return true; 4226 } 4227 4228 return false; 4229 } 4230 case Intrinsic::amdgcn_kernarg_segment_ptr: 4231 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4232 // This only makes sense to call in a kernel, so just lower to null. 4233 B.buildConstant(MI.getOperand(0).getReg(), 0); 4234 MI.eraseFromParent(); 4235 return true; 4236 } 4237 4238 return legalizePreloadedArgIntrin( 4239 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4240 case Intrinsic::amdgcn_implicitarg_ptr: 4241 return legalizeImplicitArgPtr(MI, MRI, B); 4242 case Intrinsic::amdgcn_workitem_id_x: 4243 return legalizePreloadedArgIntrin(MI, MRI, B, 4244 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4245 case Intrinsic::amdgcn_workitem_id_y: 4246 return legalizePreloadedArgIntrin(MI, MRI, B, 4247 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4248 case Intrinsic::amdgcn_workitem_id_z: 4249 return legalizePreloadedArgIntrin(MI, MRI, B, 4250 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4251 case Intrinsic::amdgcn_workgroup_id_x: 4252 return legalizePreloadedArgIntrin(MI, MRI, B, 4253 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4254 case Intrinsic::amdgcn_workgroup_id_y: 4255 return legalizePreloadedArgIntrin(MI, MRI, B, 4256 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4257 case Intrinsic::amdgcn_workgroup_id_z: 4258 return legalizePreloadedArgIntrin(MI, MRI, B, 4259 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4260 case Intrinsic::amdgcn_dispatch_ptr: 4261 return legalizePreloadedArgIntrin(MI, MRI, B, 4262 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4263 case Intrinsic::amdgcn_queue_ptr: 4264 return legalizePreloadedArgIntrin(MI, MRI, B, 4265 AMDGPUFunctionArgInfo::QUEUE_PTR); 4266 case Intrinsic::amdgcn_implicit_buffer_ptr: 4267 return legalizePreloadedArgIntrin( 4268 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4269 case Intrinsic::amdgcn_dispatch_id: 4270 return legalizePreloadedArgIntrin(MI, MRI, B, 4271 AMDGPUFunctionArgInfo::DISPATCH_ID); 4272 case Intrinsic::amdgcn_fdiv_fast: 4273 return legalizeFDIVFastIntrin(MI, MRI, B); 4274 case Intrinsic::amdgcn_is_shared: 4275 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4276 case Intrinsic::amdgcn_is_private: 4277 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4278 case Intrinsic::amdgcn_wavefrontsize: { 4279 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4280 MI.eraseFromParent(); 4281 return true; 4282 } 4283 case Intrinsic::amdgcn_s_buffer_load: 4284 return legalizeSBufferLoad(MI, B, Helper.Observer); 4285 case Intrinsic::amdgcn_raw_buffer_store: 4286 case Intrinsic::amdgcn_struct_buffer_store: 4287 return legalizeBufferStore(MI, MRI, B, false, false); 4288 case Intrinsic::amdgcn_raw_buffer_store_format: 4289 case Intrinsic::amdgcn_struct_buffer_store_format: 4290 return legalizeBufferStore(MI, MRI, B, false, true); 4291 case Intrinsic::amdgcn_raw_tbuffer_store: 4292 case Intrinsic::amdgcn_struct_tbuffer_store: 4293 return legalizeBufferStore(MI, MRI, B, true, true); 4294 case Intrinsic::amdgcn_raw_buffer_load: 4295 case Intrinsic::amdgcn_struct_buffer_load: 4296 return legalizeBufferLoad(MI, MRI, B, false, false); 4297 case Intrinsic::amdgcn_raw_buffer_load_format: 4298 case Intrinsic::amdgcn_struct_buffer_load_format: 4299 return legalizeBufferLoad(MI, MRI, B, true, false); 4300 case Intrinsic::amdgcn_raw_tbuffer_load: 4301 case Intrinsic::amdgcn_struct_tbuffer_load: 4302 return legalizeBufferLoad(MI, MRI, B, true, true); 4303 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4304 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4305 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4306 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4307 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4308 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4309 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4310 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4311 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4312 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4313 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4314 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4315 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4316 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4317 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4318 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4319 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4320 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4321 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4322 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4323 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4324 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4325 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4326 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4327 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4328 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4329 return legalizeBufferAtomic(MI, B, IntrID); 4330 case Intrinsic::amdgcn_atomic_inc: 4331 return legalizeAtomicIncDec(MI, B, true); 4332 case Intrinsic::amdgcn_atomic_dec: 4333 return legalizeAtomicIncDec(MI, B, false); 4334 case Intrinsic::trap: 4335 return legalizeTrapIntrinsic(MI, MRI, B); 4336 case Intrinsic::debugtrap: 4337 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4338 default: { 4339 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4340 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4341 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4342 return true; 4343 } 4344 } 4345 4346 return true; 4347 } 4348