//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .legalIf(isPointer(0)) 419 .clampScalar(0, S32, S256) 420 .widenScalarToNextPow2(0, 32) 421 .clampMaxNumElements(0, S32, 16) 422 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 423 .scalarize(0); 424 425 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 426 // Full set of gfx9 features. 427 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 428 .legalFor({S32, S16, V2S16}) 429 .clampScalar(0, S16, S32) 430 .clampMaxNumElements(0, S16, 2) 431 .scalarize(0) 432 .widenScalarToNextPow2(0, 32); 433 434 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 435 .legalFor({S32, S16, V2S16}) // Clamp modifier 436 .minScalar(0, S16) 437 .clampMaxNumElements(0, S16, 2) 438 .scalarize(0) 439 .widenScalarToNextPow2(0, 32) 440 .lower(); 441 } else if (ST.has16BitInsts()) { 442 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 443 .legalFor({S32, S16}) 444 .clampScalar(0, S16, S32) 445 .scalarize(0) 446 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 447 448 // Technically the saturating operations require clamp bit support, but this 449 // was introduced at the same time as 16-bit operations. 450 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 451 .legalFor({S32, S16}) // Clamp modifier 452 .minScalar(0, S16) 453 .scalarize(0) 454 .widenScalarToNextPow2(0, 16) 455 .lower(); 456 457 // We're just lowering this, but it helps get a better result to try to 458 // coerce to the desired type first. 459 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 460 .minScalar(0, S16) 461 .scalarize(0) 462 .lower(); 463 } else { 464 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 465 .legalFor({S32}) 466 .clampScalar(0, S32, S32) 467 .scalarize(0); 468 469 if (ST.hasIntClamp()) { 470 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 471 .legalFor({S32}) // Clamp modifier. 472 .scalarize(0) 473 .minScalarOrElt(0, S32) 474 .lower(); 475 } else { 476 // Clamp bit support was added in VI, along with 16-bit operations. 477 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 478 .minScalar(0, S32) 479 .scalarize(0) 480 .lower(); 481 } 482 483 // FIXME: DAG expansion gets better results. The widening uses the smaller 484 // range values and goes for the min/max lowering directly. 
485 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 486 .minScalar(0, S32) 487 .scalarize(0) 488 .lower(); 489 } 490 491 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 492 .customFor({S32, S64}) 493 .clampScalar(0, S32, S64) 494 .widenScalarToNextPow2(0, 32) 495 .scalarize(0); 496 497 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 498 .legalFor({S32}) 499 .clampScalar(0, S32, S32) 500 .scalarize(0); 501 502 // Report legal for any types we can handle anywhere. For the cases only legal 503 // on the SALU, RegBankSelect will be able to re-legalize. 504 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 505 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 506 .clampScalar(0, S32, S64) 507 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 508 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 509 .widenScalarToNextPow2(0) 510 .scalarize(0); 511 512 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 513 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 514 .legalFor({{S32, S1}, {S32, S32}}) 515 .minScalar(0, S32) 516 // TODO: .scalarize(0) 517 .lower(); 518 519 getActionDefinitionsBuilder(G_BITCAST) 520 // Don't worry about the size constraint. 521 .legalIf(all(isRegisterType(0), isRegisterType(1))) 522 .lower(); 523 524 525 getActionDefinitionsBuilder(G_CONSTANT) 526 .legalFor({S1, S32, S64, S16, GlobalPtr, 527 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 528 .legalIf(isPointer(0)) 529 .clampScalar(0, S32, S64) 530 .widenScalarToNextPow2(0); 531 532 getActionDefinitionsBuilder(G_FCONSTANT) 533 .legalFor({S32, S64, S16}) 534 .clampScalar(0, S16, S64); 535 536 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 537 .legalIf(isRegisterType(0)) 538 // s1 and s16 are special cases because they have legal operations on 539 // them, but don't really occupy registers in the normal way. 540 .legalFor({S1, S16}) 541 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 542 .clampScalarOrElt(0, S32, MaxScalar) 543 .widenScalarToNextPow2(0, 32) 544 .clampMaxNumElements(0, S32, 16); 545 546 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 547 548 // If the amount is divergent, we have to do a wave reduction to get the 549 // maximum value, so this is expanded during RegBankSelect. 
550 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 551 .legalFor({{PrivatePtr, S32}}); 552 553 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 554 .unsupportedFor({PrivatePtr}) 555 .custom(); 556 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 557 558 auto &FPOpActions = getActionDefinitionsBuilder( 559 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 560 .legalFor({S32, S64}); 561 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 562 .customFor({S32, S64}); 563 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 564 .customFor({S32, S64}); 565 566 if (ST.has16BitInsts()) { 567 if (ST.hasVOP3PInsts()) 568 FPOpActions.legalFor({S16, V2S16}); 569 else 570 FPOpActions.legalFor({S16}); 571 572 TrigActions.customFor({S16}); 573 FDIVActions.customFor({S16}); 574 } 575 576 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 577 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 578 579 if (ST.hasVOP3PInsts()) { 580 MinNumMaxNum.customFor(FPTypesPK16) 581 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 582 .clampMaxNumElements(0, S16, 2) 583 .clampScalar(0, S16, S64) 584 .scalarize(0); 585 } else if (ST.has16BitInsts()) { 586 MinNumMaxNum.customFor(FPTypes16) 587 .clampScalar(0, S16, S64) 588 .scalarize(0); 589 } else { 590 MinNumMaxNum.customFor(FPTypesBase) 591 .clampScalar(0, S32, S64) 592 .scalarize(0); 593 } 594 595 if (ST.hasVOP3PInsts()) 596 FPOpActions.clampMaxNumElements(0, S16, 2); 597 598 FPOpActions 599 .scalarize(0) 600 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 601 602 TrigActions 603 .scalarize(0) 604 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 605 606 FDIVActions 607 .scalarize(0) 608 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 609 610 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 611 .legalFor(FPTypesPK16) 612 .clampMaxNumElements(0, S16, 2) 613 .scalarize(0) 614 .clampScalar(0, S16, S64); 615 616 if (ST.has16BitInsts()) { 617 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 618 .legalFor({S32, S64, S16}) 619 .scalarize(0) 620 .clampScalar(0, S16, S64); 621 } else { 622 getActionDefinitionsBuilder(G_FSQRT) 623 .legalFor({S32, S64}) 624 .scalarize(0) 625 .clampScalar(0, S32, S64); 626 627 if (ST.hasFractBug()) { 628 getActionDefinitionsBuilder(G_FFLOOR) 629 .customFor({S64}) 630 .legalFor({S32, S64}) 631 .scalarize(0) 632 .clampScalar(0, S32, S64); 633 } else { 634 getActionDefinitionsBuilder(G_FFLOOR) 635 .legalFor({S32, S64}) 636 .scalarize(0) 637 .clampScalar(0, S32, S64); 638 } 639 } 640 641 getActionDefinitionsBuilder(G_FPTRUNC) 642 .legalFor({{S32, S64}, {S16, S32}}) 643 .scalarize(0) 644 .lower(); 645 646 getActionDefinitionsBuilder(G_FPEXT) 647 .legalFor({{S64, S32}, {S32, S16}}) 648 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 649 .scalarize(0); 650 651 getActionDefinitionsBuilder(G_FSUB) 652 // Use actual fsub instruction 653 .legalFor({S32}) 654 // Must use fadd + fneg 655 .lowerFor({S64, S16, V2S16}) 656 .scalarize(0) 657 .clampScalar(0, S32, S64); 658 659 // Whether this is legal depends on the floating point mode for the function. 660 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 661 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 662 FMad.customFor({S32, S16}); 663 else if (ST.hasMadMacF32Insts()) 664 FMad.customFor({S32}); 665 else if (ST.hasMadF16()) 666 FMad.customFor({S16}); 667 FMad.scalarize(0) 668 .lower(); 669 670 // TODO: Do we need to clamp maximum bitwidth? 
671 getActionDefinitionsBuilder(G_TRUNC) 672 .legalIf(isScalar(0)) 673 .legalFor({{V2S16, V2S32}}) 674 .clampMaxNumElements(0, S16, 2) 675 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 676 // situations (like an invalid implicit use), we don't want to infinite loop 677 // in the legalizer. 678 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 679 .alwaysLegal(); 680 681 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 682 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 683 {S32, S1}, {S64, S1}, {S16, S1}}) 684 .scalarize(0) 685 .clampScalar(0, S32, S64) 686 .widenScalarToNextPow2(1, 32); 687 688 // TODO: Split s1->s64 during regbankselect for VALU. 689 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 690 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 691 .lowerFor({{S32, S64}}) 692 .lowerIf(typeIs(1, S1)) 693 .customFor({{S64, S64}}); 694 if (ST.has16BitInsts()) 695 IToFP.legalFor({{S16, S16}}); 696 IToFP.clampScalar(1, S32, S64) 697 .minScalar(0, S32) 698 .scalarize(0) 699 .widenScalarToNextPow2(1); 700 701 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 702 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 703 .customFor({{S64, S64}}) 704 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 705 if (ST.has16BitInsts()) 706 FPToI.legalFor({{S16, S16}}); 707 else 708 FPToI.minScalar(1, S32); 709 710 FPToI.minScalar(0, S32) 711 .scalarize(0) 712 .lower(); 713 714 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 715 .scalarize(0) 716 .lower(); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 720 .legalFor({S16, S32, S64}) 721 .clampScalar(0, S16, S64) 722 .scalarize(0); 723 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 724 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 725 .legalFor({S32, S64}) 726 .clampScalar(0, S32, S64) 727 .scalarize(0); 728 } else { 729 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 730 .legalFor({S32}) 731 .customFor({S64}) 732 .clampScalar(0, S32, S64) 733 .scalarize(0); 734 } 735 736 getActionDefinitionsBuilder(G_PTR_ADD) 737 .legalIf(all(isPointer(0), sameSize(0, 1))) 738 .scalarize(0) 739 .scalarSameSizeAs(1, 0); 740 741 getActionDefinitionsBuilder(G_PTRMASK) 742 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 743 .scalarSameSizeAs(1, 0) 744 .scalarize(0); 745 746 auto &CmpBuilder = 747 getActionDefinitionsBuilder(G_ICMP) 748 // The compare output type differs based on the register bank of the output, 749 // so make both s1 and s32 legal. 750 // 751 // Scalar compares producing output in scc will be promoted to s32, as that 752 // is the allocatable register type that will be needed for the copy from 753 // scc. This will be promoted during RegBankSelect, and we assume something 754 // before that won't try to use s32 result types. 755 // 756 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 757 // bank. 
758 .legalForCartesianProduct( 759 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 760 .legalForCartesianProduct( 761 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 762 if (ST.has16BitInsts()) { 763 CmpBuilder.legalFor({{S1, S16}}); 764 } 765 766 CmpBuilder 767 .widenScalarToNextPow2(1) 768 .clampScalar(1, S32, S64) 769 .scalarize(0) 770 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 771 772 getActionDefinitionsBuilder(G_FCMP) 773 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 774 .widenScalarToNextPow2(1) 775 .clampScalar(1, S32, S64) 776 .scalarize(0); 777 778 // FIXME: fpow has a selection pattern that should move to custom lowering. 779 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 780 if (ST.has16BitInsts()) 781 Exp2Ops.legalFor({S32, S16}); 782 else 783 Exp2Ops.legalFor({S32}); 784 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 785 Exp2Ops.scalarize(0); 786 787 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 788 if (ST.has16BitInsts()) 789 ExpOps.customFor({{S32}, {S16}}); 790 else 791 ExpOps.customFor({S32}); 792 ExpOps.clampScalar(0, MinScalarFPTy, S32) 793 .scalarize(0); 794 795 getActionDefinitionsBuilder(G_FPOWI) 796 .clampScalar(0, MinScalarFPTy, S32) 797 .lower(); 798 799 // The 64-bit versions produce 32-bit results, but only on the SALU. 800 getActionDefinitionsBuilder(G_CTPOP) 801 .legalFor({{S32, S32}, {S32, S64}}) 802 .clampScalar(0, S32, S32) 803 .clampScalar(1, S32, S64) 804 .scalarize(0) 805 .widenScalarToNextPow2(0, 32) 806 .widenScalarToNextPow2(1, 32); 807 808 // The hardware instructions return a different result on 0 than the generic 809 // instructions expect. The hardware produces -1, but these produce the 810 // bitwidth. 811 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 812 .scalarize(0) 813 .clampScalar(0, S32, S32) 814 .clampScalar(1, S32, S64) 815 .widenScalarToNextPow2(0, 32) 816 .widenScalarToNextPow2(1, 32) 817 .lower(); 818 819 // The 64-bit versions produce 32-bit results, but only on the SALU. 820 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 821 .legalFor({{S32, S32}, {S32, S64}}) 822 .clampScalar(0, S32, S32) 823 .clampScalar(1, S32, S64) 824 .scalarize(0) 825 .widenScalarToNextPow2(0, 32) 826 .widenScalarToNextPow2(1, 32); 827 828 getActionDefinitionsBuilder(G_BITREVERSE) 829 .legalFor({S32}) 830 .clampScalar(0, S32, S32) 831 .scalarize(0); 832 833 if (ST.has16BitInsts()) { 834 getActionDefinitionsBuilder(G_BSWAP) 835 .legalFor({S16, S32, V2S16}) 836 .clampMaxNumElements(0, S16, 2) 837 // FIXME: Fixing non-power-of-2 before clamp is workaround for 838 // narrowScalar limitation. 
839 .widenScalarToNextPow2(0) 840 .clampScalar(0, S16, S32) 841 .scalarize(0); 842 843 if (ST.hasVOP3PInsts()) { 844 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 845 .legalFor({S32, S16, V2S16}) 846 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 847 .clampMaxNumElements(0, S16, 2) 848 .minScalar(0, S16) 849 .widenScalarToNextPow2(0) 850 .scalarize(0) 851 .lower(); 852 } else { 853 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 854 .legalFor({S32, S16}) 855 .widenScalarToNextPow2(0) 856 .minScalar(0, S16) 857 .scalarize(0) 858 .lower(); 859 } 860 } else { 861 // TODO: Should have same legality without v_perm_b32 862 getActionDefinitionsBuilder(G_BSWAP) 863 .legalFor({S32}) 864 .lowerIf(scalarNarrowerThan(0, 32)) 865 // FIXME: Fixing non-power-of-2 before clamp is workaround for 866 // narrowScalar limitation. 867 .widenScalarToNextPow2(0) 868 .maxScalar(0, S32) 869 .scalarize(0) 870 .lower(); 871 872 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 873 .legalFor({S32}) 874 .minScalar(0, S32) 875 .widenScalarToNextPow2(0) 876 .scalarize(0) 877 .lower(); 878 } 879 880 getActionDefinitionsBuilder(G_INTTOPTR) 881 // List the common cases 882 .legalForCartesianProduct(AddrSpaces64, {S64}) 883 .legalForCartesianProduct(AddrSpaces32, {S32}) 884 .scalarize(0) 885 // Accept any address space as long as the size matches 886 .legalIf(sameSize(0, 1)) 887 .widenScalarIf(smallerThan(1, 0), 888 [](const LegalityQuery &Query) { 889 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 890 }) 891 .narrowScalarIf(largerThan(1, 0), 892 [](const LegalityQuery &Query) { 893 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 894 }); 895 896 getActionDefinitionsBuilder(G_PTRTOINT) 897 // List the common cases 898 .legalForCartesianProduct(AddrSpaces64, {S64}) 899 .legalForCartesianProduct(AddrSpaces32, {S32}) 900 .scalarize(0) 901 // Accept any address space as long as the size matches 902 .legalIf(sameSize(0, 1)) 903 .widenScalarIf(smallerThan(0, 1), 904 [](const LegalityQuery &Query) { 905 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 906 }) 907 .narrowScalarIf( 908 largerThan(0, 1), 909 [](const LegalityQuery &Query) { 910 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 911 }); 912 913 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 914 .scalarize(0) 915 .custom(); 916 917 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 918 bool IsLoad) -> bool { 919 const LLT DstTy = Query.Types[0]; 920 921 // Split vector extloads. 922 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 923 unsigned Align = Query.MMODescrs[0].AlignInBits; 924 925 if (MemSize < DstTy.getSizeInBits()) 926 MemSize = std::max(MemSize, Align); 927 928 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 929 return true; 930 931 const LLT PtrTy = Query.Types[1]; 932 unsigned AS = PtrTy.getAddressSpace(); 933 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 934 return true; 935 936 // Catch weird sized loads that don't evenly divide into the access sizes 937 // TODO: May be able to widen depending on alignment etc. 938 unsigned NumRegs = (MemSize + 31) / 32; 939 if (NumRegs == 3) { 940 if (!ST.hasDwordx3LoadStores()) 941 return true; 942 } else { 943 // If the alignment allows, these should have been widened. 
944 if (!isPowerOf2_32(NumRegs)) 945 return true; 946 } 947 948 if (Align < MemSize) { 949 const SITargetLowering *TLI = ST.getTargetLowering(); 950 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 951 } 952 953 return false; 954 }; 955 956 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 957 unsigned Opc) -> bool { 958 unsigned Size = Query.Types[0].getSizeInBits(); 959 if (isPowerOf2_32(Size)) 960 return false; 961 962 if (Size == 96 && ST.hasDwordx3LoadStores()) 963 return false; 964 965 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 966 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 967 return false; 968 969 unsigned Align = Query.MMODescrs[0].AlignInBits; 970 unsigned RoundedSize = NextPowerOf2(Size); 971 return (Align >= RoundedSize); 972 }; 973 974 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 975 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 976 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 977 978 // TODO: Refine based on subtargets which support unaligned access or 128-bit 979 // LDS 980 // TODO: Unsupported flat for SI. 981 982 for (unsigned Op : {G_LOAD, G_STORE}) { 983 const bool IsStore = Op == G_STORE; 984 985 auto &Actions = getActionDefinitionsBuilder(Op); 986 // Explicitly list some common cases. 987 // TODO: Does this help compile time at all? 988 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 989 {V2S32, GlobalPtr, 64, GlobalAlign32}, 990 {V4S32, GlobalPtr, 128, GlobalAlign32}, 991 {S64, GlobalPtr, 64, GlobalAlign32}, 992 {V2S64, GlobalPtr, 128, GlobalAlign32}, 993 {V2S16, GlobalPtr, 32, GlobalAlign32}, 994 {S32, GlobalPtr, 8, GlobalAlign8}, 995 {S32, GlobalPtr, 16, GlobalAlign16}, 996 997 {S32, LocalPtr, 32, 32}, 998 {S64, LocalPtr, 64, 32}, 999 {V2S32, LocalPtr, 64, 32}, 1000 {S32, LocalPtr, 8, 8}, 1001 {S32, LocalPtr, 16, 16}, 1002 {V2S16, LocalPtr, 32, 32}, 1003 1004 {S32, PrivatePtr, 32, 32}, 1005 {S32, PrivatePtr, 8, 8}, 1006 {S32, PrivatePtr, 16, 16}, 1007 {V2S16, PrivatePtr, 32, 32}, 1008 1009 {S32, ConstantPtr, 32, GlobalAlign32}, 1010 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1011 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1012 {S64, ConstantPtr, 64, GlobalAlign32}, 1013 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1014 Actions.legalIf( 1015 [=](const LegalityQuery &Query) -> bool { 1016 return isLoadStoreLegal(ST, Query, Op); 1017 }); 1018 1019 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1020 // 64-bits. 1021 // 1022 // TODO: Should generalize bitcast action into coerce, which will also cover 1023 // inserting addrspacecasts. 1024 Actions.customIf(typeIs(1, Constant32Ptr)); 1025 1026 // Turn any illegal element vectors into something easier to deal 1027 // with. These will ultimately produce 32-bit scalar shifts to extract the 1028 // parts anyway. 1029 // 1030 // For odd 16-bit element vectors, prefer to split those into pieces with 1031 // 16-bit vector parts. 
1032 Actions.bitcastIf( 1033 [=](const LegalityQuery &Query) -> bool { 1034 const LLT Ty = Query.Types[0]; 1035 const unsigned Size = Ty.getSizeInBits(); 1036 1037 if (Size != Query.MMODescrs[0].SizeInBits) 1038 return Size <= 32 && Ty.isVector(); 1039 1040 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 1041 return true; 1042 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 1043 !isRegisterVectorElementType(Ty.getElementType()); 1044 }, bitcastToRegisterType(0)); 1045 1046 Actions 1047 .customIf(typeIs(1, Constant32Ptr)) 1048 // Widen suitably aligned loads by loading extra elements. 1049 .moreElementsIf([=](const LegalityQuery &Query) { 1050 const LLT Ty = Query.Types[0]; 1051 return Op == G_LOAD && Ty.isVector() && 1052 shouldWidenLoadResult(Query, Op); 1053 }, moreElementsToNextPow2(0)) 1054 .widenScalarIf([=](const LegalityQuery &Query) { 1055 const LLT Ty = Query.Types[0]; 1056 return Op == G_LOAD && !Ty.isVector() && 1057 shouldWidenLoadResult(Query, Op); 1058 }, widenScalarOrEltToNextPow2(0)) 1059 .narrowScalarIf( 1060 [=](const LegalityQuery &Query) -> bool { 1061 return !Query.Types[0].isVector() && 1062 needToSplitMemOp(Query, Op == G_LOAD); 1063 }, 1064 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1065 const LLT DstTy = Query.Types[0]; 1066 const LLT PtrTy = Query.Types[1]; 1067 1068 const unsigned DstSize = DstTy.getSizeInBits(); 1069 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1070 1071 // Split extloads. 1072 if (DstSize > MemSize) 1073 return std::make_pair(0, LLT::scalar(MemSize)); 1074 1075 if (!isPowerOf2_32(DstSize)) { 1076 // We're probably decomposing an odd sized store. Try to split 1077 // to the widest type. TODO: Account for alignment. As-is it 1078 // should be OK, since the new parts will be further legalized. 1079 unsigned FloorSize = PowerOf2Floor(DstSize); 1080 return std::make_pair(0, LLT::scalar(FloorSize)); 1081 } 1082 1083 if (DstSize > 32 && (DstSize % 32 != 0)) { 1084 // FIXME: Need a way to specify non-extload of larger size if 1085 // suitably aligned. 1086 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1087 } 1088 1089 unsigned MaxSize = maxSizeForAddrSpace(ST, 1090 PtrTy.getAddressSpace(), 1091 Op == G_LOAD); 1092 if (MemSize > MaxSize) 1093 return std::make_pair(0, LLT::scalar(MaxSize)); 1094 1095 unsigned Align = Query.MMODescrs[0].AlignInBits; 1096 return std::make_pair(0, LLT::scalar(Align)); 1097 }) 1098 .fewerElementsIf( 1099 [=](const LegalityQuery &Query) -> bool { 1100 return Query.Types[0].isVector() && 1101 needToSplitMemOp(Query, Op == G_LOAD); 1102 }, 1103 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1104 const LLT DstTy = Query.Types[0]; 1105 const LLT PtrTy = Query.Types[1]; 1106 1107 LLT EltTy = DstTy.getElementType(); 1108 unsigned MaxSize = maxSizeForAddrSpace(ST, 1109 PtrTy.getAddressSpace(), 1110 Op == G_LOAD); 1111 1112 // FIXME: Handle widened to power of 2 results better. This ends 1113 // up scalarizing. 1114 // FIXME: 3 element stores scalarized on SI 1115 1116 // Split if it's too large for the address space. 
1117 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1118 unsigned NumElts = DstTy.getNumElements(); 1119 unsigned EltSize = EltTy.getSizeInBits(); 1120 1121 if (MaxSize % EltSize == 0) { 1122 return std::make_pair( 1123 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1124 } 1125 1126 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1127 1128 // FIXME: Refine when odd breakdowns handled 1129 // The scalars will need to be re-legalized. 1130 if (NumPieces == 1 || NumPieces >= NumElts || 1131 NumElts % NumPieces != 0) 1132 return std::make_pair(0, EltTy); 1133 1134 return std::make_pair(0, 1135 LLT::vector(NumElts / NumPieces, EltTy)); 1136 } 1137 1138 // FIXME: We could probably handle weird extending loads better. 1139 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1140 if (DstTy.getSizeInBits() > MemSize) 1141 return std::make_pair(0, EltTy); 1142 1143 unsigned EltSize = EltTy.getSizeInBits(); 1144 unsigned DstSize = DstTy.getSizeInBits(); 1145 if (!isPowerOf2_32(DstSize)) { 1146 // We're probably decomposing an odd sized store. Try to split 1147 // to the widest type. TODO: Account for alignment. As-is it 1148 // should be OK, since the new parts will be further legalized. 1149 unsigned FloorSize = PowerOf2Floor(DstSize); 1150 return std::make_pair( 1151 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1152 } 1153 1154 // Need to split because of alignment. 1155 unsigned Align = Query.MMODescrs[0].AlignInBits; 1156 if (EltSize > Align && 1157 (EltSize / Align < DstTy.getNumElements())) { 1158 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1159 } 1160 1161 // May need relegalization for the scalars. 1162 return std::make_pair(0, EltTy); 1163 }) 1164 .minScalar(0, S32); 1165 1166 if (IsStore) 1167 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1168 1169 // TODO: Need a bitcast lower option? 
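    // As a final cleanup, the chain below widens any remaining non-power-of-two
    // scalar result to the next power of two, and pads vectors narrower than 32
    // bits with extra elements until they occupy a full 32-bit slot.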
1170 Actions 1171 .widenScalarToNextPow2(0) 1172 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1173 } 1174 1175 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1176 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1177 {S32, GlobalPtr, 16, 2 * 8}, 1178 {S32, LocalPtr, 8, 8}, 1179 {S32, LocalPtr, 16, 16}, 1180 {S32, PrivatePtr, 8, 8}, 1181 {S32, PrivatePtr, 16, 16}, 1182 {S32, ConstantPtr, 8, 8}, 1183 {S32, ConstantPtr, 16, 2 * 8}}); 1184 if (ST.hasFlatAddressSpace()) { 1185 ExtLoads.legalForTypesWithMemDesc( 1186 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1187 } 1188 1189 ExtLoads.clampScalar(0, S32, S32) 1190 .widenScalarToNextPow2(0) 1191 .unsupportedIfMemSizeNotPow2() 1192 .lower(); 1193 1194 auto &Atomics = getActionDefinitionsBuilder( 1195 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1196 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1197 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1198 G_ATOMICRMW_UMIN}) 1199 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1200 {S64, GlobalPtr}, {S64, LocalPtr}, 1201 {S32, RegionPtr}, {S64, RegionPtr}}); 1202 if (ST.hasFlatAddressSpace()) { 1203 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1204 } 1205 1206 if (ST.hasLDSFPAtomics()) { 1207 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1208 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1209 } 1210 1211 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1212 // demarshalling 1213 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1214 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1215 {S32, FlatPtr}, {S64, FlatPtr}}) 1216 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1217 {S32, RegionPtr}, {S64, RegionPtr}}); 1218 // TODO: Pointer types, any 32-bit or 64-bit vector 1219 1220 // Condition should be s32 for scalar, s1 for vector. 1221 getActionDefinitionsBuilder(G_SELECT) 1222 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1223 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1224 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1225 .clampScalar(0, S16, S64) 1226 .scalarize(1) 1227 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1228 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1229 .clampMaxNumElements(0, S32, 2) 1230 .clampMaxNumElements(0, LocalPtr, 2) 1231 .clampMaxNumElements(0, PrivatePtr, 2) 1232 .scalarize(0) 1233 .widenScalarToNextPow2(0) 1234 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1235 1236 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1237 // be more flexible with the shift amount type. 1238 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1239 .legalFor({{S32, S32}, {S64, S32}}); 1240 if (ST.has16BitInsts()) { 1241 if (ST.hasVOP3PInsts()) { 1242 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1243 .clampMaxNumElements(0, S16, 2); 1244 } else 1245 Shifts.legalFor({{S16, S16}}); 1246 1247 // TODO: Support 16-bit shift amounts for all types 1248 Shifts.widenScalarIf( 1249 [=](const LegalityQuery &Query) { 1250 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1251 // 32-bit amount. 
1252 const LLT ValTy = Query.Types[0]; 1253 const LLT AmountTy = Query.Types[1]; 1254 return ValTy.getSizeInBits() <= 16 && 1255 AmountTy.getSizeInBits() < 16; 1256 }, changeTo(1, S16)); 1257 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1258 Shifts.clampScalar(1, S32, S32); 1259 Shifts.clampScalar(0, S16, S64); 1260 Shifts.widenScalarToNextPow2(0, 16); 1261 } else { 1262 // Make sure we legalize the shift amount type first, as the general 1263 // expansion for the shifted type will produce much worse code if it hasn't 1264 // been truncated already. 1265 Shifts.clampScalar(1, S32, S32); 1266 Shifts.clampScalar(0, S32, S64); 1267 Shifts.widenScalarToNextPow2(0, 32); 1268 } 1269 Shifts.scalarize(0); 1270 1271 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1272 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1273 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1274 unsigned IdxTypeIdx = 2; 1275 1276 getActionDefinitionsBuilder(Op) 1277 .customIf([=](const LegalityQuery &Query) { 1278 const LLT EltTy = Query.Types[EltTypeIdx]; 1279 const LLT VecTy = Query.Types[VecTypeIdx]; 1280 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1281 return (EltTy.getSizeInBits() == 16 || 1282 EltTy.getSizeInBits() % 32 == 0) && 1283 VecTy.getSizeInBits() % 32 == 0 && 1284 VecTy.getSizeInBits() <= MaxRegisterSize && 1285 IdxTy.getSizeInBits() == 32; 1286 }) 1287 .clampScalar(EltTypeIdx, S32, S64) 1288 .clampScalar(VecTypeIdx, S32, S64) 1289 .clampScalar(IdxTypeIdx, S32, S32); 1290 } 1291 1292 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1293 .unsupportedIf([=](const LegalityQuery &Query) { 1294 const LLT &EltTy = Query.Types[1].getElementType(); 1295 return Query.Types[0] != EltTy; 1296 }); 1297 1298 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1299 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1300 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1301 1302 // FIXME: Doesn't handle extract of illegal sizes. 1303 getActionDefinitionsBuilder(Op) 1304 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1305 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1478 SextInReg.lowerFor({{S32}, {S64}}); 1479 } 1480 1481 SextInReg 1482 .scalarize(0) 1483 .clampScalar(0, S32, S64) 1484 .lower(); 1485 1486 getActionDefinitionsBuilder(G_FSHR) 1487 .legalFor({{S32, S32}}) 1488 .scalarize(0) 1489 .lower(); 1490 1491 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1492 .legalFor({S64}); 1493 1494 getActionDefinitionsBuilder({ 1495 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1496 G_FCOPYSIGN, 1497 1498 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1499 G_ATOMICRMW_NAND, 1500 G_ATOMICRMW_FSUB, 1501 G_READ_REGISTER, 1502 G_WRITE_REGISTER, 1503 1504 G_SADDO, G_SSUBO, 1505 1506 // TODO: Implement 1507 G_FMINIMUM, G_FMAXIMUM, 1508 G_FSHL 1509 }).lower(); 1510 1511 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1512 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1513 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1514 .unsupported(); 1515 1516 computeTables(); 1517 verify(*ST.getInstrInfo()); 1518 } 1519 1520 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1521 MachineInstr &MI) const { 1522 MachineIRBuilder &B = Helper.MIRBuilder; 1523 MachineRegisterInfo &MRI = *B.getMRI(); 1524 GISelChangeObserver &Observer = Helper.Observer; 1525 1526 switch (MI.getOpcode()) { 1527 case TargetOpcode::G_ADDRSPACE_CAST: 1528 return legalizeAddrSpaceCast(MI, MRI, B); 1529 case TargetOpcode::G_FRINT: 1530 return legalizeFrint(MI, MRI, B); 1531 case TargetOpcode::G_FCEIL: 1532 return legalizeFceil(MI, MRI, B); 1533 case TargetOpcode::G_INTRINSIC_TRUNC: 1534 return legalizeIntrinsicTrunc(MI, MRI, B); 1535 case TargetOpcode::G_SITOFP: 1536 return legalizeITOFP(MI, MRI, B, true); 1537 case TargetOpcode::G_UITOFP: 1538 return legalizeITOFP(MI, MRI, B, false); 1539 case TargetOpcode::G_FPTOSI: 1540 return legalizeFPTOI(MI, MRI, B, true); 1541 case TargetOpcode::G_FPTOUI: 1542 return legalizeFPTOI(MI, MRI, B, false); 1543 case TargetOpcode::G_FMINNUM: 1544 case TargetOpcode::G_FMAXNUM: 1545 case TargetOpcode::G_FMINNUM_IEEE: 1546 case TargetOpcode::G_FMAXNUM_IEEE: 1547 return legalizeMinNumMaxNum(Helper, MI); 1548 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1549 return legalizeExtractVectorElt(MI, MRI, B); 1550 case TargetOpcode::G_INSERT_VECTOR_ELT: 1551 return legalizeInsertVectorElt(MI, MRI, B); 1552 case TargetOpcode::G_SHUFFLE_VECTOR: 1553 return legalizeShuffleVector(MI, MRI, B); 1554 case TargetOpcode::G_FSIN: 1555 case TargetOpcode::G_FCOS: 1556 return legalizeSinCos(MI, MRI, B); 1557 case TargetOpcode::G_GLOBAL_VALUE: 1558 return legalizeGlobalValue(MI, MRI, B); 1559 case TargetOpcode::G_LOAD: 1560 return legalizeLoad(MI, MRI, B, Observer); 1561 case TargetOpcode::G_FMAD: 1562 return legalizeFMad(MI, MRI, B); 1563 case TargetOpcode::G_FDIV: 1564 return legalizeFDIV(MI, MRI, B); 1565 case TargetOpcode::G_UDIV: 1566 case TargetOpcode::G_UREM: 1567 return legalizeUDIV_UREM(MI, MRI, B); 1568 case TargetOpcode::G_SDIV: 1569 case TargetOpcode::G_SREM: 1570 return legalizeSDIV_SREM(MI, MRI, B); 1571 case TargetOpcode::G_ATOMIC_CMPXCHG: 1572 return legalizeAtomicCmpXChg(MI, MRI, B); 1573 case TargetOpcode::G_FLOG: 1574 return legalizeFlog(MI, B, numbers::ln2f); 1575 case TargetOpcode::G_FLOG10: 1576 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1577 case TargetOpcode::G_FEXP: 1578 return legalizeFExp(MI, B); 1579 case TargetOpcode::G_FPOW: 1580 return legalizeFPow(MI, B); 1581 case TargetOpcode::G_FFLOOR: 1582 return legalizeFFloor(MI, MRI, B); 1583 case TargetOpcode::G_BUILD_VECTOR: 1584 return legalizeBuildVector(MI, MRI, B); 1585 default: 1586 
return false; 1587 } 1588 1589 llvm_unreachable("expected switch to return"); 1590 } 1591 1592 Register AMDGPULegalizerInfo::getSegmentAperture( 1593 unsigned AS, 1594 MachineRegisterInfo &MRI, 1595 MachineIRBuilder &B) const { 1596 MachineFunction &MF = B.getMF(); 1597 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1598 const LLT S32 = LLT::scalar(32); 1599 1600 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1601 1602 if (ST.hasApertureRegs()) { 1603 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1604 // getreg. 1605 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1606 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1607 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1608 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1609 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1610 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1611 unsigned Encoding = 1612 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1613 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1614 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1615 1616 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1617 1618 B.buildInstr(AMDGPU::S_GETREG_B32) 1619 .addDef(GetReg) 1620 .addImm(Encoding); 1621 MRI.setType(GetReg, S32); 1622 1623 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1624 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1625 } 1626 1627 Register QueuePtr = MRI.createGenericVirtualRegister( 1628 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1629 1630 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1631 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1632 return Register(); 1633 1634 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1635 // private_segment_aperture_base_hi. 1636 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1637 1638 // TODO: can we be smarter about machine pointer info? 1639 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1640 MachineMemOperand *MMO = MF.getMachineMemOperand( 1641 PtrInfo, 1642 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1643 MachineMemOperand::MOInvariant, 1644 4, commonAlignment(Align(64), StructOffset)); 1645 1646 Register LoadAddr; 1647 1648 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1649 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1650 } 1651 1652 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1653 MachineInstr &MI, MachineRegisterInfo &MRI, 1654 MachineIRBuilder &B) const { 1655 MachineFunction &MF = B.getMF(); 1656 1657 const LLT S32 = LLT::scalar(32); 1658 Register Dst = MI.getOperand(0).getReg(); 1659 Register Src = MI.getOperand(1).getReg(); 1660 1661 LLT DstTy = MRI.getType(Dst); 1662 LLT SrcTy = MRI.getType(Src); 1663 unsigned DestAS = DstTy.getAddressSpace(); 1664 unsigned SrcAS = SrcTy.getAddressSpace(); 1665 1666 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1667 // vector element. 1668 assert(!DstTy.isVector()); 1669 1670 const AMDGPUTargetMachine &TM 1671 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1672 1673 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1674 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1675 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1676 return true; 1677 } 1678 1679 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1680 // Truncate. 
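    // A cast into the 32-bit constant address space keeps only the low 32 bits
    // of the 64-bit source pointer.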
1681 B.buildExtract(Dst, Src, 0); 1682 MI.eraseFromParent(); 1683 return true; 1684 } 1685 1686 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1687 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1688 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1689 1690 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1691 // another. Merge operands are required to be the same type, but creating an 1692 // extra ptrtoint would be kind of pointless. 1693 auto HighAddr = B.buildConstant( 1694 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1695 B.buildMerge(Dst, {Src, HighAddr}); 1696 MI.eraseFromParent(); 1697 return true; 1698 } 1699 1700 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1701 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1702 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1703 unsigned NullVal = TM.getNullPointerValue(DestAS); 1704 1705 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1706 auto FlatNull = B.buildConstant(SrcTy, 0); 1707 1708 // Extract low 32-bits of the pointer. 1709 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1710 1711 auto CmpRes = 1712 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1713 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1714 1715 MI.eraseFromParent(); 1716 return true; 1717 } 1718 1719 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1720 return false; 1721 1722 if (!ST.hasFlatAddressSpace()) 1723 return false; 1724 1725 auto SegmentNull = 1726 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1727 auto FlatNull = 1728 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1729 1730 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1731 if (!ApertureReg.isValid()) 1732 return false; 1733 1734 auto CmpRes = 1735 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1736 1737 // Coerce the type of the low half of the result so we can use merge_values. 1738 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1739 1740 // TODO: Should we allow mismatched types but matching sizes in merges to 1741 // avoid the ptrtoint? 1742 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1743 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1744 1745 MI.eraseFromParent(); 1746 return true; 1747 } 1748 1749 bool AMDGPULegalizerInfo::legalizeFrint( 1750 MachineInstr &MI, MachineRegisterInfo &MRI, 1751 MachineIRBuilder &B) const { 1752 Register Src = MI.getOperand(1).getReg(); 1753 LLT Ty = MRI.getType(Src); 1754 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1755 1756 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1757 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1758 1759 auto C1 = B.buildFConstant(Ty, C1Val); 1760 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1761 1762 // TODO: Should this propagate fast-math-flags? 
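// The add/sub below is the classic 2^52 rounding trick: doubles with a
// magnitude of at least 2^52 have no fractional bits, so adding and then
// subtracting a sign-matched 2^52 rounds the value to an integer. C2 (just
// below 2^52) filters out inputs that are already too large for the trick and
// therefore already integral.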
1763 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1764 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1765 1766 auto C2 = B.buildFConstant(Ty, C2Val); 1767 auto Fabs = B.buildFAbs(Ty, Src); 1768 1769 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1770 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1771 MI.eraseFromParent(); 1772 return true; 1773 } 1774 1775 bool AMDGPULegalizerInfo::legalizeFceil( 1776 MachineInstr &MI, MachineRegisterInfo &MRI, 1777 MachineIRBuilder &B) const { 1778 1779 const LLT S1 = LLT::scalar(1); 1780 const LLT S64 = LLT::scalar(64); 1781 1782 Register Src = MI.getOperand(1).getReg(); 1783 assert(MRI.getType(Src) == S64); 1784 1785 // result = trunc(src) 1786 // if (src > 0.0 && src != result) 1787 // result += 1.0 1788 1789 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1790 1791 const auto Zero = B.buildFConstant(S64, 0.0); 1792 const auto One = B.buildFConstant(S64, 1.0); 1793 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1794 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1795 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1796 auto Add = B.buildSelect(S64, And, One, Zero); 1797 1798 // TODO: Should this propagate fast-math-flags? 1799 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1800 return true; 1801 } 1802 1803 static MachineInstrBuilder extractF64Exponent(Register Hi, 1804 MachineIRBuilder &B) { 1805 const unsigned FractBits = 52; 1806 const unsigned ExpBits = 11; 1807 LLT S32 = LLT::scalar(32); 1808 1809 auto Const0 = B.buildConstant(S32, FractBits - 32); 1810 auto Const1 = B.buildConstant(S32, ExpBits); 1811 1812 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1813 .addUse(Hi) 1814 .addUse(Const0.getReg(0)) 1815 .addUse(Const1.getReg(0)); 1816 1817 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1818 } 1819 1820 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1821 MachineInstr &MI, MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 const LLT S1 = LLT::scalar(1); 1824 const LLT S32 = LLT::scalar(32); 1825 const LLT S64 = LLT::scalar(64); 1826 1827 Register Src = MI.getOperand(1).getReg(); 1828 assert(MRI.getType(Src) == S64); 1829 1830 // TODO: Should this use extract since the low half is unused? 1831 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1832 Register Hi = Unmerge.getReg(1); 1833 1834 // Extract the upper half, since this is where we will find the sign and 1835 // exponent. 1836 auto Exp = extractF64Exponent(Hi, B); 1837 1838 const unsigned FractBits = 52; 1839 1840 // Extract the sign bit. 1841 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1842 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1843 1844 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1845 1846 const auto Zero32 = B.buildConstant(S32, 0); 1847 1848 // Extend back to 64-bits. 
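// i.e. %SignBit64(s64) = G_MERGE_VALUES %Zero32(s32), %SignBit(s32), placing
// the extracted sign bit in the high half of a 64-bit zero.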
1849 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1850 1851 auto Shr = B.buildAShr(S64, FractMask, Exp); 1852 auto Not = B.buildNot(S64, Shr); 1853 auto Tmp0 = B.buildAnd(S64, Src, Not); 1854 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1855 1856 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1857 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1858 1859 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1860 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1861 MI.eraseFromParent(); 1862 return true; 1863 } 1864 1865 bool AMDGPULegalizerInfo::legalizeITOFP( 1866 MachineInstr &MI, MachineRegisterInfo &MRI, 1867 MachineIRBuilder &B, bool Signed) const { 1868 1869 Register Dst = MI.getOperand(0).getReg(); 1870 Register Src = MI.getOperand(1).getReg(); 1871 1872 const LLT S64 = LLT::scalar(64); 1873 const LLT S32 = LLT::scalar(32); 1874 1875 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1876 1877 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1878 1879 auto CvtHi = Signed ? 1880 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1881 B.buildUITOFP(S64, Unmerge.getReg(1)); 1882 1883 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1884 1885 auto ThirtyTwo = B.buildConstant(S32, 32); 1886 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1887 .addUse(CvtHi.getReg(0)) 1888 .addUse(ThirtyTwo.getReg(0)); 1889 1890 // TODO: Should this propagate fast-math-flags? 1891 B.buildFAdd(Dst, LdExp, CvtLo); 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 // TODO: Copied from DAG implementation. Verify logic and document how this 1897 // actually works. 1898 bool AMDGPULegalizerInfo::legalizeFPTOI( 1899 MachineInstr &MI, MachineRegisterInfo &MRI, 1900 MachineIRBuilder &B, bool Signed) const { 1901 1902 Register Dst = MI.getOperand(0).getReg(); 1903 Register Src = MI.getOperand(1).getReg(); 1904 1905 const LLT S64 = LLT::scalar(64); 1906 const LLT S32 = LLT::scalar(32); 1907 1908 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1909 1910 unsigned Flags = MI.getFlags(); 1911 1912 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1913 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1914 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1915 1916 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1917 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1918 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1919 1920 auto Hi = Signed ? 
1921 B.buildFPTOSI(S32, FloorMul) : 1922 B.buildFPTOUI(S32, FloorMul); 1923 auto Lo = B.buildFPTOUI(S32, Fma); 1924 1925 B.buildMerge(Dst, { Lo, Hi }); 1926 MI.eraseFromParent(); 1927 1928 return true; 1929 } 1930 1931 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1932 MachineInstr &MI) const { 1933 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1934 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1935 1936 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1937 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1938 1939 // With ieee_mode disabled, the instructions have the correct behavior 1940 // already for G_FMINNUM/G_FMAXNUM 1941 if (!MFI->getMode().IEEE) 1942 return !IsIEEEOp; 1943 1944 if (IsIEEEOp) 1945 return true; 1946 1947 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1948 } 1949 1950 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1951 MachineInstr &MI, MachineRegisterInfo &MRI, 1952 MachineIRBuilder &B) const { 1953 // TODO: Should move some of this into LegalizerHelper. 1954 1955 // TODO: Promote dynamic indexing of s16 to s32 1956 1957 // FIXME: Artifact combiner probably should have replaced the truncated 1958 // constant before this, so we shouldn't need 1959 // getConstantVRegValWithLookThrough. 1960 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1961 MI.getOperand(2).getReg(), MRI); 1962 if (!IdxVal) // Dynamic case will be selected to register indexing. 1963 return true; 1964 1965 Register Dst = MI.getOperand(0).getReg(); 1966 Register Vec = MI.getOperand(1).getReg(); 1967 1968 LLT VecTy = MRI.getType(Vec); 1969 LLT EltTy = VecTy.getElementType(); 1970 assert(EltTy == MRI.getType(Dst)); 1971 1972 if (IdxVal->Value < VecTy.getNumElements()) 1973 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1974 else 1975 B.buildUndef(Dst); 1976 1977 MI.eraseFromParent(); 1978 return true; 1979 } 1980 1981 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1982 MachineInstr &MI, MachineRegisterInfo &MRI, 1983 MachineIRBuilder &B) const { 1984 // TODO: Should move some of this into LegalizerHelper. 1985 1986 // TODO: Promote dynamic indexing of s16 to s32 1987 1988 // FIXME: Artifact combiner probably should have replaced the truncated 1989 // constant before this, so we shouldn't need 1990 // getConstantVRegValWithLookThrough. 1991 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1992 MI.getOperand(3).getReg(), MRI); 1993 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1994 return true; 1995 1996 Register Dst = MI.getOperand(0).getReg(); 1997 Register Vec = MI.getOperand(1).getReg(); 1998 Register Ins = MI.getOperand(2).getReg(); 1999 2000 LLT VecTy = MRI.getType(Vec); 2001 LLT EltTy = VecTy.getElementType(); 2002 assert(EltTy == MRI.getType(Ins)); 2003 2004 if (IdxVal->Value < VecTy.getNumElements()) 2005 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2006 else 2007 B.buildUndef(Dst); 2008 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 const LLT V2S16 = LLT::vector(2, 16); 2017 2018 Register Dst = MI.getOperand(0).getReg(); 2019 Register Src0 = MI.getOperand(1).getReg(); 2020 LLT DstTy = MRI.getType(Dst); 2021 LLT SrcTy = MRI.getType(Src0); 2022 2023 if (SrcTy == V2S16 && DstTy == V2S16 && 2024 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2025 return true; 2026 2027 MachineIRBuilder HelperBuilder(MI); 2028 GISelObserverWrapper DummyObserver; 2029 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2030 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2031 } 2032 2033 bool AMDGPULegalizerInfo::legalizeSinCos( 2034 MachineInstr &MI, MachineRegisterInfo &MRI, 2035 MachineIRBuilder &B) const { 2036 2037 Register DstReg = MI.getOperand(0).getReg(); 2038 Register SrcReg = MI.getOperand(1).getReg(); 2039 LLT Ty = MRI.getType(DstReg); 2040 unsigned Flags = MI.getFlags(); 2041 2042 Register TrigVal; 2043 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2044 if (ST.hasTrigReducedRange()) { 2045 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2046 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2047 .addUse(MulVal.getReg(0)) 2048 .setMIFlags(Flags).getReg(0); 2049 } else 2050 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2051 2052 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2053 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2054 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2055 .addUse(TrigVal) 2056 .setMIFlags(Flags); 2057 MI.eraseFromParent(); 2058 return true; 2059 } 2060 2061 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2062 MachineIRBuilder &B, 2063 const GlobalValue *GV, 2064 int64_t Offset, 2065 unsigned GAFlags) const { 2066 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2067 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2068 // to the following code sequence: 2069 // 2070 // For constant address space: 2071 // s_getpc_b64 s[0:1] 2072 // s_add_u32 s0, s0, $symbol 2073 // s_addc_u32 s1, s1, 0 2074 // 2075 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2076 // a fixup or relocation is emitted to replace $symbol with a literal 2077 // constant, which is a pc-relative offset from the encoding of the $symbol 2078 // operand to the global variable. 
2079 // 2080 // For global address space: 2081 // s_getpc_b64 s[0:1] 2082 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2083 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2084 // 2085 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2086 // fixups or relocations are emitted to replace $symbol@*@lo and 2087 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2088 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2089 // operand to the global variable. 2090 // 2091 // What we want here is an offset from the value returned by s_getpc 2092 // (which is the address of the s_add_u32 instruction) to the global 2093 // variable, but since the encoding of $symbol starts 4 bytes after the start 2094 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2095 // small. This requires us to add 4 to the global variable offset in order to 2096 // compute the correct address. 2097 2098 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2099 2100 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2101 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2102 2103 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2104 .addDef(PCReg); 2105 2106 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2107 if (GAFlags == SIInstrInfo::MO_NONE) 2108 MIB.addImm(0); 2109 else 2110 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2111 2112 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2113 2114 if (PtrTy.getSizeInBits() == 32) 2115 B.buildExtract(DstReg, PCReg, 0); 2116 return true; 2117 } 2118 2119 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2120 MachineInstr &MI, MachineRegisterInfo &MRI, 2121 MachineIRBuilder &B) const { 2122 Register DstReg = MI.getOperand(0).getReg(); 2123 LLT Ty = MRI.getType(DstReg); 2124 unsigned AS = Ty.getAddressSpace(); 2125 2126 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2127 MachineFunction &MF = B.getMF(); 2128 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2129 2130 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2131 if (!MFI->isEntryFunction()) { 2132 const Function &Fn = MF.getFunction(); 2133 DiagnosticInfoUnsupported BadLDSDecl( 2134 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2135 DS_Warning); 2136 Fn.getContext().diagnose(BadLDSDecl); 2137 2138 // We currently don't have a way to correctly allocate LDS objects that 2139 // aren't directly associated with a kernel. We do force inlining of 2140 // functions that use local objects. However, if these dead functions are 2141 // not eliminated, we don't want a compile time error. Just emit a warning 2142 // and a trap, since there should be no callable path here. 2143 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2144 B.buildUndef(DstReg); 2145 MI.eraseFromParent(); 2146 return true; 2147 } 2148 2149 // TODO: We could emit code to handle the initialization somewhere. 
2150 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2151 const SITargetLowering *TLI = ST.getTargetLowering();
2152 if (!TLI->shouldUseLDSConstAddress(GV)) {
2153 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2154 return true; // Leave in place.
2155 }
2156
2157 B.buildConstant(
2158 DstReg,
2159 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2160 MI.eraseFromParent();
2161 return true;
2162 }
2163
2164 const Function &Fn = MF.getFunction();
2165 DiagnosticInfoUnsupported BadInit(
2166 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2167 Fn.getContext().diagnose(BadInit);
2168 return true;
2169 }
2170
2171 const SITargetLowering *TLI = ST.getTargetLowering();
2172
2173 if (TLI->shouldEmitFixup(GV)) {
2174 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2175 MI.eraseFromParent();
2176 return true;
2177 }
2178
2179 if (TLI->shouldEmitPCReloc(GV)) {
2180 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2181 MI.eraseFromParent();
2182 return true;
2183 }
2184
2185 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2186 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2187
2188 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2189 MachinePointerInfo::getGOT(MF),
2190 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2191 MachineMemOperand::MOInvariant,
2192 8 /*Size*/, Align(8));
2193
2194 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2195
2196 if (Ty.getSizeInBits() == 32) {
2197 // Truncate if this is a 32-bit constant address.
2198 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2199 B.buildExtract(DstReg, Load, 0);
2200 } else
2201 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2202
2203 MI.eraseFromParent();
2204 return true;
2205 }
2206
2207 bool AMDGPULegalizerInfo::legalizeLoad(
2208 MachineInstr &MI, MachineRegisterInfo &MRI,
2209 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2210 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2211 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2212 Observer.changingInstr(MI);
2213 MI.getOperand(1).setReg(Cast.getReg(0));
2214 Observer.changedInstr(MI);
2215 return true;
2216 }
2217
2218 bool AMDGPULegalizerInfo::legalizeFMad(
2219 MachineInstr &MI, MachineRegisterInfo &MRI,
2220 MachineIRBuilder &B) const {
2221 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2222 assert(Ty.isScalar());
2223
2224 MachineFunction &MF = B.getMF();
2225 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2226
2227 // TODO: Always legal with future ftz flag.
2228 // FIXME: Do we need just output?
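// G_FMAD is kept as-is only when denormals are flushed for its type; with
// denormals enabled it falls through to the generic mul + add expansion below.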
2229 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2230 return true; 2231 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2232 return true; 2233 2234 MachineIRBuilder HelperBuilder(MI); 2235 GISelObserverWrapper DummyObserver; 2236 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2237 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2241 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2242 Register DstReg = MI.getOperand(0).getReg(); 2243 Register PtrReg = MI.getOperand(1).getReg(); 2244 Register CmpVal = MI.getOperand(2).getReg(); 2245 Register NewVal = MI.getOperand(3).getReg(); 2246 2247 assert(SITargetLowering::isFlatGlobalAddrSpace( 2248 MRI.getType(PtrReg).getAddressSpace()) && 2249 "this should not have been custom lowered"); 2250 2251 LLT ValTy = MRI.getType(CmpVal); 2252 LLT VecTy = LLT::vector(2, ValTy); 2253 2254 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2255 2256 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2257 .addDef(DstReg) 2258 .addUse(PtrReg) 2259 .addUse(PackedVal) 2260 .setMemRefs(MI.memoperands()); 2261 2262 MI.eraseFromParent(); 2263 return true; 2264 } 2265 2266 bool AMDGPULegalizerInfo::legalizeFlog( 2267 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2268 Register Dst = MI.getOperand(0).getReg(); 2269 Register Src = MI.getOperand(1).getReg(); 2270 LLT Ty = B.getMRI()->getType(Dst); 2271 unsigned Flags = MI.getFlags(); 2272 2273 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2274 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2275 2276 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2277 MI.eraseFromParent(); 2278 return true; 2279 } 2280 2281 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2282 MachineIRBuilder &B) const { 2283 Register Dst = MI.getOperand(0).getReg(); 2284 Register Src = MI.getOperand(1).getReg(); 2285 unsigned Flags = MI.getFlags(); 2286 LLT Ty = B.getMRI()->getType(Dst); 2287 2288 auto K = B.buildFConstant(Ty, numbers::log2e); 2289 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2290 B.buildFExp2(Dst, Mul, Flags); 2291 MI.eraseFromParent(); 2292 return true; 2293 } 2294 2295 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2296 MachineIRBuilder &B) const { 2297 Register Dst = MI.getOperand(0).getReg(); 2298 Register Src0 = MI.getOperand(1).getReg(); 2299 Register Src1 = MI.getOperand(2).getReg(); 2300 unsigned Flags = MI.getFlags(); 2301 LLT Ty = B.getMRI()->getType(Dst); 2302 const LLT S16 = LLT::scalar(16); 2303 const LLT S32 = LLT::scalar(32); 2304 2305 if (Ty == S32) { 2306 auto Log = B.buildFLog2(S32, Src0, Flags); 2307 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2308 .addUse(Log.getReg(0)) 2309 .addUse(Src1) 2310 .setMIFlags(Flags); 2311 B.buildFExp2(Dst, Mul, Flags); 2312 } else if (Ty == S16) { 2313 // There's no f16 fmul_legacy, so we need to convert for it. 
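// i.e. for f16 the expansion is:
//   exp2(fptrunc(fmul_legacy(fpext(log2(x)), fpext(y))))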
2314 auto Log = B.buildFLog2(S16, Src0, Flags); 2315 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2316 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2317 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2318 .addUse(Ext0.getReg(0)) 2319 .addUse(Ext1.getReg(0)) 2320 .setMIFlags(Flags); 2321 2322 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2323 } else 2324 return false; 2325 2326 MI.eraseFromParent(); 2327 return true; 2328 } 2329 2330 // Find a source register, ignoring any possible source modifiers. 2331 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2332 Register ModSrc = OrigSrc; 2333 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2334 ModSrc = SrcFNeg->getOperand(1).getReg(); 2335 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2336 ModSrc = SrcFAbs->getOperand(1).getReg(); 2337 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2338 ModSrc = SrcFAbs->getOperand(1).getReg(); 2339 return ModSrc; 2340 } 2341 2342 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2343 MachineRegisterInfo &MRI, 2344 MachineIRBuilder &B) const { 2345 2346 const LLT S1 = LLT::scalar(1); 2347 const LLT S64 = LLT::scalar(64); 2348 Register Dst = MI.getOperand(0).getReg(); 2349 Register OrigSrc = MI.getOperand(1).getReg(); 2350 unsigned Flags = MI.getFlags(); 2351 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2352 "this should not have been custom lowered"); 2353 2354 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2355 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2356 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2357 // V_FRACT bug is: 2358 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2359 // 2360 // Convert floor(x) to (x - fract(x)) 2361 2362 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2363 .addUse(OrigSrc) 2364 .setMIFlags(Flags); 2365 2366 // Give source modifier matching some assistance before obscuring a foldable 2367 // pattern. 2368 2369 // TODO: We can avoid the neg on the fract? The input sign to fract 2370 // shouldn't matter? 2371 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2372 2373 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2374 2375 Register Min = MRI.createGenericVirtualRegister(S64); 2376 2377 // We don't need to concern ourselves with the snan handling difference, so 2378 // use the one which will directly select. 2379 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2380 if (MFI->getMode().IEEE) 2381 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2382 else 2383 B.buildFMinNum(Min, Fract, Const, Flags); 2384 2385 Register CorrectedFract = Min; 2386 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2387 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2388 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2389 } 2390 2391 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2392 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2393 2394 MI.eraseFromParent(); 2395 return true; 2396 } 2397 2398 // Turn an illegal packed v2s16 build vector into bit operations. 2399 // TODO: This should probably be a bitcast action in LegalizerHelper. 
2400 bool AMDGPULegalizerInfo::legalizeBuildVector( 2401 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2402 Register Dst = MI.getOperand(0).getReg(); 2403 const LLT S32 = LLT::scalar(32); 2404 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2405 2406 Register Src0 = MI.getOperand(1).getReg(); 2407 Register Src1 = MI.getOperand(2).getReg(); 2408 assert(MRI.getType(Src0) == LLT::scalar(16)); 2409 2410 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2411 B.buildBitcast(Dst, Merge); 2412 2413 MI.eraseFromParent(); 2414 return true; 2415 } 2416 2417 // Return the use branch instruction, otherwise null if the usage is invalid. 2418 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2419 MachineRegisterInfo &MRI, 2420 MachineInstr *&Br, 2421 MachineBasicBlock *&UncondBrTarget) { 2422 Register CondDef = MI.getOperand(0).getReg(); 2423 if (!MRI.hasOneNonDBGUse(CondDef)) 2424 return nullptr; 2425 2426 MachineBasicBlock *Parent = MI.getParent(); 2427 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2428 if (UseMI.getParent() != Parent || 2429 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2430 return nullptr; 2431 2432 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2433 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2434 if (Next == Parent->end()) { 2435 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2436 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2437 return nullptr; 2438 UncondBrTarget = &*NextMBB; 2439 } else { 2440 if (Next->getOpcode() != AMDGPU::G_BR) 2441 return nullptr; 2442 Br = &*Next; 2443 UncondBrTarget = Br->getOperand(0).getMBB(); 2444 } 2445 2446 return &UseMI; 2447 } 2448 2449 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2450 MachineRegisterInfo &MRI, 2451 Register LiveIn, 2452 Register PhyReg) const { 2453 assert(PhyReg.isPhysical() && "Physical register expected"); 2454 2455 // Insert the live-in copy, if required, by defining destination virtual 2456 // register. 2457 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2458 if (!MRI.getVRegDef(LiveIn)) {
2459 // FIXME: Should have scoped insert pt
2460 MachineBasicBlock &OrigInsBB = B.getMBB();
2461 auto OrigInsPt = B.getInsertPt();
2462
2463 MachineBasicBlock &EntryMBB = B.getMF().front();
2464 EntryMBB.addLiveIn(PhyReg);
2465 B.setInsertPt(EntryMBB, EntryMBB.begin());
2466 B.buildCopy(LiveIn, PhyReg);
2467
2468 B.setInsertPt(OrigInsBB, OrigInsPt);
2469 }
2470
2471 return LiveIn;
2472 }
2473
2474 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2475 MachineRegisterInfo &MRI,
2476 Register PhyReg, LLT Ty,
2477 bool InsertLiveInCopy) const {
2478 assert(PhyReg.isPhysical() && "Physical register expected");
2479
2480 // Get or create the virtual live-in register.
2481 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2482 if (!LiveIn) {
2483 LiveIn = MRI.createGenericVirtualRegister(Ty);
2484 MRI.addLiveIn(PhyReg, LiveIn);
2485 }
2486
2487 // When the copy actually required is from a virtual register to a physical
2488 // register (to be inserted later), there is no need to insert a live-in copy
2489 // from the physical register to the virtual register here.
2490 if (!InsertLiveInCopy)
2491 return LiveIn;
2492
2493 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2494 }
2495
2496 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2497 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2498 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2499 const ArgDescriptor *Arg;
2500 const TargetRegisterClass *RC;
2501 LLT ArgTy;
2502 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2503 if (!Arg) {
2504 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2505 return nullptr;
2506 }
2507 return Arg;
2508 }
2509
2510 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2511 const ArgDescriptor *Arg) const {
2512 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2513 return false; // TODO: Handle these
2514
2515 Register SrcReg = Arg->getRegister();
2516 assert(SrcReg.isPhysical() && "Physical register expected");
2517 assert(DstReg.isVirtual() && "Virtual register expected");
2518
2519 MachineRegisterInfo &MRI = *B.getMRI();
2520
2521 LLT Ty = MRI.getType(DstReg);
2522 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2523
2524 if (Arg->isMasked()) {
2525 // TODO: Should we try to emit this once in the entry block?
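// Decode a masked argument: shift the live-in value down to the mask's lowest
// set bit, then AND with the shifted mask.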
2526 const LLT S32 = LLT::scalar(32); 2527 const unsigned Mask = Arg->getMask(); 2528 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2529 2530 Register AndMaskSrc = LiveIn; 2531 2532 if (Shift != 0) { 2533 auto ShiftAmt = B.buildConstant(S32, Shift); 2534 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2535 } 2536 2537 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2538 } else { 2539 B.buildCopy(DstReg, LiveIn); 2540 } 2541 2542 return true; 2543 } 2544 2545 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2546 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2547 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2548 2549 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2550 if (!Arg) 2551 return false; 2552 2553 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2554 return false; 2555 2556 MI.eraseFromParent(); 2557 return true; 2558 } 2559 2560 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2561 MachineRegisterInfo &MRI, 2562 MachineIRBuilder &B) const { 2563 Register Dst = MI.getOperand(0).getReg(); 2564 LLT DstTy = MRI.getType(Dst); 2565 LLT S16 = LLT::scalar(16); 2566 LLT S32 = LLT::scalar(32); 2567 LLT S64 = LLT::scalar(64); 2568 2569 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2570 return true; 2571 2572 if (DstTy == S16) 2573 return legalizeFDIV16(MI, MRI, B); 2574 if (DstTy == S32) 2575 return legalizeFDIV32(MI, MRI, B); 2576 if (DstTy == S64) 2577 return legalizeFDIV64(MI, MRI, B); 2578 2579 return false; 2580 } 2581 2582 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2583 Register DstReg, 2584 Register X, 2585 Register Y, 2586 bool IsDiv) const { 2587 const LLT S1 = LLT::scalar(1); 2588 const LLT S32 = LLT::scalar(32); 2589 2590 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2591 // algorithm used here. 2592 2593 // Initial estimate of inv(y). 2594 auto FloatY = B.buildUITOFP(S32, Y); 2595 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2596 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2597 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2598 auto Z = B.buildFPTOUI(S32, ScaledY); 2599 2600 // One round of UNR. 2601 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2602 auto NegYZ = B.buildMul(S32, NegY, Z); 2603 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2604 2605 // Quotient/remainder estimate. 2606 auto Q = B.buildUMulH(S32, X, Z); 2607 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2608 2609 // First quotient/remainder refinement. 2610 auto One = B.buildConstant(S32, 1); 2611 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2612 if (IsDiv) 2613 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2614 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2615 2616 // Second quotient/remainder refinement. 
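// Same correction as the first refinement: if the remainder is still >= Y,
// bump the quotient and subtract Y once more (selects, no control flow).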
2617 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2618 if (IsDiv)
2619 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2620 else
2621 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2622 }
2623
2624 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2625 MachineRegisterInfo &MRI,
2626 MachineIRBuilder &B) const {
2627 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2628 Register DstReg = MI.getOperand(0).getReg();
2629 Register Num = MI.getOperand(1).getReg();
2630 Register Den = MI.getOperand(2).getReg();
2631 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2632 MI.eraseFromParent();
2633 return true;
2634 }
2635
2636 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2637 //
2638 // Return lo, hi of result
2639 //
2640 // %cvt.lo = G_UITOFP Val.lo
2641 // %cvt.hi = G_UITOFP Val.hi
2642 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2643 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2644 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2645 // %mul2 = G_FMUL %mul1, 2**(-32)
2646 // %trunc = G_INTRINSIC_TRUNC %mul2
2647 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2648 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2649 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2650 Register Val) {
2651 const LLT S32 = LLT::scalar(32);
2652 auto Unmerge = B.buildUnmerge(S32, Val);
2653
2654 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2655 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2656
2657 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2658 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2659
2660 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2661 auto Mul1 =
2662 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2663
2664 // 2**(-32)
2665 auto Mul2 =
2666 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2667 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2668
2669 // -(2**32)
2670 auto Mad2 = B.buildFMAD(S32, Trunc,
2671 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2672
2673 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2674 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2675
2676 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2677 }
2678
2679 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2680 Register DstReg,
2681 Register Numer,
2682 Register Denom,
2683 bool IsDiv) const {
2684 const LLT S32 = LLT::scalar(32);
2685 const LLT S64 = LLT::scalar(64);
2686 const LLT S1 = LLT::scalar(1);
2687 Register RcpLo, RcpHi;
2688
2689 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2690
2691 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2692
2693 auto Zero64 = B.buildConstant(S64, 0);
2694 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2695
2696 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2697 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2698
2699 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2700 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2701 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2702
2703 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2704 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2705 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2706 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2707
2708 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2709 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2710 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2711 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2712 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2713
2714 auto Zero32 =
B.buildConstant(S32, 0); 2715 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2716 auto Add2_HiC = 2717 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2718 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2719 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2720 2721 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2722 Register NumerLo = UnmergeNumer.getReg(0); 2723 Register NumerHi = UnmergeNumer.getReg(1); 2724 2725 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2726 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2727 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2728 Register Mul3_Lo = UnmergeMul3.getReg(0); 2729 Register Mul3_Hi = UnmergeMul3.getReg(1); 2730 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2731 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2732 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2733 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2734 2735 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2736 Register DenomLo = UnmergeDenom.getReg(0); 2737 Register DenomHi = UnmergeDenom.getReg(1); 2738 2739 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2740 auto C1 = B.buildSExt(S32, CmpHi); 2741 2742 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2743 auto C2 = B.buildSExt(S32, CmpLo); 2744 2745 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2746 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2747 2748 // TODO: Here and below portions of the code can be enclosed into if/endif. 2749 // Currently control flow is unconditional and we have 4 selects after 2750 // potential endif to substitute PHIs. 2751 2752 // if C3 != 0 ... 2753 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2754 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2755 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2756 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2757 2758 auto One64 = B.buildConstant(S64, 1); 2759 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2760 2761 auto C4 = 2762 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2763 auto C5 = 2764 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2765 auto C6 = B.buildSelect( 2766 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2767 2768 // if (C6 != 0) 2769 auto Add4 = B.buildAdd(S64, Add3, One64); 2770 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2771 2772 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2773 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2774 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2775 2776 // endif C6 2777 // endif C3 2778 2779 if (IsDiv) { 2780 auto Sel1 = B.buildSelect( 2781 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2782 B.buildSelect(DstReg, 2783 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2784 } else { 2785 auto Sel2 = B.buildSelect( 2786 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2787 B.buildSelect(DstReg, 2788 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2789 } 2790 } 2791 2792 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2793 MachineRegisterInfo &MRI, 2794 MachineIRBuilder &B) const { 2795 const LLT S64 = LLT::scalar(64); 2796 const LLT S32 = LLT::scalar(32); 2797 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2798 Register DstReg = MI.getOperand(0).getReg(); 2799 Register Num 
= MI.getOperand(1).getReg(); 2800 Register Den = MI.getOperand(2).getReg(); 2801 LLT Ty = MRI.getType(DstReg); 2802 2803 if (Ty == S32) 2804 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2805 else if (Ty == S64) 2806 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2807 else 2808 return false; 2809 2810 MI.eraseFromParent(); 2811 return true; 2812 2813 } 2814 2815 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2816 MachineRegisterInfo &MRI, 2817 MachineIRBuilder &B) const { 2818 const LLT S64 = LLT::scalar(64); 2819 const LLT S32 = LLT::scalar(32); 2820 2821 Register DstReg = MI.getOperand(0).getReg(); 2822 const LLT Ty = MRI.getType(DstReg); 2823 if (Ty != S32 && Ty != S64) 2824 return false; 2825 2826 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2827 2828 Register LHS = MI.getOperand(1).getReg(); 2829 Register RHS = MI.getOperand(2).getReg(); 2830 2831 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2832 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2833 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2834 2835 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2836 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2837 2838 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2839 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2840 2841 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2842 if (Ty == S32) 2843 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2844 else 2845 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2846 2847 Register Sign; 2848 if (IsDiv) 2849 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2850 else 2851 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2852 2853 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2854 B.buildSub(DstReg, UDivRem, Sign); 2855 2856 MI.eraseFromParent(); 2857 return true; 2858 } 2859 2860 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2861 MachineRegisterInfo &MRI, 2862 MachineIRBuilder &B) const { 2863 Register Res = MI.getOperand(0).getReg(); 2864 Register LHS = MI.getOperand(1).getReg(); 2865 Register RHS = MI.getOperand(2).getReg(); 2866 2867 uint16_t Flags = MI.getFlags(); 2868 2869 LLT ResTy = MRI.getType(Res); 2870 LLT S32 = LLT::scalar(32); 2871 LLT S64 = LLT::scalar(64); 2872 2873 const MachineFunction &MF = B.getMF(); 2874 bool Unsafe = 2875 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2876 2877 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2878 return false; 2879 2880 if (!Unsafe && ResTy == S32 && 2881 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2882 return false; 2883 2884 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2885 // 1 / x -> RCP(x) 2886 if (CLHS->isExactlyValue(1.0)) { 2887 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2888 .addUse(RHS) 2889 .setMIFlags(Flags); 2890 2891 MI.eraseFromParent(); 2892 return true; 2893 } 2894 2895 // -1 / x -> RCP( FNEG(x) ) 2896 if (CLHS->isExactlyValue(-1.0)) { 2897 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2898 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2899 .addUse(FNeg.getReg(0)) 2900 .setMIFlags(Flags); 2901 2902 MI.eraseFromParent(); 2903 return true; 2904 } 2905 } 2906 2907 // x / y -> x * (1.0 / y) 2908 if (Unsafe) { 2909 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2910 .addUse(RHS) 2911 .setMIFlags(Flags); 2912 B.buildFMul(Res, LHS, RCP, Flags); 2913 2914 MI.eraseFromParent(); 2915 return true; 2916 } 2917 2918 return false; 2919 } 2920 2921 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2922 MachineRegisterInfo &MRI, 2923 MachineIRBuilder &B) const { 2924 Register Res = MI.getOperand(0).getReg(); 2925 Register LHS = MI.getOperand(1).getReg(); 2926 Register RHS = MI.getOperand(2).getReg(); 2927 2928 uint16_t Flags = MI.getFlags(); 2929 2930 LLT S16 = LLT::scalar(16); 2931 LLT S32 = LLT::scalar(32); 2932 2933 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2934 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2935 2936 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2937 .addUse(RHSExt.getReg(0)) 2938 .setMIFlags(Flags); 2939 2940 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2941 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2942 2943 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2944 .addUse(RDst.getReg(0)) 2945 .addUse(RHS) 2946 .addUse(LHS) 2947 .setMIFlags(Flags); 2948 2949 MI.eraseFromParent(); 2950 return true; 2951 } 2952 2953 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2954 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2955 static void toggleSPDenormMode(bool Enable, 2956 MachineIRBuilder &B, 2957 const GCNSubtarget &ST, 2958 AMDGPU::SIModeRegisterDefaults Mode) { 2959 // Set SP denorm mode to this value. 2960 unsigned SPDenormMode = 2961 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2962 2963 if (ST.hasDenormModeInst()) { 2964 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2965 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2966 2967 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2968 B.buildInstr(AMDGPU::S_DENORM_MODE) 2969 .addImm(NewDenormModeValue); 2970 2971 } else { 2972 // Select FP32 bit field in mode register. 2973 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2974 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2975 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2976 2977 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2978 .addImm(SPDenormMode) 2979 .addImm(SPDenormModeBitField); 2980 } 2981 } 2982 2983 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2984 MachineRegisterInfo &MRI, 2985 MachineIRBuilder &B) const { 2986 Register Res = MI.getOperand(0).getReg(); 2987 Register LHS = MI.getOperand(1).getReg(); 2988 Register RHS = MI.getOperand(2).getReg(); 2989 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2990 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2991 2992 uint16_t Flags = MI.getFlags(); 2993 2994 LLT S32 = LLT::scalar(32); 2995 LLT S1 = LLT::scalar(1); 2996 2997 auto One = B.buildFConstant(S32, 1.0f); 2998 2999 auto DenominatorScaled = 3000 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3001 .addUse(LHS) 3002 .addUse(RHS) 3003 .addImm(0) 3004 .setMIFlags(Flags); 3005 auto NumeratorScaled = 3006 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3007 .addUse(LHS) 3008 .addUse(RHS) 3009 .addImm(1) 3010 .setMIFlags(Flags); 3011 3012 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3013 .addUse(DenominatorScaled.getReg(0)) 3014 .setMIFlags(Flags); 3015 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3016 3017 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3018 // aren't modeled as reading it. 
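// The block below follows the usual div_scale/rcp recipe: one Newton-Raphson
// step refines the reciprocal estimate (Fma0, Fma1), the scaled quotient is
// then refined (Mul, Fma2..Fma4), and div_fmas/div_fixup apply the final
// scaling and edge-case handling. FP32 denormals are enabled around the
// sequence when they are otherwise flushed, so intermediate values are not
// flushed to zero.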
3019 if (!Mode.allFP32Denormals()) 3020 toggleSPDenormMode(true, B, ST, Mode); 3021 3022 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3023 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3024 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3025 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3026 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3027 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3028 3029 if (!Mode.allFP32Denormals()) 3030 toggleSPDenormMode(false, B, ST, Mode); 3031 3032 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3033 .addUse(Fma4.getReg(0)) 3034 .addUse(Fma1.getReg(0)) 3035 .addUse(Fma3.getReg(0)) 3036 .addUse(NumeratorScaled.getReg(1)) 3037 .setMIFlags(Flags); 3038 3039 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3040 .addUse(Fmas.getReg(0)) 3041 .addUse(RHS) 3042 .addUse(LHS) 3043 .setMIFlags(Flags); 3044 3045 MI.eraseFromParent(); 3046 return true; 3047 } 3048 3049 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3050 MachineRegisterInfo &MRI, 3051 MachineIRBuilder &B) const { 3052 Register Res = MI.getOperand(0).getReg(); 3053 Register LHS = MI.getOperand(1).getReg(); 3054 Register RHS = MI.getOperand(2).getReg(); 3055 3056 uint16_t Flags = MI.getFlags(); 3057 3058 LLT S64 = LLT::scalar(64); 3059 LLT S1 = LLT::scalar(1); 3060 3061 auto One = B.buildFConstant(S64, 1.0); 3062 3063 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3064 .addUse(LHS) 3065 .addUse(RHS) 3066 .addImm(0) 3067 .setMIFlags(Flags); 3068 3069 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3070 3071 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3072 .addUse(DivScale0.getReg(0)) 3073 .setMIFlags(Flags); 3074 3075 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3076 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3077 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3078 3079 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3080 .addUse(LHS) 3081 .addUse(RHS) 3082 .addImm(1) 3083 .setMIFlags(Flags); 3084 3085 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3086 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3087 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3088 3089 Register Scale; 3090 if (!ST.hasUsableDivScaleConditionOutput()) { 3091 // Workaround a hardware bug on SI where the condition output from div_scale 3092 // is not usable. 
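// Recompute the flag manually: compare the high half of the numerator against
// DivScale1 and the high half of the denominator against DivScale0, then XOR
// the two comparisons.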
3093 3094 LLT S32 = LLT::scalar(32); 3095 3096 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3097 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3098 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3099 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3100 3101 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3102 Scale1Unmerge.getReg(1)); 3103 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3104 Scale0Unmerge.getReg(1)); 3105 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3106 } else { 3107 Scale = DivScale1.getReg(1); 3108 } 3109 3110 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3111 .addUse(Fma4.getReg(0)) 3112 .addUse(Fma3.getReg(0)) 3113 .addUse(Mul.getReg(0)) 3114 .addUse(Scale) 3115 .setMIFlags(Flags); 3116 3117 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3118 .addUse(Fmas.getReg(0)) 3119 .addUse(RHS) 3120 .addUse(LHS) 3121 .setMIFlags(Flags); 3122 3123 MI.eraseFromParent(); 3124 return true; 3125 } 3126 3127 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3128 MachineRegisterInfo &MRI, 3129 MachineIRBuilder &B) const { 3130 Register Res = MI.getOperand(0).getReg(); 3131 Register LHS = MI.getOperand(2).getReg(); 3132 Register RHS = MI.getOperand(3).getReg(); 3133 uint16_t Flags = MI.getFlags(); 3134 3135 LLT S32 = LLT::scalar(32); 3136 LLT S1 = LLT::scalar(1); 3137 3138 auto Abs = B.buildFAbs(S32, RHS, Flags); 3139 const APFloat C0Val(1.0f); 3140 3141 auto C0 = B.buildConstant(S32, 0x6f800000); 3142 auto C1 = B.buildConstant(S32, 0x2f800000); 3143 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3144 3145 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3146 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3147 3148 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3149 3150 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3151 .addUse(Mul0.getReg(0)) 3152 .setMIFlags(Flags); 3153 3154 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3155 3156 B.buildFMul(Res, Sel, Mul1, Flags); 3157 3158 MI.eraseFromParent(); 3159 return true; 3160 } 3161 3162 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3163 MachineRegisterInfo &MRI, 3164 MachineIRBuilder &B) const { 3165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3166 uint64_t Offset = 3167 ST.getTargetLowering()->getImplicitParameterOffset( 3168 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3169 LLT DstTy = MRI.getType(DstReg); 3170 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3171 3172 const ArgDescriptor *Arg; 3173 const TargetRegisterClass *RC; 3174 LLT ArgTy; 3175 std::tie(Arg, RC, ArgTy) = 3176 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3177 if (!Arg) 3178 return false; 3179 3180 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3181 if (!loadInputValue(KernargPtrReg, B, Arg)) 3182 return false; 3183 3184 // FIXME: This should be nuw 3185 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3186 return true; 3187 } 3188 3189 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3190 MachineRegisterInfo &MRI, 3191 MachineIRBuilder &B) const { 3192 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3193 if (!MFI->isEntryFunction()) { 3194 return legalizePreloadedArgIntrin(MI, MRI, B, 3195 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3196 } 3197 3198 Register DstReg = MI.getOperand(0).getReg(); 3199 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3200 return false; 3201 3202 MI.eraseFromParent(); 3203 return true; 3204 } 3205 3206 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3207 MachineRegisterInfo &MRI, 3208 MachineIRBuilder &B, 3209 unsigned AddrSpace) const { 3210 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3211 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3212 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3213 MI.eraseFromParent(); 3214 return true; 3215 } 3216 3217 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3218 // offset (the offset that is included in bounds checking and swizzling, to be 3219 // split between the instruction's voffset and immoffset fields) and soffset 3220 // (the offset that is excluded from bounds checking and swizzling, to go in 3221 // the instruction's soffset field). This function takes the first kind of 3222 // offset and figures out how to split it between voffset and immoffset. 3223 std::tuple<Register, unsigned, unsigned> 3224 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3225 Register OrigOffset) const { 3226 const unsigned MaxImm = 4095; 3227 Register BaseReg; 3228 unsigned TotalConstOffset; 3229 MachineInstr *OffsetDef; 3230 const LLT S32 = LLT::scalar(32); 3231 3232 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3233 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3234 3235 unsigned ImmOffset = TotalConstOffset; 3236 3237 // If the immediate value is too big for the immoffset field, put the value 3238 // and -4096 into the immoffset field so that the value that is copied/added 3239 // for the voffset field is a multiple of 4096, and it stands more chance 3240 // of being CSEd with the copy/add for another similar load/store. 3241 // However, do not do that rounding down to a multiple of 4096 if that is a 3242 // negative number, as it appears to be illegal to have a negative offset 3243 // in the vgpr, even if adding the immediate offset makes it positive. 3244 unsigned Overflow = ImmOffset & ~MaxImm; 3245 ImmOffset -= Overflow; 3246 if ((int32_t)Overflow < 0) { 3247 Overflow += ImmOffset; 3248 ImmOffset = 0; 3249 } 3250 3251 if (Overflow != 0) { 3252 if (!BaseReg) { 3253 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3254 } else { 3255 auto OverflowVal = B.buildConstant(S32, Overflow); 3256 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3257 } 3258 } 3259 3260 if (!BaseReg) 3261 BaseReg = B.buildConstant(S32, 0).getReg(0); 3262 3263 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3264 } 3265 3266 /// Handle register layout difference for f16 images for some subtargets. 
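/// On subtargets with unpacked D16 VMEM each 16-bit element occupies the low
/// half of its own 32-bit register, so an <N x s16> value is rewritten as an
/// <N x s32> build_vector of any-extended elements.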
3267 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3268 MachineRegisterInfo &MRI, 3269 Register Reg) const { 3270 if (!ST.hasUnpackedD16VMem()) 3271 return Reg; 3272 3273 const LLT S16 = LLT::scalar(16); 3274 const LLT S32 = LLT::scalar(32); 3275 LLT StoreVT = MRI.getType(Reg); 3276 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3277 3278 auto Unmerge = B.buildUnmerge(S16, Reg); 3279 3280 SmallVector<Register, 4> WideRegs; 3281 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3282 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3283 3284 int NumElts = StoreVT.getNumElements(); 3285 3286 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3287 } 3288 3289 Register AMDGPULegalizerInfo::fixStoreSourceType( 3290 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3291 MachineRegisterInfo *MRI = B.getMRI(); 3292 LLT Ty = MRI->getType(VData); 3293 3294 const LLT S16 = LLT::scalar(16); 3295 3296 // Fixup illegal register types for i8 stores. 3297 if (Ty == LLT::scalar(8) || Ty == S16) { 3298 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3299 return AnyExt; 3300 } 3301 3302 if (Ty.isVector()) { 3303 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3304 if (IsFormat) 3305 return handleD16VData(B, *MRI, VData); 3306 } 3307 } 3308 3309 return VData; 3310 } 3311 3312 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3313 MachineRegisterInfo &MRI, 3314 MachineIRBuilder &B, 3315 bool IsTyped, 3316 bool IsFormat) const { 3317 Register VData = MI.getOperand(1).getReg(); 3318 LLT Ty = MRI.getType(VData); 3319 LLT EltTy = Ty.getScalarType(); 3320 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3321 const LLT S32 = LLT::scalar(32); 3322 3323 VData = fixStoreSourceType(B, VData, IsFormat); 3324 Register RSrc = MI.getOperand(2).getReg(); 3325 3326 MachineMemOperand *MMO = *MI.memoperands_begin(); 3327 const int MemSize = MMO->getSize(); 3328 3329 unsigned ImmOffset; 3330 unsigned TotalOffset; 3331 3332 // The typed intrinsics add an immediate after the registers. 3333 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3334 3335 // The struct intrinsic variants add one additional operand over raw. 3336 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3337 Register VIndex; 3338 int OpOffset = 0; 3339 if (HasVIndex) { 3340 VIndex = MI.getOperand(3).getReg(); 3341 OpOffset = 1; 3342 } 3343 3344 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3345 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3346 3347 unsigned Format = 0; 3348 if (IsTyped) { 3349 Format = MI.getOperand(5 + OpOffset).getImm(); 3350 ++OpOffset; 3351 } 3352 3353 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3354 3355 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3356 if (TotalOffset != 0) 3357 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3358 3359 unsigned Opc; 3360 if (IsTyped) { 3361 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3362 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3363 } else if (IsFormat) { 3364 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3365 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3366 } else { 3367 switch (MemSize) { 3368 case 1: 3369 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3370 break; 3371 case 2: 3372 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3373 break; 3374 default: 3375 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3376 break; 3377 } 3378 } 3379 3380 if (!VIndex) 3381 VIndex = B.buildConstant(S32, 0).getReg(0); 3382 3383 auto MIB = B.buildInstr(Opc) 3384 .addUse(VData) // vdata 3385 .addUse(RSrc) // rsrc 3386 .addUse(VIndex) // vindex 3387 .addUse(VOffset) // voffset 3388 .addUse(SOffset) // soffset 3389 .addImm(ImmOffset); // offset(imm) 3390 3391 if (IsTyped) 3392 MIB.addImm(Format); 3393 3394 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3395 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3396 .addMemOperand(MMO); 3397 3398 MI.eraseFromParent(); 3399 return true; 3400 } 3401 3402 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3403 MachineRegisterInfo &MRI, 3404 MachineIRBuilder &B, 3405 bool IsFormat, 3406 bool IsTyped) const { 3407 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3408 MachineMemOperand *MMO = *MI.memoperands_begin(); 3409 const int MemSize = MMO->getSize(); 3410 const LLT S32 = LLT::scalar(32); 3411 3412 Register Dst = MI.getOperand(0).getReg(); 3413 Register RSrc = MI.getOperand(2).getReg(); 3414 3415 // The typed intrinsics add an immediate after the registers. 3416 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3417 3418 // The struct intrinsic variants add one additional operand over raw. 3419 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3420 Register VIndex; 3421 int OpOffset = 0; 3422 if (HasVIndex) { 3423 VIndex = MI.getOperand(3).getReg(); 3424 OpOffset = 1; 3425 } 3426 3427 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3428 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3429 3430 unsigned Format = 0; 3431 if (IsTyped) { 3432 Format = MI.getOperand(5 + OpOffset).getImm(); 3433 ++OpOffset; 3434 } 3435 3436 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3437 unsigned ImmOffset; 3438 unsigned TotalOffset; 3439 3440 LLT Ty = MRI.getType(Dst); 3441 LLT EltTy = Ty.getScalarType(); 3442 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3443 const bool Unpacked = ST.hasUnpackedD16VMem(); 3444 3445 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3446 if (TotalOffset != 0) 3447 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3448 3449 unsigned Opc; 3450 3451 if (IsTyped) { 3452 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3453 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3454 } else if (IsFormat) { 3455 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3456 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3457 } else { 3458 switch (MemSize) { 3459 case 1: 3460 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3461 break; 3462 case 2: 3463 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3464 break; 3465 default: 3466 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3467 break; 3468 } 3469 } 3470 3471 Register LoadDstReg; 3472 3473 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3474 LLT UnpackedTy = Ty.changeElementSize(32); 3475 3476 if (IsExtLoad) 3477 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3478 else if (Unpacked && IsD16 && Ty.isVector()) 3479 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3480 else 3481 LoadDstReg = Dst; 3482 3483 if (!VIndex) 3484 VIndex = B.buildConstant(S32, 0).getReg(0); 3485 3486 auto MIB = B.buildInstr(Opc) 3487 .addDef(LoadDstReg) // vdata 3488 .addUse(RSrc) // rsrc 3489 .addUse(VIndex) // vindex 3490 .addUse(VOffset) // voffset 3491 .addUse(SOffset) // soffset 3492 .addImm(ImmOffset); // offset(imm) 3493 3494 if (IsTyped) 3495 MIB.addImm(Format); 3496 3497 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3498 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3499 .addMemOperand(MMO); 3500 3501 if (LoadDstReg != Dst) { 3502 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3503 3504 // The result was widened for an extending load; truncate back down to the expected type. 3505 if (IsExtLoad) 3506 B.buildTrunc(Dst, LoadDstReg); 3507 else { 3508 // Repack to original 16-bit vector result 3509 // FIXME: G_TRUNC should work, but legalization currently fails 3510 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3511 SmallVector<Register, 4> Repack; 3512 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3513 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3514 B.buildMerge(Dst, Repack); 3515 } 3516 } 3517 3518 MI.eraseFromParent(); 3519 return true; 3520 } 3521 3522 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3523 MachineIRBuilder &B, 3524 bool IsInc) const { 3525 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3526 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3527 B.buildInstr(Opc) 3528 .addDef(MI.getOperand(0).getReg()) 3529 .addUse(MI.getOperand(2).getReg()) 3530 .addUse(MI.getOperand(3).getReg()) 3531 .cloneMemRefs(MI); 3532 MI.eraseFromParent(); 3533 return true; 3534 } 3535 3536 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3537 switch (IntrID) { 3538 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3539 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3540 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3541 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3542 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3544 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3545 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3546 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3547 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3548 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3550 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3551 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3552 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3553 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3554 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3556 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3557 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3558 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3559 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3560 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3561 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3562 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3563 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3565 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3566 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3567 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3568 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3569 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3571 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3572 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3573 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3574 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3575 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3576 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3577 default: 3578 llvm_unreachable("unhandled atomic opcode"); 3579 } 3580 } 3581 3582 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3583 MachineIRBuilder &B, 3584 Intrinsic::ID IID) const { 3585 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3586 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3587 3588 Register Dst = MI.getOperand(0).getReg(); 3589 Register VData = MI.getOperand(2).getReg(); 3590 3591 Register CmpVal; 3592 int OpOffset = 0; 3593 3594 if (IsCmpSwap) { 3595 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3596 ++OpOffset; 3597 } 3598 3599 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3600 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3601 3602 // The struct intrinsic variants add one additional operand over raw. 
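// For illustration (assuming only the intrinsic forms handled here): a raw,
// non-cmpswap atomic has 7 operands (dst, intrinsic id, vdata, rsrc, voffset,
// soffset, aux), cmpswap adds a compare value, and the struct forms add a
// vindex, which gives the 8/9 operand counts checked below.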
3603 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3604 Register VIndex; 3605 if (HasVIndex) { 3606 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3607 ++OpOffset; 3608 } 3609 3610 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3611 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3612 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3613 3614 MachineMemOperand *MMO = *MI.memoperands_begin(); 3615 3616 unsigned ImmOffset; 3617 unsigned TotalOffset; 3618 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3619 if (TotalOffset != 0) 3620 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3621 3622 if (!VIndex) 3623 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3624 3625 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3626 .addDef(Dst) 3627 .addUse(VData); // vdata 3628 3629 if (IsCmpSwap) 3630 MIB.addReg(CmpVal); 3631 3632 MIB.addUse(RSrc) // rsrc 3633 .addUse(VIndex) // vindex 3634 .addUse(VOffset) // voffset 3635 .addUse(SOffset) // soffset 3636 .addImm(ImmOffset) // offset(imm) 3637 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3638 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3639 .addMemOperand(MMO); 3640 3641 MI.eraseFromParent(); 3642 return true; 3643 } 3644 3645 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3646 /// vector with s16 typed elements. 3647 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3648 SmallVectorImpl<Register> &PackedAddrs, 3649 int AddrIdx, int DimIdx, int EndIdx, 3650 int NumGradients) { 3651 const LLT S16 = LLT::scalar(16); 3652 const LLT V2S16 = LLT::vector(2, 16); 3653 3654 for (int I = AddrIdx; I < EndIdx; ++I) { 3655 MachineOperand &SrcOp = MI.getOperand(I); 3656 if (!SrcOp.isReg()) 3657 continue; // _L to _LZ may have eliminated this. 3658 3659 Register AddrReg = SrcOp.getReg(); 3660 3661 if (I < DimIdx) { 3662 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3663 PackedAddrs.push_back(AddrReg); 3664 } else { 3665 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3666 // derivatives dx/dh and dx/dv are packed with undef. 3667 if (((I + 1) >= EndIdx) || 3668 ((NumGradients / 2) % 2 == 1 && 3669 (I == DimIdx + (NumGradients / 2) - 1 || 3670 I == DimIdx + NumGradients - 1)) || 3671 // Check for _L to _LZ optimization 3672 !MI.getOperand(I + 1).isReg()) { 3673 PackedAddrs.push_back( 3674 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3675 .getReg(0)); 3676 } else { 3677 PackedAddrs.push_back( 3678 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3679 .getReg(0)); 3680 ++I; 3681 } 3682 } 3683 } 3684 } 3685 3686 /// Convert from separate vaddr components to a single vector address register, 3687 /// and replace the remaining operands with $noreg. 3688 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3689 int DimIdx, int NumVAddrs) { 3690 const LLT S32 = LLT::scalar(32); 3691 3692 SmallVector<Register, 8> AddrRegs; 3693 for (int I = 0; I != NumVAddrs; ++I) { 3694 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3695 if (SrcOp.isReg()) { 3696 AddrRegs.push_back(SrcOp.getReg()); 3697 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3698 } 3699 } 3700 3701 int NumAddrRegs = AddrRegs.size(); 3702 if (NumAddrRegs != 1) { 3703 // Round up to 8 elements for v5-v7 3704 // FIXME: Missing intermediate sized register classes and instructions. 
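// For example, 5, 6 or 7 address registers are padded with undef up to the
// next power of two (8) by the check below; counts of 4 or fewer, and counts
// that are already a power of two, are left alone.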
3705 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3706 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3707 auto Undef = B.buildUndef(S32); 3708 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3709 NumAddrRegs = RoundedNumRegs; 3710 } 3711 3712 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3713 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3714 } 3715 3716 for (int I = 1; I != NumVAddrs; ++I) { 3717 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3718 if (SrcOp.isReg()) 3719 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3720 } 3721 } 3722 3723 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3724 /// 3725 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 3726 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3727 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3728 /// registers. 3729 /// 3730 /// We don't want to directly select image instructions just yet, but also want 3731 /// to expose all register repacking to the legalizer/combiners. We also don't 3732 /// want a selected instruction entering RegBankSelect. In order to avoid 3733 /// defining a multitude of intermediate image instructions, directly hack on 3734 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 3735 /// now-unnecessary arguments with $noreg. 3736 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3737 MachineInstr &MI, MachineIRBuilder &B, 3738 GISelChangeObserver &Observer, 3739 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3740 3741 const int NumDefs = MI.getNumExplicitDefs(); 3742 bool IsTFE = NumDefs == 2; 3743 // We are only processing the operands of d16 image operations on subtargets 3744 // that use the unpacked register layout, or need to repack the TFE result. 3745 3746 // TODO: Do we need to guard against already legalized intrinsics? 3747 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3748 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3749 3750 MachineRegisterInfo *MRI = B.getMRI(); 3751 const LLT S32 = LLT::scalar(32); 3752 const LLT S16 = LLT::scalar(16); 3753 const LLT V2S16 = LLT::vector(2, 16); 3754 3755 // Index of first address argument 3756 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3757 3758 int NumVAddrs, NumGradients; 3759 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3760 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3761 getDMaskIdx(BaseOpcode, NumDefs); 3762 unsigned DMask = 0; 3763 3764 // Check for 16-bit addresses and pack if true. 3765 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3766 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3767 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3768 const bool IsG16 = GradTy == S16; 3769 const bool IsA16 = AddrTy == S16; 3770 3771 int DMaskLanes = 0; 3772 if (!BaseOpcode->Atomic) { 3773 DMask = MI.getOperand(DMaskIdx).getImm(); 3774 if (BaseOpcode->Gather4) { 3775 DMaskLanes = 4; 3776 } else if (DMask != 0) { 3777 DMaskLanes = countPopulation(DMask); 3778 } else if (!IsTFE && !BaseOpcode->Store) { 3779 // If dmask is 0, this is a no-op load. This can be eliminated. 3780 B.buildUndef(MI.getOperand(0)); 3781 MI.eraseFromParent(); 3782 return true; 3783 } 3784 } 3785 3786 Observer.changingInstr(MI); 3787 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3788 3789 unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3791 3792 // Track that we legalized this 3793 MI.setDesc(B.getTII().get(NewOpcode)); 3794 3795 // Expecting to get an error flag since TFE is on and dmask is 0. Force 3796 // dmask to be at least 1, otherwise the instruction will fail. 3797 if (IsTFE && DMask == 0) { 3798 DMask = 0x1; 3799 DMaskLanes = 1; 3800 MI.getOperand(DMaskIdx).setImm(DMask); 3801 } 3802 3803 if (BaseOpcode->Atomic) { 3804 Register VData0 = MI.getOperand(2).getReg(); 3805 LLT Ty = MRI->getType(VData0); 3806 3807 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3808 if (Ty.isVector()) 3809 return false; 3810 3811 if (BaseOpcode->AtomicX2) { 3812 Register VData1 = MI.getOperand(3).getReg(); 3813 // The two values are packed in one register. 3814 LLT PackedTy = LLT::vector(2, Ty); 3815 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3816 MI.getOperand(2).setReg(Concat.getReg(0)); 3817 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3818 } 3819 } 3820 3821 int CorrectedNumVAddrs = NumVAddrs; 3822 3823 // Optimize _L to _LZ when _L is zero 3824 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3825 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3826 const ConstantFP *ConstantLod; 3827 const int LodIdx = AddrIdx + NumVAddrs - 1; 3828 3829 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3830 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3831 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3832 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3833 LZMappingInfo->LZ, ImageDimIntr->Dim); 3834 3835 // The starting indexes should remain in the same place. 3836 --NumVAddrs; 3837 --CorrectedNumVAddrs; 3838 3839 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3840 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3841 MI.RemoveOperand(LodIdx); 3842 } 3843 } 3844 } 3845 3846 // Optimize _mip away when 'lod' is zero 3847 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3848 int64_t ConstantLod; 3849 const int LodIdx = AddrIdx + NumVAddrs - 1; 3850 3851 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3852 if (ConstantLod == 0) { 3853 // TODO: Change the intrinsic opcode and remove the operand instead of replacing 3854 // it with 0, as the _L to _LZ handling above does. 3855 MI.getOperand(LodIdx).ChangeToImmediate(0); 3856 --CorrectedNumVAddrs; 3857 } 3858 } 3859 } 3860 3861 // Rewrite the addressing register layout before doing anything else. 3862 if (IsA16 || IsG16) { 3863 if (IsA16) { 3864 // Target must support the feature and gradients need to be 16-bit too 3865 if (!ST.hasA16() || !IsG16) 3866 return false; 3867 } else if (!ST.hasG16()) 3868 return false; 3869 3870 if (NumVAddrs > 1) { 3871 SmallVector<Register, 4> PackedRegs; 3872 // Don't compress addresses for G16 3873 const int PackEndIdx = 3874 IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3875 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3876 PackEndIdx, NumGradients); 3877 3878 if (!IsA16) { 3879 // Add uncompressed address 3880 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3881 int AddrReg = MI.getOperand(I).getReg(); 3882 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3883 PackedRegs.push_back(AddrReg); 3884 } 3885 } 3886 3887 // See also below in the non-a16 branch 3888 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3889 3890 if (!UseNSA && PackedRegs.size() > 1) { 3891 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3892 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3893 PackedRegs[0] = Concat.getReg(0); 3894 PackedRegs.resize(1); 3895 } 3896 3897 const int NumPacked = PackedRegs.size(); 3898 for (int I = 0; I != NumVAddrs; ++I) { 3899 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3900 if (!SrcOp.isReg()) { 3901 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3902 continue; 3903 } 3904 3905 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3906 3907 if (I < NumPacked) 3908 SrcOp.setReg(PackedRegs[I]); 3909 else 3910 SrcOp.setReg(AMDGPU::NoRegister); 3911 } 3912 } 3913 } else { 3914 // If the register allocator cannot place the address registers contiguously 3915 // without introducing moves, then using the non-sequential address encoding 3916 // is always preferable, since it saves VALU instructions and is usually a 3917 // wash in terms of code size or even better. 3918 // 3919 // However, we currently have no way of hinting to the register allocator 3920 // that MIMG addresses should be placed contiguously when it is possible to 3921 // do so, so force non-NSA for the common 2-address case as a heuristic. 3922 // 3923 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3924 // allocation when possible. 3925 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3926 3927 if (!UseNSA && NumVAddrs > 1) 3928 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3929 } 3930 3931 int Flags = 0; 3932 if (IsA16) 3933 Flags |= 1; 3934 if (IsG16) 3935 Flags |= 2; 3936 MI.addOperand(MachineOperand::CreateImm(Flags)); 3937 3938 if (BaseOpcode->Store) { // No TFE for stores? 3939 // TODO: Handle dmask trim 3940 Register VData = MI.getOperand(1).getReg(); 3941 LLT Ty = MRI->getType(VData); 3942 if (!Ty.isVector() || Ty.getElementType() != S16) 3943 return true; 3944 3945 Register RepackedReg = handleD16VData(B, *MRI, VData); 3946 if (RepackedReg != VData) { 3947 MI.getOperand(1).setReg(RepackedReg); 3948 } 3949 3950 return true; 3951 } 3952 3953 Register DstReg = MI.getOperand(0).getReg(); 3954 LLT Ty = MRI->getType(DstReg); 3955 const LLT EltTy = Ty.getScalarType(); 3956 const bool IsD16 = Ty.getScalarType() == S16; 3957 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3958 3959 // Confirm that the return type is large enough for the dmask specified 3960 if (NumElts < DMaskLanes) 3961 return false; 3962 3963 if (NumElts > 4 || DMaskLanes > 4) 3964 return false; 3965 3966 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3967 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3968 3969 // The raw dword aligned data component of the load. 
The only legal cases 3970 // where this matters should be when using the packed D16 format, for 3971 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 3972 LLT RoundedTy; 3973 3974 // S32 vector to cover all data, plus the TFE result element. 3975 LLT TFETy; 3976 3977 // Register type to use for each loaded component. Will be S32 or V2S16. 3978 LLT RegTy; 3979 3980 if (IsD16 && ST.hasUnpackedD16VMem()) { 3981 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3982 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3983 RegTy = S32; 3984 } else { 3985 unsigned EltSize = EltTy.getSizeInBits(); 3986 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3987 unsigned RoundedSize = 32 * RoundedElts; 3988 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3989 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3990 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3991 } 3992 3993 // The return type does not need adjustment. 3994 // TODO: Should we change s16 case to s32 or <2 x s16>? 3995 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3996 return true; 3997 3998 Register Dst1Reg; 3999 4000 // Insert after the instruction. 4001 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4002 4003 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4004 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4005 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4006 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4007 4008 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4009 4010 MI.getOperand(0).setReg(NewResultReg); 4011 4012 // In the IR, TFE is supposed to be used with a 2-element struct return 4013 // type. The instruction really returns these two values in one contiguous 4014 // register, with one additional dword beyond the loaded data. Rewrite the 4015 // return type to use a single register result. 4016 4017 if (IsTFE) { 4018 Dst1Reg = MI.getOperand(1).getReg(); 4019 if (MRI->getType(Dst1Reg) != S32) 4020 return false; 4021 4022 // TODO: Make sure the TFE operand bit is set. 4023 MI.RemoveOperand(1); 4024 4025 // Handle the easy case that requires no repack instructions. 4026 if (Ty == S32) { 4027 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4028 return true; 4029 } 4030 } 4031 4032 // Now figure out how to copy the new result register back into the old 4033 // result. 4034 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4035 4036 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4037 4038 if (ResultNumRegs == 1) { 4039 assert(!IsTFE); 4040 ResultRegs[0] = NewResultReg; 4041 } else { 4042 // We have to repack into a new vector of some kind. 4043 for (int I = 0; I != NumDataRegs; ++I) 4044 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4045 B.buildUnmerge(ResultRegs, NewResultReg); 4046 4047 // Drop the final TFE element to get the data part. The TFE result is 4048 // directly written to the right place already. 4049 if (IsTFE) 4050 ResultRegs.resize(NumDataRegs); 4051 } 4052 4053 // For an s16 scalar result, we form an s32 result with a truncate regardless 4054 // of packed vs. unpacked. 4055 if (IsD16 && !Ty.isVector()) { 4056 B.buildTrunc(DstReg, ResultRegs[0]); 4057 return true; 4058 } 4059 4060 // Avoid a build/concat_vector of 1 entry.
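// (e.g. a packed <2 x s16> result that fits in a single 32-bit data register
// only needs a bitcast back to the destination type.)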
4061 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4062 B.buildBitcast(DstReg, ResultRegs[0]); 4063 return true; 4064 } 4065 4066 assert(Ty.isVector()); 4067 4068 if (IsD16) { 4069 // For packed D16 results with TFE enabled, all the data components are 4070 // S32. Cast back to the expected type. 4071 // 4072 // TODO: We don't really need to use load s32 elements. We would only need one 4073 // cast for the TFE result if a multiple of v2s16 was used. 4074 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4075 for (Register &Reg : ResultRegs) 4076 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4077 } else if (ST.hasUnpackedD16VMem()) { 4078 for (Register &Reg : ResultRegs) 4079 Reg = B.buildTrunc(S16, Reg).getReg(0); 4080 } 4081 } 4082 4083 auto padWithUndef = [&](LLT Ty, int NumElts) { 4084 if (NumElts == 0) 4085 return; 4086 Register Undef = B.buildUndef(Ty).getReg(0); 4087 for (int I = 0; I != NumElts; ++I) 4088 ResultRegs.push_back(Undef); 4089 }; 4090 4091 // Pad out any elements eliminated due to the dmask. 4092 LLT ResTy = MRI->getType(ResultRegs[0]); 4093 if (!ResTy.isVector()) { 4094 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4095 B.buildBuildVector(DstReg, ResultRegs); 4096 return true; 4097 } 4098 4099 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4100 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4101 4102 // Deal with the one annoying legal case. 4103 const LLT V3S16 = LLT::vector(3, 16); 4104 if (Ty == V3S16) { 4105 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4106 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4107 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4108 return true; 4109 } 4110 4111 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4112 B.buildConcatVectors(DstReg, ResultRegs); 4113 return true; 4114 } 4115 4116 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4117 MachineInstr &MI, MachineIRBuilder &B, 4118 GISelChangeObserver &Observer) const { 4119 Register Dst = MI.getOperand(0).getReg(); 4120 LLT Ty = B.getMRI()->getType(Dst); 4121 unsigned Size = Ty.getSizeInBits(); 4122 MachineFunction &MF = B.getMF(); 4123 4124 Observer.changingInstr(MI); 4125 4126 // FIXME: We don't really need this intermediate instruction. The intrinsic 4127 // should be fixed to have a memory operand. Since it's readnone, we're not 4128 // allowed to add one. 4129 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4130 MI.RemoveOperand(1); // Remove intrinsic ID 4131 4132 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4133 // TODO: Should this use datalayout alignment? 4134 const unsigned MemSize = (Size + 7) / 8; 4135 const Align MemAlign(4); 4136 MachineMemOperand *MMO = MF.getMachineMemOperand( 4137 MachinePointerInfo(), 4138 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4139 MachineMemOperand::MOInvariant, 4140 MemSize, MemAlign); 4141 MI.addMemOperand(MF, MMO); 4142 4143 // There are no 96-bit result scalar loads, but widening to 128-bit should 4144 // always be legal. We may need to restore this to a 96-bit result if it turns 4145 // out this needs to be converted to a vector load during RegBankSelect. 
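// For example, an s96 result is widened to s128 and a <3 x s32> result is
// padded to <4 x s32> by the power-of-two rounding below.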
4146 if (!isPowerOf2_32(Size)) { 4147 LegalizerHelper Helper(MF, *this, Observer, B); 4148 4149 if (Ty.isVector()) 4150 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4151 else 4152 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4153 } 4154 4155 Observer.changedInstr(MI); 4156 return true; 4157 } 4158 4159 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4160 MachineRegisterInfo &MRI, 4161 MachineIRBuilder &B) const { 4162 // If this is a non-HSA path or the trap handler is disabled, insert an s_endpgm instruction. 4163 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4164 !ST.isTrapHandlerEnabled()) { 4165 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4166 } else { 4167 // Pass the queue pointer to the trap handler as an input, and insert the trap instruction. 4168 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4169 const ArgDescriptor *Arg = 4170 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4171 if (!Arg) 4172 return false; 4173 MachineRegisterInfo &MRI = *B.getMRI(); 4174 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4175 Register LiveIn = getLiveInRegister( 4176 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4177 /*InsertLiveInCopy=*/false); 4178 if (!loadInputValue(LiveIn, B, Arg)) 4179 return false; 4180 B.buildCopy(SGPR01, LiveIn); 4181 B.buildInstr(AMDGPU::S_TRAP) 4182 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4183 .addReg(SGPR01, RegState::Implicit); 4184 } 4185 4186 MI.eraseFromParent(); 4187 return true; 4188 } 4189 4190 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4191 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4192 // If this is a non-HSA path or the trap handler is disabled, report a warning 4193 // accordingly. 4194 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4195 !ST.isTrapHandlerEnabled()) { 4196 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4197 "debugtrap handler not supported", 4198 MI.getDebugLoc(), DS_Warning); 4199 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4200 Ctx.diagnose(NoTrap); 4201 } else { 4202 // Insert debug-trap instruction 4203 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4204 } 4205 4206 MI.eraseFromParent(); 4207 return true; 4208 } 4209 4210 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4211 MachineInstr &MI) const { 4212 MachineIRBuilder &B = Helper.MIRBuilder; 4213 MachineRegisterInfo &MRI = *B.getMRI(); 4214 4215 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
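// Roughly: the G_BRCOND consuming the intrinsic's boolean result is erased,
// an SI_IF/SI_ELSE/SI_LOOP pseudo that performs both the exec-mask update and
// the branch is emitted in its place, and the surrounding G_BR (recreated if
// the IRTranslator omitted it for a fallthrough) is retargeted at the original
// conditional branch destination.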
4216 auto IntrID = MI.getIntrinsicID(); 4217 switch (IntrID) { 4218 case Intrinsic::amdgcn_if: 4219 case Intrinsic::amdgcn_else: { 4220 MachineInstr *Br = nullptr; 4221 MachineBasicBlock *UncondBrTarget = nullptr; 4222 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4223 const SIRegisterInfo *TRI 4224 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4225 4226 Register Def = MI.getOperand(1).getReg(); 4227 Register Use = MI.getOperand(3).getReg(); 4228 4229 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4230 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4231 if (IntrID == Intrinsic::amdgcn_if) { 4232 B.buildInstr(AMDGPU::SI_IF) 4233 .addDef(Def) 4234 .addUse(Use) 4235 .addMBB(UncondBrTarget); 4236 } else { 4237 B.buildInstr(AMDGPU::SI_ELSE) 4238 .addDef(Def) 4239 .addUse(Use) 4240 .addMBB(UncondBrTarget) 4241 .addImm(0); 4242 } 4243 4244 if (Br) { 4245 Br->getOperand(0).setMBB(CondBrTarget); 4246 } else { 4247 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4248 // since we're swapping branch targets it needs to be reinserted. 4249 // FIXME: IRTranslator should probably not do this 4250 B.buildBr(*CondBrTarget); 4251 } 4252 4253 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4254 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4255 MI.eraseFromParent(); 4256 BrCond->eraseFromParent(); 4257 return true; 4258 } 4259 4260 return false; 4261 } 4262 case Intrinsic::amdgcn_loop: { 4263 MachineInstr *Br = nullptr; 4264 MachineBasicBlock *UncondBrTarget = nullptr; 4265 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4266 const SIRegisterInfo *TRI 4267 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4268 4269 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4270 Register Reg = MI.getOperand(2).getReg(); 4271 4272 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4273 B.buildInstr(AMDGPU::SI_LOOP) 4274 .addUse(Reg) 4275 .addMBB(UncondBrTarget); 4276 4277 if (Br) 4278 Br->getOperand(0).setMBB(CondBrTarget); 4279 else 4280 B.buildBr(*CondBrTarget); 4281 4282 MI.eraseFromParent(); 4283 BrCond->eraseFromParent(); 4284 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4285 return true; 4286 } 4287 4288 return false; 4289 } 4290 case Intrinsic::amdgcn_kernarg_segment_ptr: 4291 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4292 // This only makes sense to call in a kernel, so just lower to null. 
4293 B.buildConstant(MI.getOperand(0).getReg(), 0); 4294 MI.eraseFromParent(); 4295 return true; 4296 } 4297 4298 return legalizePreloadedArgIntrin( 4299 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4300 case Intrinsic::amdgcn_implicitarg_ptr: 4301 return legalizeImplicitArgPtr(MI, MRI, B); 4302 case Intrinsic::amdgcn_workitem_id_x: 4303 return legalizePreloadedArgIntrin(MI, MRI, B, 4304 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4305 case Intrinsic::amdgcn_workitem_id_y: 4306 return legalizePreloadedArgIntrin(MI, MRI, B, 4307 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4308 case Intrinsic::amdgcn_workitem_id_z: 4309 return legalizePreloadedArgIntrin(MI, MRI, B, 4310 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4311 case Intrinsic::amdgcn_workgroup_id_x: 4312 return legalizePreloadedArgIntrin(MI, MRI, B, 4313 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4314 case Intrinsic::amdgcn_workgroup_id_y: 4315 return legalizePreloadedArgIntrin(MI, MRI, B, 4316 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4317 case Intrinsic::amdgcn_workgroup_id_z: 4318 return legalizePreloadedArgIntrin(MI, MRI, B, 4319 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4320 case Intrinsic::amdgcn_dispatch_ptr: 4321 return legalizePreloadedArgIntrin(MI, MRI, B, 4322 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4323 case Intrinsic::amdgcn_queue_ptr: 4324 return legalizePreloadedArgIntrin(MI, MRI, B, 4325 AMDGPUFunctionArgInfo::QUEUE_PTR); 4326 case Intrinsic::amdgcn_implicit_buffer_ptr: 4327 return legalizePreloadedArgIntrin( 4328 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4329 case Intrinsic::amdgcn_dispatch_id: 4330 return legalizePreloadedArgIntrin(MI, MRI, B, 4331 AMDGPUFunctionArgInfo::DISPATCH_ID); 4332 case Intrinsic::amdgcn_fdiv_fast: 4333 return legalizeFDIVFastIntrin(MI, MRI, B); 4334 case Intrinsic::amdgcn_is_shared: 4335 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4336 case Intrinsic::amdgcn_is_private: 4337 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4338 case Intrinsic::amdgcn_wavefrontsize: { 4339 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4340 MI.eraseFromParent(); 4341 return true; 4342 } 4343 case Intrinsic::amdgcn_s_buffer_load: 4344 return legalizeSBufferLoad(MI, B, Helper.Observer); 4345 case Intrinsic::amdgcn_raw_buffer_store: 4346 case Intrinsic::amdgcn_struct_buffer_store: 4347 return legalizeBufferStore(MI, MRI, B, false, false); 4348 case Intrinsic::amdgcn_raw_buffer_store_format: 4349 case Intrinsic::amdgcn_struct_buffer_store_format: 4350 return legalizeBufferStore(MI, MRI, B, false, true); 4351 case Intrinsic::amdgcn_raw_tbuffer_store: 4352 case Intrinsic::amdgcn_struct_tbuffer_store: 4353 return legalizeBufferStore(MI, MRI, B, true, true); 4354 case Intrinsic::amdgcn_raw_buffer_load: 4355 case Intrinsic::amdgcn_struct_buffer_load: 4356 return legalizeBufferLoad(MI, MRI, B, false, false); 4357 case Intrinsic::amdgcn_raw_buffer_load_format: 4358 case Intrinsic::amdgcn_struct_buffer_load_format: 4359 return legalizeBufferLoad(MI, MRI, B, true, false); 4360 case Intrinsic::amdgcn_raw_tbuffer_load: 4361 case Intrinsic::amdgcn_struct_tbuffer_load: 4362 return legalizeBufferLoad(MI, MRI, B, true, true); 4363 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4369 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4370 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4371 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4372 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4373 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4374 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4375 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4376 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4377 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4378 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4379 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4380 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4381 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4382 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4383 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4384 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4385 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4386 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4387 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4388 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4389 return legalizeBufferAtomic(MI, B, IntrID); 4390 case Intrinsic::amdgcn_atomic_inc: 4391 return legalizeAtomicIncDec(MI, B, true); 4392 case Intrinsic::amdgcn_atomic_dec: 4393 return legalizeAtomicIncDec(MI, B, false); 4394 case Intrinsic::trap: 4395 return legalizeTrapIntrinsic(MI, MRI, B); 4396 case Intrinsic::debugtrap: 4397 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4398 default: { 4399 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4400 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4401 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4402 return true; 4403 } 4404 } 4405 4406 return true; 4407 } 4408