//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
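
// Produce a narrower vector type whose pieces are at most 64 bits each. For
// example, a <5 x s32> query (160 bits) needs 3 pieces, so the result type
// becomes <2 x s32> ((5 + 1) / 3 elements).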
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}
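
// Whether this type maps directly onto AMDGPU registers: a scalar whose size
// is a multiple of 32 (up to MaxRegisterSize), or a vector of 32, 64, 128, or
// 256-bit elements, or an even-length vector of 16-bit elements. For example
// s32, s96, v2s16, and v4s32 qualify, but s48, v3s16, and v4s8 do not.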
static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
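
// Check whether a load/store with this register type, memory size, alignment,
// and address space can be selected as a single operation.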
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}
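
// Build the legalization rules for every generic opcode. Within each rule set
// the rules are tried in the order they are added, and the first one that
// matches the query decides the action.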
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
    .legalFor({S32})
      // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

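  // Byte swap and integer min/max: the 16-bit (and packed 16-bit) variants are
  // only available on subtargets with 16-bit instructions; everything else is
  // widened to 32 bits or lowered.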
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

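  // Return true if this memory access has to be broken into multiple
  // operations: vector extloads, accesses wider than the limit for their
  // address space, accesses covering a number of 32-bit registers that is not
  // a power of two (other than dwordx3 where supported), and accesses too
  // misaligned for the hardware to handle.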
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
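    // Anything still remaining is rounded up: scalars to the next power of
    // two, and vectors narrower than 32 bits out to a 32-bit multiple.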
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

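  // G_EXTRACT and G_INSERT are handled symmetrically: the wide type is type
  // index 1 for G_EXTRACT (the source) and type index 0 for G_INSERT (the
  // destination).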
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

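// Return the high half of the segment's base address within the flat address
// space (the "aperture") for the local or private address space, either read
// from the hardware aperture registers or loaded from the queue descriptor.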
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
1684 B.buildExtract(Dst, Src, 0); 1685 MI.eraseFromParent(); 1686 return true; 1687 } 1688 1689 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1690 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1691 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1692 1693 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1694 // another. Merge operands are required to be the same type, but creating an 1695 // extra ptrtoint would be kind of pointless. 1696 auto HighAddr = B.buildConstant( 1697 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1698 B.buildMerge(Dst, {Src, HighAddr}); 1699 MI.eraseFromParent(); 1700 return true; 1701 } 1702 1703 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1704 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1705 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1706 unsigned NullVal = TM.getNullPointerValue(DestAS); 1707 1708 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1709 auto FlatNull = B.buildConstant(SrcTy, 0); 1710 1711 // Extract low 32-bits of the pointer. 1712 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1713 1714 auto CmpRes = 1715 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1716 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1717 1718 MI.eraseFromParent(); 1719 return true; 1720 } 1721 1722 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1723 return false; 1724 1725 if (!ST.hasFlatAddressSpace()) 1726 return false; 1727 1728 auto SegmentNull = 1729 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1730 auto FlatNull = 1731 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1732 1733 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1734 if (!ApertureReg.isValid()) 1735 return false; 1736 1737 auto CmpRes = 1738 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1739 1740 // Coerce the type of the low half of the result so we can use merge_values. 1741 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1742 1743 // TODO: Should we allow mismatched types but matching sizes in merges to 1744 // avoid the ptrtoint? 1745 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1746 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1747 1748 MI.eraseFromParent(); 1749 return true; 1750 } 1751 1752 bool AMDGPULegalizerInfo::legalizeFrint( 1753 MachineInstr &MI, MachineRegisterInfo &MRI, 1754 MachineIRBuilder &B) const { 1755 Register Src = MI.getOperand(1).getReg(); 1756 LLT Ty = MRI.getType(Src); 1757 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1758 1759 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1760 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1761 1762 auto C1 = B.buildFConstant(Ty, C1Val); 1763 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1764 1765 // TODO: Should this propagate fast-math-flags? 
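// Round-to-integer trick: adding and then subtracting +/-2^52 (C1 with the
// source's sign copied onto it) leaves no fractional bits, giving the source
// rounded to an integer. Sources with |x| > C2 are already integral and are
// passed through by the select below.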
1766 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1767 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1768 1769 auto C2 = B.buildFConstant(Ty, C2Val); 1770 auto Fabs = B.buildFAbs(Ty, Src); 1771 1772 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1773 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1774 MI.eraseFromParent(); 1775 return true; 1776 } 1777 1778 bool AMDGPULegalizerInfo::legalizeFceil( 1779 MachineInstr &MI, MachineRegisterInfo &MRI, 1780 MachineIRBuilder &B) const { 1781 1782 const LLT S1 = LLT::scalar(1); 1783 const LLT S64 = LLT::scalar(64); 1784 1785 Register Src = MI.getOperand(1).getReg(); 1786 assert(MRI.getType(Src) == S64); 1787 1788 // result = trunc(src) 1789 // if (src > 0.0 && src != result) 1790 // result += 1.0 1791 1792 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1793 1794 const auto Zero = B.buildFConstant(S64, 0.0); 1795 const auto One = B.buildFConstant(S64, 1.0); 1796 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1797 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1798 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1799 auto Add = B.buildSelect(S64, And, One, Zero); 1800 1801 // TODO: Should this propagate fast-math-flags? 1802 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1803 return true; 1804 } 1805 1806 static MachineInstrBuilder extractF64Exponent(Register Hi, 1807 MachineIRBuilder &B) { 1808 const unsigned FractBits = 52; 1809 const unsigned ExpBits = 11; 1810 LLT S32 = LLT::scalar(32); 1811 1812 auto Const0 = B.buildConstant(S32, FractBits - 32); 1813 auto Const1 = B.buildConstant(S32, ExpBits); 1814 1815 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1816 .addUse(Hi) 1817 .addUse(Const0.getReg(0)) 1818 .addUse(Const1.getReg(0)); 1819 1820 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1821 } 1822 1823 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1824 MachineInstr &MI, MachineRegisterInfo &MRI, 1825 MachineIRBuilder &B) const { 1826 const LLT S1 = LLT::scalar(1); 1827 const LLT S32 = LLT::scalar(32); 1828 const LLT S64 = LLT::scalar(64); 1829 1830 Register Src = MI.getOperand(1).getReg(); 1831 assert(MRI.getType(Src) == S64); 1832 1833 // TODO: Should this use extract since the low half is unused? 1834 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1835 Register Hi = Unmerge.getReg(1); 1836 1837 // Extract the upper half, since this is where we will find the sign and 1838 // exponent. 1839 auto Exp = extractF64Exponent(Hi, B); 1840 1841 const unsigned FractBits = 52; 1842 1843 // Extract the sign bit. 1844 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1845 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1846 1847 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1848 1849 const auto Zero32 = B.buildConstant(S32, 0); 1850 1851 // Extend back to 64-bits. 
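// The sign sits in the high word, so pair it with a zero low half. Shifting
// FractMask right by the unbiased exponent and inverting yields a mask that
// clears the fraction bits below the binary point; exponents < 0 produce a
// signed zero and exponents > 51 mean the value is already integral.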
1852 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1853 1854 auto Shr = B.buildAShr(S64, FractMask, Exp); 1855 auto Not = B.buildNot(S64, Shr); 1856 auto Tmp0 = B.buildAnd(S64, Src, Not); 1857 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1858 1859 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1860 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1861 1862 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1863 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1864 MI.eraseFromParent(); 1865 return true; 1866 } 1867 1868 bool AMDGPULegalizerInfo::legalizeITOFP( 1869 MachineInstr &MI, MachineRegisterInfo &MRI, 1870 MachineIRBuilder &B, bool Signed) const { 1871 1872 Register Dst = MI.getOperand(0).getReg(); 1873 Register Src = MI.getOperand(1).getReg(); 1874 1875 const LLT S64 = LLT::scalar(64); 1876 const LLT S32 = LLT::scalar(32); 1877 1878 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1879 1880 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1881 1882 auto CvtHi = Signed ? 1883 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1884 B.buildUITOFP(S64, Unmerge.getReg(1)); 1885 1886 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1887 1888 auto ThirtyTwo = B.buildConstant(S32, 32); 1889 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1890 .addUse(CvtHi.getReg(0)) 1891 .addUse(ThirtyTwo.getReg(0)); 1892 1893 // TODO: Should this propagate fast-math-flags? 1894 B.buildFAdd(Dst, LdExp, CvtLo); 1895 MI.eraseFromParent(); 1896 return true; 1897 } 1898 1899 // TODO: Copied from DAG implementation. Verify logic and document how this 1900 // actually works. 1901 bool AMDGPULegalizerInfo::legalizeFPTOI( 1902 MachineInstr &MI, MachineRegisterInfo &MRI, 1903 MachineIRBuilder &B, bool Signed) const { 1904 1905 Register Dst = MI.getOperand(0).getReg(); 1906 Register Src = MI.getOperand(1).getReg(); 1907 1908 const LLT S64 = LLT::scalar(64); 1909 const LLT S32 = LLT::scalar(32); 1910 1911 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1912 1913 unsigned Flags = MI.getFlags(); 1914 1915 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1916 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1917 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1918 1919 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1920 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1921 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1922 1923 auto Hi = Signed ? 
1924 B.buildFPTOSI(S32, FloorMul) : 1925 B.buildFPTOUI(S32, FloorMul); 1926 auto Lo = B.buildFPTOUI(S32, Fma); 1927 1928 B.buildMerge(Dst, { Lo, Hi }); 1929 MI.eraseFromParent(); 1930 1931 return true; 1932 } 1933 1934 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1935 MachineInstr &MI) const { 1936 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1937 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1938 1939 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1940 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1941 1942 // With ieee_mode disabled, the instructions have the correct behavior 1943 // already for G_FMINNUM/G_FMAXNUM 1944 if (!MFI->getMode().IEEE) 1945 return !IsIEEEOp; 1946 1947 if (IsIEEEOp) 1948 return true; 1949 1950 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1951 } 1952 1953 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1954 MachineInstr &MI, MachineRegisterInfo &MRI, 1955 MachineIRBuilder &B) const { 1956 // TODO: Should move some of this into LegalizerHelper. 1957 1958 // TODO: Promote dynamic indexing of s16 to s32 1959 1960 // FIXME: Artifact combiner probably should have replaced the truncated 1961 // constant before this, so we shouldn't need 1962 // getConstantVRegValWithLookThrough. 1963 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1964 MI.getOperand(2).getReg(), MRI); 1965 if (!IdxVal) // Dynamic case will be selected to register indexing. 1966 return true; 1967 1968 Register Dst = MI.getOperand(0).getReg(); 1969 Register Vec = MI.getOperand(1).getReg(); 1970 1971 LLT VecTy = MRI.getType(Vec); 1972 LLT EltTy = VecTy.getElementType(); 1973 assert(EltTy == MRI.getType(Dst)); 1974 1975 if (IdxVal->Value < VecTy.getNumElements()) 1976 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1977 else 1978 B.buildUndef(Dst); 1979 1980 MI.eraseFromParent(); 1981 return true; 1982 } 1983 1984 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1985 MachineInstr &MI, MachineRegisterInfo &MRI, 1986 MachineIRBuilder &B) const { 1987 // TODO: Should move some of this into LegalizerHelper. 1988 1989 // TODO: Promote dynamic indexing of s16 to s32 1990 1991 // FIXME: Artifact combiner probably should have replaced the truncated 1992 // constant before this, so we shouldn't need 1993 // getConstantVRegValWithLookThrough. 1994 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1995 MI.getOperand(3).getReg(), MRI); 1996 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1997 return true; 1998 1999 Register Dst = MI.getOperand(0).getReg(); 2000 Register Vec = MI.getOperand(1).getReg(); 2001 Register Ins = MI.getOperand(2).getReg(); 2002 2003 LLT VecTy = MRI.getType(Vec); 2004 LLT EltTy = VecTy.getElementType(); 2005 assert(EltTy == MRI.getType(Ins)); 2006 2007 if (IdxVal->Value < VecTy.getNumElements()) 2008 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2009 else 2010 B.buildUndef(Dst); 2011 2012 MI.eraseFromParent(); 2013 return true; 2014 } 2015 2016 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 MachineIRBuilder &B) const { 2019 const LLT V2S16 = LLT::vector(2, 16); 2020 2021 Register Dst = MI.getOperand(0).getReg(); 2022 Register Src0 = MI.getOperand(1).getReg(); 2023 LLT DstTy = MRI.getType(Dst); 2024 LLT SrcTy = MRI.getType(Src0); 2025 2026 if (SrcTy == V2S16 && DstTy == V2S16 && 2027 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2028 return true; 2029 2030 MachineIRBuilder HelperBuilder(MI); 2031 GISelObserverWrapper DummyObserver; 2032 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2033 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2034 } 2035 2036 bool AMDGPULegalizerInfo::legalizeSinCos( 2037 MachineInstr &MI, MachineRegisterInfo &MRI, 2038 MachineIRBuilder &B) const { 2039 2040 Register DstReg = MI.getOperand(0).getReg(); 2041 Register SrcReg = MI.getOperand(1).getReg(); 2042 LLT Ty = MRI.getType(DstReg); 2043 unsigned Flags = MI.getFlags(); 2044 2045 Register TrigVal; 2046 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2047 if (ST.hasTrigReducedRange()) { 2048 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2049 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2050 .addUse(MulVal.getReg(0)) 2051 .setMIFlags(Flags).getReg(0); 2052 } else 2053 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2054 2055 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2056 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2057 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2058 .addUse(TrigVal) 2059 .setMIFlags(Flags); 2060 MI.eraseFromParent(); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2065 MachineIRBuilder &B, 2066 const GlobalValue *GV, 2067 int64_t Offset, 2068 unsigned GAFlags) const { 2069 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2070 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2071 // to the following code sequence: 2072 // 2073 // For constant address space: 2074 // s_getpc_b64 s[0:1] 2075 // s_add_u32 s0, s0, $symbol 2076 // s_addc_u32 s1, s1, 0 2077 // 2078 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2079 // a fixup or relocation is emitted to replace $symbol with a literal 2080 // constant, which is a pc-relative offset from the encoding of the $symbol 2081 // operand to the global variable. 
2082 // 2083 // For global address space: 2084 // s_getpc_b64 s[0:1] 2085 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2086 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2087 // 2088 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2089 // fixups or relocations are emitted to replace $symbol@*@lo and 2090 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2091 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2092 // operand to the global variable. 2093 // 2094 // What we want here is an offset from the value returned by s_getpc 2095 // (which is the address of the s_add_u32 instruction) to the global 2096 // variable, but since the encoding of $symbol starts 4 bytes after the start 2097 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2098 // small. This requires us to add 4 to the global variable offset in order to 2099 // compute the correct address. 2100 2101 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2102 2103 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2104 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2105 2106 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2107 .addDef(PCReg); 2108 2109 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2110 if (GAFlags == SIInstrInfo::MO_NONE) 2111 MIB.addImm(0); 2112 else 2113 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2114 2115 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2116 2117 if (PtrTy.getSizeInBits() == 32) 2118 B.buildExtract(DstReg, PCReg, 0); 2119 return true; 2120 } 2121 2122 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2123 MachineInstr &MI, MachineRegisterInfo &MRI, 2124 MachineIRBuilder &B) const { 2125 Register DstReg = MI.getOperand(0).getReg(); 2126 LLT Ty = MRI.getType(DstReg); 2127 unsigned AS = Ty.getAddressSpace(); 2128 2129 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2130 MachineFunction &MF = B.getMF(); 2131 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2132 2133 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2134 if (!MFI->isEntryFunction()) { 2135 const Function &Fn = MF.getFunction(); 2136 DiagnosticInfoUnsupported BadLDSDecl( 2137 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2138 DS_Warning); 2139 Fn.getContext().diagnose(BadLDSDecl); 2140 2141 // We currently don't have a way to correctly allocate LDS objects that 2142 // aren't directly associated with a kernel. We do force inlining of 2143 // functions that use local objects. However, if these dead functions are 2144 // not eliminated, we don't want a compile time error. Just emit a warning 2145 // and a trap, since there should be no callable path here. 2146 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2147 B.buildUndef(DstReg); 2148 MI.eraseFromParent(); 2149 return true; 2150 } 2151 2152 // TODO: We could emit code to handle the initialization somewhere. 
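// For LDS globals without a defined initializer, either leave the
// G_GLOBAL_VALUE in place tagged with an absolute-address flag, or
// materialize the LDS offset allocated for this kernel as a constant (see
// below).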
2153 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2154 const SITargetLowering *TLI = ST.getTargetLowering();
2155 if (!TLI->shouldUseLDSConstAddress(GV)) {
2156 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2157 return true; // Leave in place.
2158 }
2159
2160 B.buildConstant(
2161 DstReg,
2162 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2163 MI.eraseFromParent();
2164 return true;
2165 }
2166
2167 const Function &Fn = MF.getFunction();
2168 DiagnosticInfoUnsupported BadInit(
2169 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2170 Fn.getContext().diagnose(BadInit);
2171 return true;
2172 }
2173
2174 const SITargetLowering *TLI = ST.getTargetLowering();
2175
2176 if (TLI->shouldEmitFixup(GV)) {
2177 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2178 MI.eraseFromParent();
2179 return true;
2180 }
2181
2182 if (TLI->shouldEmitPCReloc(GV)) {
2183 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2184 MI.eraseFromParent();
2185 return true;
2186 }
2187
2188 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2189 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2190
2191 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2192 MachinePointerInfo::getGOT(MF),
2193 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2194 MachineMemOperand::MOInvariant,
2195 8 /*Size*/, Align(8));
2196
2197 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2198
2199 if (Ty.getSizeInBits() == 32) {
2200 // Truncate if this is a 32-bit constant address.
2201 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2202 B.buildExtract(DstReg, Load, 0);
2203 } else
2204 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2205
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 bool AMDGPULegalizerInfo::legalizeLoad(
2211 MachineInstr &MI, MachineRegisterInfo &MRI,
2212 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2213 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2214 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2215 Observer.changingInstr(MI);
2216 MI.getOperand(1).setReg(Cast.getReg(0));
2217 Observer.changedInstr(MI);
2218 return true;
2219 }
2220
2221 bool AMDGPULegalizerInfo::legalizeFMad(
2222 MachineInstr &MI, MachineRegisterInfo &MRI,
2223 MachineIRBuilder &B) const {
2224 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2225 assert(Ty.isScalar());
2226
2227 MachineFunction &MF = B.getMF();
2228 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2229
2230 // TODO: Always legal with future ftz flag.
2231 // FIXME: Do we need just output?
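// G_FMAD is kept as-is only when denormals are flushed for the result type;
// otherwise it is expanded by LegalizerHelper::lowerFMad below.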
2232 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2233 return true; 2234 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2235 return true; 2236 2237 MachineIRBuilder HelperBuilder(MI); 2238 GISelObserverWrapper DummyObserver; 2239 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2240 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2241 } 2242 2243 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2244 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2245 Register DstReg = MI.getOperand(0).getReg(); 2246 Register PtrReg = MI.getOperand(1).getReg(); 2247 Register CmpVal = MI.getOperand(2).getReg(); 2248 Register NewVal = MI.getOperand(3).getReg(); 2249 2250 assert(SITargetLowering::isFlatGlobalAddrSpace( 2251 MRI.getType(PtrReg).getAddressSpace()) && 2252 "this should not have been custom lowered"); 2253 2254 LLT ValTy = MRI.getType(CmpVal); 2255 LLT VecTy = LLT::vector(2, ValTy); 2256 2257 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2258 2259 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2260 .addDef(DstReg) 2261 .addUse(PtrReg) 2262 .addUse(PackedVal) 2263 .setMemRefs(MI.memoperands()); 2264 2265 MI.eraseFromParent(); 2266 return true; 2267 } 2268 2269 bool AMDGPULegalizerInfo::legalizeFlog( 2270 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2271 Register Dst = MI.getOperand(0).getReg(); 2272 Register Src = MI.getOperand(1).getReg(); 2273 LLT Ty = B.getMRI()->getType(Dst); 2274 unsigned Flags = MI.getFlags(); 2275 2276 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2277 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2278 2279 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2280 MI.eraseFromParent(); 2281 return true; 2282 } 2283 2284 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2285 MachineIRBuilder &B) const { 2286 Register Dst = MI.getOperand(0).getReg(); 2287 Register Src = MI.getOperand(1).getReg(); 2288 unsigned Flags = MI.getFlags(); 2289 LLT Ty = B.getMRI()->getType(Dst); 2290 2291 auto K = B.buildFConstant(Ty, numbers::log2e); 2292 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2293 B.buildFExp2(Dst, Mul, Flags); 2294 MI.eraseFromParent(); 2295 return true; 2296 } 2297 2298 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2299 MachineIRBuilder &B) const { 2300 Register Dst = MI.getOperand(0).getReg(); 2301 Register Src0 = MI.getOperand(1).getReg(); 2302 Register Src1 = MI.getOperand(2).getReg(); 2303 unsigned Flags = MI.getFlags(); 2304 LLT Ty = B.getMRI()->getType(Dst); 2305 const LLT S16 = LLT::scalar(16); 2306 const LLT S32 = LLT::scalar(32); 2307 2308 if (Ty == S32) { 2309 auto Log = B.buildFLog2(S32, Src0, Flags); 2310 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2311 .addUse(Log.getReg(0)) 2312 .addUse(Src1) 2313 .setMIFlags(Flags); 2314 B.buildFExp2(Dst, Mul, Flags); 2315 } else if (Ty == S16) { 2316 // There's no f16 fmul_legacy, so we need to convert for it. 
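// Same approach as the f32 case: pow(x, y) = exp2(y * log2(x)), with the
// legacy multiply done at f32 and the result truncated back to f16.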
2317 auto Log = B.buildFLog2(S16, Src0, Flags); 2318 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2319 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2320 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2321 .addUse(Ext0.getReg(0)) 2322 .addUse(Ext1.getReg(0)) 2323 .setMIFlags(Flags); 2324 2325 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2326 } else 2327 return false; 2328 2329 MI.eraseFromParent(); 2330 return true; 2331 } 2332 2333 // Find a source register, ignoring any possible source modifiers. 2334 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2335 Register ModSrc = OrigSrc; 2336 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2337 ModSrc = SrcFNeg->getOperand(1).getReg(); 2338 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2339 ModSrc = SrcFAbs->getOperand(1).getReg(); 2340 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2341 ModSrc = SrcFAbs->getOperand(1).getReg(); 2342 return ModSrc; 2343 } 2344 2345 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2346 MachineRegisterInfo &MRI, 2347 MachineIRBuilder &B) const { 2348 2349 const LLT S1 = LLT::scalar(1); 2350 const LLT S64 = LLT::scalar(64); 2351 Register Dst = MI.getOperand(0).getReg(); 2352 Register OrigSrc = MI.getOperand(1).getReg(); 2353 unsigned Flags = MI.getFlags(); 2354 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2355 "this should not have been custom lowered"); 2356 2357 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2358 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2359 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2360 // V_FRACT bug is: 2361 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2362 // 2363 // Convert floor(x) to (x - fract(x)) 2364 2365 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2366 .addUse(OrigSrc) 2367 .setMIFlags(Flags); 2368 2369 // Give source modifier matching some assistance before obscuring a foldable 2370 // pattern. 2371 2372 // TODO: We can avoid the neg on the fract? The input sign to fract 2373 // shouldn't matter? 2374 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2375 2376 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2377 2378 Register Min = MRI.createGenericVirtualRegister(S64); 2379 2380 // We don't need to concern ourselves with the snan handling difference, so 2381 // use the one which will directly select. 2382 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2383 if (MFI->getMode().IEEE) 2384 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2385 else 2386 B.buildFMinNum(Min, Fract, Const, Flags); 2387 2388 Register CorrectedFract = Min; 2389 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2390 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2391 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2392 } 2393 2394 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2395 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2396 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 // Turn an illegal packed v2s16 build vector into bit operations. 2402 // TODO: This should probably be a bitcast action in LegalizerHelper. 
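// The two s16 sources are merged into an s32 and bitcast back to <2 x s16>,
// leaving the actual bit packing to later stages.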
2403 bool AMDGPULegalizerInfo::legalizeBuildVector( 2404 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2405 Register Dst = MI.getOperand(0).getReg(); 2406 const LLT S32 = LLT::scalar(32); 2407 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2408 2409 Register Src0 = MI.getOperand(1).getReg(); 2410 Register Src1 = MI.getOperand(2).getReg(); 2411 assert(MRI.getType(Src0) == LLT::scalar(16)); 2412 2413 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2414 B.buildBitcast(Dst, Merge); 2415 2416 MI.eraseFromParent(); 2417 return true; 2418 } 2419 2420 // Return the use branch instruction, otherwise null if the usage is invalid. 2421 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2422 MachineRegisterInfo &MRI, 2423 MachineInstr *&Br, 2424 MachineBasicBlock *&UncondBrTarget) { 2425 Register CondDef = MI.getOperand(0).getReg(); 2426 if (!MRI.hasOneNonDBGUse(CondDef)) 2427 return nullptr; 2428 2429 MachineBasicBlock *Parent = MI.getParent(); 2430 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2431 if (UseMI.getParent() != Parent || 2432 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2433 return nullptr; 2434 2435 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2436 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2437 if (Next == Parent->end()) { 2438 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2439 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2440 return nullptr; 2441 UncondBrTarget = &*NextMBB; 2442 } else { 2443 if (Next->getOpcode() != AMDGPU::G_BR) 2444 return nullptr; 2445 Br = &*Next; 2446 UncondBrTarget = Br->getOperand(0).getMBB(); 2447 } 2448 2449 return &UseMI; 2450 } 2451 2452 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2453 MachineRegisterInfo &MRI, 2454 Register LiveIn, 2455 Register PhyReg) const { 2456 assert(PhyReg.isPhysical() && "Physical register expected"); 2457 2458 // Insert the live-in copy, if required, by defining destination virtual 2459 // register. 2460 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
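// Only emit the copy if the live-in virtual register has no def yet. The copy
// goes at the top of the entry block so it dominates all uses, and the
// original insertion point is restored afterwards.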
2461 if (!MRI.getVRegDef(LiveIn)) {
2462 // FIXME: Should have scoped insert pt
2463 MachineBasicBlock &OrigInsBB = B.getMBB();
2464 auto OrigInsPt = B.getInsertPt();
2465
2466 MachineBasicBlock &EntryMBB = B.getMF().front();
2467 EntryMBB.addLiveIn(PhyReg);
2468 B.setInsertPt(EntryMBB, EntryMBB.begin());
2469 B.buildCopy(LiveIn, PhyReg);
2470
2471 B.setInsertPt(OrigInsBB, OrigInsPt);
2472 }
2473
2474 return LiveIn;
2475 }
2476
2477 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2478 MachineRegisterInfo &MRI,
2479 Register PhyReg, LLT Ty,
2480 bool InsertLiveInCopy) const {
2481 assert(PhyReg.isPhysical() && "Physical register expected");
2482
2483 // Get or create the virtual live-in register.
2484 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2485 if (!LiveIn) {
2486 LiveIn = MRI.createGenericVirtualRegister(Ty);
2487 MRI.addLiveIn(PhyReg, LiveIn);
2488 }
2489
2490 // When the copy that is actually required goes from the virtual register to
2491 // the physical register (and will be inserted later), there is no need to
2492 // insert a live-in copy from the physical register to the virtual register.
2493 if (!InsertLiveInCopy)
2494 return LiveIn;
2495
2496 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2497 }
2498
2499 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2500 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2501 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2502 const ArgDescriptor *Arg;
2503 const TargetRegisterClass *RC;
2504 LLT ArgTy;
2505 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2506 if (!Arg) {
2507 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2508 return nullptr;
2509 }
2510 return Arg;
2511 }
2512
2513 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2514 const ArgDescriptor *Arg) const {
2515 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2516 return false; // TODO: Handle these
2517
2518 Register SrcReg = Arg->getRegister();
2519 assert(SrcReg.isPhysical() && "Physical register expected");
2520 assert(DstReg.isVirtual() && "Virtual register expected");
2521
2522 MachineRegisterInfo &MRI = *B.getMRI();
2523
2524 LLT Ty = MRI.getType(DstReg);
2525 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2526
2527 if (Arg->isMasked()) {
2528 // TODO: Should we try to emit this once in the entry block?
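// A masked argument is a bitfield packed into a shared input register (e.g.
// the packed workitem IDs); shift the field down to bit 0 and mask off the
// neighboring fields.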
2529 const LLT S32 = LLT::scalar(32); 2530 const unsigned Mask = Arg->getMask(); 2531 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2532 2533 Register AndMaskSrc = LiveIn; 2534 2535 if (Shift != 0) { 2536 auto ShiftAmt = B.buildConstant(S32, Shift); 2537 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2538 } 2539 2540 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2541 } else { 2542 B.buildCopy(DstReg, LiveIn); 2543 } 2544 2545 return true; 2546 } 2547 2548 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2549 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2550 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2551 2552 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2553 if (!Arg) 2554 return false; 2555 2556 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2557 return false; 2558 2559 MI.eraseFromParent(); 2560 return true; 2561 } 2562 2563 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2564 MachineRegisterInfo &MRI, 2565 MachineIRBuilder &B) const { 2566 Register Dst = MI.getOperand(0).getReg(); 2567 LLT DstTy = MRI.getType(Dst); 2568 LLT S16 = LLT::scalar(16); 2569 LLT S32 = LLT::scalar(32); 2570 LLT S64 = LLT::scalar(64); 2571 2572 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2573 return true; 2574 2575 if (DstTy == S16) 2576 return legalizeFDIV16(MI, MRI, B); 2577 if (DstTy == S32) 2578 return legalizeFDIV32(MI, MRI, B); 2579 if (DstTy == S64) 2580 return legalizeFDIV64(MI, MRI, B); 2581 2582 return false; 2583 } 2584 2585 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2586 Register DstReg, 2587 Register X, 2588 Register Y, 2589 bool IsDiv) const { 2590 const LLT S1 = LLT::scalar(1); 2591 const LLT S32 = LLT::scalar(32); 2592 2593 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2594 // algorithm used here. 2595 2596 // Initial estimate of inv(y). 2597 auto FloatY = B.buildUITOFP(S32, Y); 2598 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2599 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2600 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2601 auto Z = B.buildFPTOUI(S32, ScaledY); 2602 2603 // One round of UNR. 2604 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2605 auto NegYZ = B.buildMul(S32, NegY, Z); 2606 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2607 2608 // Quotient/remainder estimate. 2609 auto Q = B.buildUMulH(S32, X, Z); 2610 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2611 2612 // First quotient/remainder refinement. 2613 auto One = B.buildConstant(S32, 1); 2614 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2615 if (IsDiv) 2616 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2617 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2618 2619 // Second quotient/remainder refinement. 
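// (After the Newton-Raphson step the quotient estimate can only be off by a
// small amount, so two compare-and-select corrections are sufficient; see the
// expandDivRem32 comment referenced above.)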
2620 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2621 if (IsDiv)
2622 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2623 else
2624 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2625 }
2626
2627 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2628 MachineRegisterInfo &MRI,
2629 MachineIRBuilder &B) const {
2630 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2631 Register DstReg = MI.getOperand(0).getReg();
2632 Register Num = MI.getOperand(1).getReg();
2633 Register Den = MI.getOperand(2).getReg();
2634 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2635 MI.eraseFromParent();
2636 return true;
2637 }
2638
2639 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2640 //
2641 // Return lo, hi of result
2642 //
2643 // %cvt.lo = G_UITOFP Val.lo
2644 // %cvt.hi = G_UITOFP Val.hi
2645 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2646 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2647 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2648 // %mul2 = G_FMUL %mul1, 2**(-32)
2649 // %trunc = G_INTRINSIC_TRUNC %mul2
2650 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2651 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2652 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2653 Register Val) {
2654 const LLT S32 = LLT::scalar(32);
2655 auto Unmerge = B.buildUnmerge(S32, Val);
2656
2657 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2658 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2659
2660 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2661 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2662
2663 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2664 auto Mul1 =
2665 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2666
2667 // 2**(-32)
2668 auto Mul2 =
2669 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2670 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2671
2672 // -(2**32)
2673 auto Mad2 = B.buildFMAD(S32, Trunc,
2674 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2675
2676 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2677 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2678
2679 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2680 }
2681
2682 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2683 Register DstReg,
2684 Register Numer,
2685 Register Denom,
2686 bool IsDiv) const {
2687 const LLT S32 = LLT::scalar(32);
2688 const LLT S64 = LLT::scalar(64);
2689 const LLT S1 = LLT::scalar(1);
2690 Register RcpLo, RcpHi;
2691
2692 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2693
2694 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2695
2696 auto Zero64 = B.buildConstant(S64, 0);
2697 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2698
2699 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2700 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2701
2702 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2703 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2704 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2705
2706 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2707 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2708 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2709 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2710
2711 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2712 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2713 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2714 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2715 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2716
2717 auto Zero32 =
B.buildConstant(S32, 0); 2718 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2719 auto Add2_HiC = 2720 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2721 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2722 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2723 2724 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2725 Register NumerLo = UnmergeNumer.getReg(0); 2726 Register NumerHi = UnmergeNumer.getReg(1); 2727 2728 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2729 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2730 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2731 Register Mul3_Lo = UnmergeMul3.getReg(0); 2732 Register Mul3_Hi = UnmergeMul3.getReg(1); 2733 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2734 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2735 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2736 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2737 2738 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2739 Register DenomLo = UnmergeDenom.getReg(0); 2740 Register DenomHi = UnmergeDenom.getReg(1); 2741 2742 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2743 auto C1 = B.buildSExt(S32, CmpHi); 2744 2745 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2746 auto C2 = B.buildSExt(S32, CmpLo); 2747 2748 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2749 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2750 2751 // TODO: Here and below portions of the code can be enclosed into if/endif. 2752 // Currently control flow is unconditional and we have 4 selects after 2753 // potential endif to substitute PHIs. 2754 2755 // if C3 != 0 ... 2756 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2757 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2758 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2759 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2760 2761 auto One64 = B.buildConstant(S64, 1); 2762 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2763 2764 auto C4 = 2765 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2766 auto C5 = 2767 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2768 auto C6 = B.buildSelect( 2769 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2770 2771 // if (C6 != 0) 2772 auto Add4 = B.buildAdd(S64, Add3, One64); 2773 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2774 2775 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2776 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2777 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2778 2779 // endif C6 2780 // endif C3 2781 2782 if (IsDiv) { 2783 auto Sel1 = B.buildSelect( 2784 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2785 B.buildSelect(DstReg, 2786 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2787 } else { 2788 auto Sel2 = B.buildSelect( 2789 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2790 B.buildSelect(DstReg, 2791 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2792 } 2793 } 2794 2795 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2796 MachineRegisterInfo &MRI, 2797 MachineIRBuilder &B) const { 2798 const LLT S64 = LLT::scalar(64); 2799 const LLT S32 = LLT::scalar(32); 2800 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2801 Register DstReg = MI.getOperand(0).getReg(); 2802 Register Num 
= MI.getOperand(1).getReg(); 2803 Register Den = MI.getOperand(2).getReg(); 2804 LLT Ty = MRI.getType(DstReg); 2805 2806 if (Ty == S32) 2807 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2808 else if (Ty == S64) 2809 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2810 else 2811 return false; 2812 2813 MI.eraseFromParent(); 2814 return true; 2815 2816 } 2817 2818 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2819 MachineRegisterInfo &MRI, 2820 MachineIRBuilder &B) const { 2821 const LLT S64 = LLT::scalar(64); 2822 const LLT S32 = LLT::scalar(32); 2823 2824 Register DstReg = MI.getOperand(0).getReg(); 2825 const LLT Ty = MRI.getType(DstReg); 2826 if (Ty != S32 && Ty != S64) 2827 return false; 2828 2829 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2830 2831 Register LHS = MI.getOperand(1).getReg(); 2832 Register RHS = MI.getOperand(2).getReg(); 2833 2834 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2835 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2836 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2837 2838 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2839 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2840 2841 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2842 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2843 2844 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2845 if (Ty == S32) 2846 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2847 else 2848 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2849 2850 Register Sign; 2851 if (IsDiv) 2852 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2853 else 2854 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2855 2856 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2857 B.buildSub(DstReg, UDivRem, Sign); 2858 2859 MI.eraseFromParent(); 2860 return true; 2861 } 2862 2863 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2864 MachineRegisterInfo &MRI, 2865 MachineIRBuilder &B) const { 2866 Register Res = MI.getOperand(0).getReg(); 2867 Register LHS = MI.getOperand(1).getReg(); 2868 Register RHS = MI.getOperand(2).getReg(); 2869 2870 uint16_t Flags = MI.getFlags(); 2871 2872 LLT ResTy = MRI.getType(Res); 2873 LLT S32 = LLT::scalar(32); 2874 LLT S64 = LLT::scalar(64); 2875 2876 const MachineFunction &MF = B.getMF(); 2877 bool Unsafe = 2878 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2879 2880 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2881 return false; 2882 2883 if (!Unsafe && ResTy == S32 && 2884 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2885 return false; 2886 2887 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2888 // 1 / x -> RCP(x) 2889 if (CLHS->isExactlyValue(1.0)) { 2890 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2891 .addUse(RHS) 2892 .setMIFlags(Flags); 2893 2894 MI.eraseFromParent(); 2895 return true; 2896 } 2897 2898 // -1 / x -> RCP( FNEG(x) ) 2899 if (CLHS->isExactlyValue(-1.0)) { 2900 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2901 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2902 .addUse(FNeg.getReg(0)) 2903 .setMIFlags(Flags); 2904 2905 MI.eraseFromParent(); 2906 return true; 2907 } 2908 } 2909 2910 // x / y -> x * (1.0 / y) 2911 if (Unsafe) { 2912 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2913 .addUse(RHS) 2914 .setMIFlags(Flags); 2915 B.buildFMul(Res, LHS, RCP, Flags); 2916 2917 MI.eraseFromParent(); 2918 return true; 2919 } 2920 2921 return false; 2922 } 2923 2924 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2925 MachineRegisterInfo &MRI, 2926 MachineIRBuilder &B) const { 2927 Register Res = MI.getOperand(0).getReg(); 2928 Register LHS = MI.getOperand(1).getReg(); 2929 Register RHS = MI.getOperand(2).getReg(); 2930 2931 uint16_t Flags = MI.getFlags(); 2932 2933 LLT S16 = LLT::scalar(16); 2934 LLT S32 = LLT::scalar(32); 2935 2936 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2937 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2938 2939 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2940 .addUse(RHSExt.getReg(0)) 2941 .setMIFlags(Flags); 2942 2943 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2944 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2945 2946 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2947 .addUse(RDst.getReg(0)) 2948 .addUse(RHS) 2949 .addUse(LHS) 2950 .setMIFlags(Flags); 2951 2952 MI.eraseFromParent(); 2953 return true; 2954 } 2955 2956 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2957 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2958 static void toggleSPDenormMode(bool Enable, 2959 MachineIRBuilder &B, 2960 const GCNSubtarget &ST, 2961 AMDGPU::SIModeRegisterDefaults Mode) { 2962 // Set SP denorm mode to this value. 2963 unsigned SPDenormMode = 2964 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2965 2966 if (ST.hasDenormModeInst()) { 2967 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2968 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2969 2970 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2971 B.buildInstr(AMDGPU::S_DENORM_MODE) 2972 .addImm(NewDenormModeValue); 2973 2974 } else { 2975 // Select FP32 bit field in mode register. 2976 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2977 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2978 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2979 2980 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2981 .addImm(SPDenormMode) 2982 .addImm(SPDenormModeBitField); 2983 } 2984 } 2985 2986 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2987 MachineRegisterInfo &MRI, 2988 MachineIRBuilder &B) const { 2989 Register Res = MI.getOperand(0).getReg(); 2990 Register LHS = MI.getOperand(1).getReg(); 2991 Register RHS = MI.getOperand(2).getReg(); 2992 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2993 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2994 2995 uint16_t Flags = MI.getFlags(); 2996 2997 LLT S32 = LLT::scalar(32); 2998 LLT S1 = LLT::scalar(1); 2999 3000 auto One = B.buildFConstant(S32, 1.0f); 3001 3002 auto DenominatorScaled = 3003 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3004 .addUse(LHS) 3005 .addUse(RHS) 3006 .addImm(0) 3007 .setMIFlags(Flags); 3008 auto NumeratorScaled = 3009 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3010 .addUse(LHS) 3011 .addUse(RHS) 3012 .addImm(1) 3013 .setMIFlags(Flags); 3014 3015 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3016 .addUse(DenominatorScaled.getReg(0)) 3017 .setMIFlags(Flags); 3018 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3019 3020 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3021 // aren't modeled as reading it. 
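// If the default FP mode flushes FP32 denormals, temporarily enable them
// around the FMA refinement sequence (and restore the mode afterwards), since
// the scaled intermediate values may be denormal.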
3022 if (!Mode.allFP32Denormals()) 3023 toggleSPDenormMode(true, B, ST, Mode); 3024 3025 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3026 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3027 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3028 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3029 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3030 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3031 3032 if (!Mode.allFP32Denormals()) 3033 toggleSPDenormMode(false, B, ST, Mode); 3034 3035 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3036 .addUse(Fma4.getReg(0)) 3037 .addUse(Fma1.getReg(0)) 3038 .addUse(Fma3.getReg(0)) 3039 .addUse(NumeratorScaled.getReg(1)) 3040 .setMIFlags(Flags); 3041 3042 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3043 .addUse(Fmas.getReg(0)) 3044 .addUse(RHS) 3045 .addUse(LHS) 3046 .setMIFlags(Flags); 3047 3048 MI.eraseFromParent(); 3049 return true; 3050 } 3051 3052 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3053 MachineRegisterInfo &MRI, 3054 MachineIRBuilder &B) const { 3055 Register Res = MI.getOperand(0).getReg(); 3056 Register LHS = MI.getOperand(1).getReg(); 3057 Register RHS = MI.getOperand(2).getReg(); 3058 3059 uint16_t Flags = MI.getFlags(); 3060 3061 LLT S64 = LLT::scalar(64); 3062 LLT S1 = LLT::scalar(1); 3063 3064 auto One = B.buildFConstant(S64, 1.0); 3065 3066 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3067 .addUse(LHS) 3068 .addUse(RHS) 3069 .addImm(0) 3070 .setMIFlags(Flags); 3071 3072 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3073 3074 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3075 .addUse(DivScale0.getReg(0)) 3076 .setMIFlags(Flags); 3077 3078 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3079 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3080 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3081 3082 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3083 .addUse(LHS) 3084 .addUse(RHS) 3085 .addImm(1) 3086 .setMIFlags(Flags); 3087 3088 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3089 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3090 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3091 3092 Register Scale; 3093 if (!ST.hasUsableDivScaleConditionOutput()) { 3094 // Workaround a hardware bug on SI where the condition output from div_scale 3095 // is not usable. 
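// Recompute the condition manually: compare the high 32 bits of each
// div_scale result against the high half of the corresponding source operand
// and XOR the two comparisons.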
3096 3097 LLT S32 = LLT::scalar(32); 3098 3099 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3100 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3101 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3102 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3103 3104 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3105 Scale1Unmerge.getReg(1)); 3106 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3107 Scale0Unmerge.getReg(1)); 3108 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3109 } else { 3110 Scale = DivScale1.getReg(1); 3111 } 3112 3113 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3114 .addUse(Fma4.getReg(0)) 3115 .addUse(Fma3.getReg(0)) 3116 .addUse(Mul.getReg(0)) 3117 .addUse(Scale) 3118 .setMIFlags(Flags); 3119 3120 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3121 .addUse(Fmas.getReg(0)) 3122 .addUse(RHS) 3123 .addUse(LHS) 3124 .setMIFlags(Flags); 3125 3126 MI.eraseFromParent(); 3127 return true; 3128 } 3129 3130 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3131 MachineRegisterInfo &MRI, 3132 MachineIRBuilder &B) const { 3133 Register Res = MI.getOperand(0).getReg(); 3134 Register LHS = MI.getOperand(2).getReg(); 3135 Register RHS = MI.getOperand(3).getReg(); 3136 uint16_t Flags = MI.getFlags(); 3137 3138 LLT S32 = LLT::scalar(32); 3139 LLT S1 = LLT::scalar(1); 3140 3141 auto Abs = B.buildFAbs(S32, RHS, Flags); 3142 const APFloat C0Val(1.0f); 3143 3144 auto C0 = B.buildConstant(S32, 0x6f800000); 3145 auto C1 = B.buildConstant(S32, 0x2f800000); 3146 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3147 3148 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3149 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3150 3151 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3152 3153 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3154 .addUse(Mul0.getReg(0)) 3155 .setMIFlags(Flags); 3156 3157 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3158 3159 B.buildFMul(Res, Sel, Mul1, Flags); 3160 3161 MI.eraseFromParent(); 3162 return true; 3163 } 3164 3165 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3166 MachineRegisterInfo &MRI, 3167 MachineIRBuilder &B) const { 3168 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3169 uint64_t Offset = 3170 ST.getTargetLowering()->getImplicitParameterOffset( 3171 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3172 LLT DstTy = MRI.getType(DstReg); 3173 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3174 3175 const ArgDescriptor *Arg; 3176 const TargetRegisterClass *RC; 3177 LLT ArgTy; 3178 std::tie(Arg, RC, ArgTy) = 3179 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3180 if (!Arg) 3181 return false; 3182 3183 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3184 if (!loadInputValue(KernargPtrReg, B, Arg)) 3185 return false; 3186 3187 // FIXME: This should be nuw 3188 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3189 return true; 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3193 MachineRegisterInfo &MRI, 3194 MachineIRBuilder &B) const { 3195 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3196 if (!MFI->isEntryFunction()) { 3197 return legalizePreloadedArgIntrin(MI, MRI, B, 3198 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3199 } 3200 3201 Register DstReg = MI.getOperand(0).getReg(); 3202 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3203 return false; 3204 3205 MI.eraseFromParent(); 3206 return true; 3207 } 3208 3209 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3210 MachineRegisterInfo &MRI, 3211 MachineIRBuilder &B, 3212 unsigned AddrSpace) const { 3213 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3214 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3215 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3216 MI.eraseFromParent(); 3217 return true; 3218 } 3219 3220 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3221 // offset (the offset that is included in bounds checking and swizzling, to be 3222 // split between the instruction's voffset and immoffset fields) and soffset 3223 // (the offset that is excluded from bounds checking and swizzling, to go in 3224 // the instruction's soffset field). This function takes the first kind of 3225 // offset and figures out how to split it between voffset and immoffset. 3226 std::tuple<Register, unsigned, unsigned> 3227 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3228 Register OrigOffset) const { 3229 const unsigned MaxImm = 4095; 3230 Register BaseReg; 3231 unsigned TotalConstOffset; 3232 MachineInstr *OffsetDef; 3233 const LLT S32 = LLT::scalar(32); 3234 3235 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3236 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3237 3238 unsigned ImmOffset = TotalConstOffset; 3239 3240 // If the immediate value is too big for the immoffset field, put the value 3241 // and -4096 into the immoffset field so that the value that is copied/added 3242 // for the voffset field is a multiple of 4096, and it stands more chance 3243 // of being CSEd with the copy/add for another similar load/store. 3244 // However, do not do that rounding down to a multiple of 4096 if that is a 3245 // negative number, as it appears to be illegal to have a negative offset 3246 // in the vgpr, even if adding the immediate offset makes it positive. 3247 unsigned Overflow = ImmOffset & ~MaxImm; 3248 ImmOffset -= Overflow; 3249 if ((int32_t)Overflow < 0) { 3250 Overflow += ImmOffset; 3251 ImmOffset = 0; 3252 } 3253 3254 if (Overflow != 0) { 3255 if (!BaseReg) { 3256 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3257 } else { 3258 auto OverflowVal = B.buildConstant(S32, Overflow); 3259 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3260 } 3261 } 3262 3263 if (!BaseReg) 3264 BaseReg = B.buildConstant(S32, 0).getReg(0); 3265 3266 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3267 } 3268 3269 /// Handle register layout difference for f16 images for some subtargets. 
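/// With unpacked D16, each 16-bit element occupies its own 32-bit register,
/// so unmerge the packed <N x s16> value and any-extend each element to s32.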
3270 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3271 MachineRegisterInfo &MRI, 3272 Register Reg) const { 3273 if (!ST.hasUnpackedD16VMem()) 3274 return Reg; 3275 3276 const LLT S16 = LLT::scalar(16); 3277 const LLT S32 = LLT::scalar(32); 3278 LLT StoreVT = MRI.getType(Reg); 3279 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3280 3281 auto Unmerge = B.buildUnmerge(S16, Reg); 3282 3283 SmallVector<Register, 4> WideRegs; 3284 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3285 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3286 3287 int NumElts = StoreVT.getNumElements(); 3288 3289 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3290 } 3291 3292 Register AMDGPULegalizerInfo::fixStoreSourceType( 3293 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3294 MachineRegisterInfo *MRI = B.getMRI(); 3295 LLT Ty = MRI->getType(VData); 3296 3297 const LLT S16 = LLT::scalar(16); 3298 3299 // Fixup illegal register types for i8 stores. 3300 if (Ty == LLT::scalar(8) || Ty == S16) { 3301 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3302 return AnyExt; 3303 } 3304 3305 if (Ty.isVector()) { 3306 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3307 if (IsFormat) 3308 return handleD16VData(B, *MRI, VData); 3309 } 3310 } 3311 3312 return VData; 3313 } 3314 3315 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3316 MachineRegisterInfo &MRI, 3317 MachineIRBuilder &B, 3318 bool IsTyped, 3319 bool IsFormat) const { 3320 Register VData = MI.getOperand(1).getReg(); 3321 LLT Ty = MRI.getType(VData); 3322 LLT EltTy = Ty.getScalarType(); 3323 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3324 const LLT S32 = LLT::scalar(32); 3325 3326 VData = fixStoreSourceType(B, VData, IsFormat); 3327 Register RSrc = MI.getOperand(2).getReg(); 3328 3329 MachineMemOperand *MMO = *MI.memoperands_begin(); 3330 const int MemSize = MMO->getSize(); 3331 3332 unsigned ImmOffset; 3333 unsigned TotalOffset; 3334 3335 // The typed intrinsics add an immediate after the registers. 3336 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3337 3338 // The struct intrinsic variants add one additional operand over raw. 3339 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3340 Register VIndex; 3341 int OpOffset = 0; 3342 if (HasVIndex) { 3343 VIndex = MI.getOperand(3).getReg(); 3344 OpOffset = 1; 3345 } 3346 3347 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3348 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3349 3350 unsigned Format = 0; 3351 if (IsTyped) { 3352 Format = MI.getOperand(5 + OpOffset).getImm(); 3353 ++OpOffset; 3354 } 3355 3356 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3357 3358 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3359 if (TotalOffset != 0) 3360 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3361 3362 unsigned Opc; 3363 if (IsTyped) { 3364 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3365 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3366 } else if (IsFormat) { 3367 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3368 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3369 } else { 3370 switch (MemSize) { 3371 case 1: 3372 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3373 break; 3374 case 2: 3375 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3376 break; 3377 default: 3378 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3379 break; 3380 } 3381 } 3382 3383 if (!VIndex) 3384 VIndex = B.buildConstant(S32, 0).getReg(0); 3385 3386 auto MIB = B.buildInstr(Opc) 3387 .addUse(VData) // vdata 3388 .addUse(RSrc) // rsrc 3389 .addUse(VIndex) // vindex 3390 .addUse(VOffset) // voffset 3391 .addUse(SOffset) // soffset 3392 .addImm(ImmOffset); // offset(imm) 3393 3394 if (IsTyped) 3395 MIB.addImm(Format); 3396 3397 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3398 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3399 .addMemOperand(MMO); 3400 3401 MI.eraseFromParent(); 3402 return true; 3403 } 3404 3405 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3406 MachineRegisterInfo &MRI, 3407 MachineIRBuilder &B, 3408 bool IsFormat, 3409 bool IsTyped) const { 3410 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3411 MachineMemOperand *MMO = *MI.memoperands_begin(); 3412 const int MemSize = MMO->getSize(); 3413 const LLT S32 = LLT::scalar(32); 3414 3415 Register Dst = MI.getOperand(0).getReg(); 3416 Register RSrc = MI.getOperand(2).getReg(); 3417 3418 // The typed intrinsics add an immediate after the registers. 3419 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3420 3421 // The struct intrinsic variants add one additional operand over raw. 3422 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3423 Register VIndex; 3424 int OpOffset = 0; 3425 if (HasVIndex) { 3426 VIndex = MI.getOperand(3).getReg(); 3427 OpOffset = 1; 3428 } 3429 3430 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3431 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3432 3433 unsigned Format = 0; 3434 if (IsTyped) { 3435 Format = MI.getOperand(5 + OpOffset).getImm(); 3436 ++OpOffset; 3437 } 3438 3439 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3440 unsigned ImmOffset; 3441 unsigned TotalOffset; 3442 3443 LLT Ty = MRI.getType(Dst); 3444 LLT EltTy = Ty.getScalarType(); 3445 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3446 const bool Unpacked = ST.hasUnpackedD16VMem(); 3447 3448 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3449 if (TotalOffset != 0) 3450 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3451 3452 unsigned Opc; 3453 3454 if (IsTyped) { 3455 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3456 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3457 } else if (IsFormat) { 3458 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)  // vdata
    .addUse(RSrc)        // rsrc
    .addUse(VIndex)      // vindex
    .addUse(VOffset)     // voffset
    .addUse(SOffset)     // soffset
    .addImm(ImmOffset);  // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for extending loads; truncate it back to the
    // original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3529 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3530 B.buildInstr(Opc) 3531 .addDef(MI.getOperand(0).getReg()) 3532 .addUse(MI.getOperand(2).getReg()) 3533 .addUse(MI.getOperand(3).getReg()) 3534 .cloneMemRefs(MI); 3535 MI.eraseFromParent(); 3536 return true; 3537 } 3538 3539 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3540 switch (IntrID) { 3541 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3542 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3544 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3545 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3546 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3547 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3548 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3550 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3551 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3552 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3553 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3554 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3556 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3557 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3558 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3559 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3560 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3561 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3562 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3563 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3565 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3566 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3567 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3568 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3569 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3571 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3572 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3573 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3574 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3575 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3576 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3577 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3578 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3579 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3580 default: 3581 llvm_unreachable("unhandled atomic opcode"); 3582 } 3583 } 3584 3585 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3586 MachineIRBuilder &B, 3587 Intrinsic::ID IID) const { 3588 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3589 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3590 3591 Register Dst = MI.getOperand(0).getReg(); 3592 Register VData = MI.getOperand(2).getReg(); 3593 3594 Register CmpVal; 3595 int OpOffset = 0; 3596 3597 if (IsCmpSwap) { 3598 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3599 ++OpOffset; 3600 } 3601 3602 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3603 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3604 3605 // The struct intrinsic variants add one additional operand over raw. 
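  // Operand layout as seen here (counting the intrinsic ID operand):
  //   raw:    dst, intrin-id, vdata, [cmpval,] rsrc, voffset, soffset, aux
  //   struct: dst, intrin-id, vdata, [cmpval,] rsrc, vindex, voffset, soffset, aux
  // i.e. a raw cmpswap has 8 operands and a struct cmpswap has 9.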
3606 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3607 Register VIndex; 3608 if (HasVIndex) { 3609 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3610 ++OpOffset; 3611 } 3612 3613 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3614 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3615 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3616 3617 MachineMemOperand *MMO = *MI.memoperands_begin(); 3618 3619 unsigned ImmOffset; 3620 unsigned TotalOffset; 3621 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3622 if (TotalOffset != 0) 3623 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3624 3625 if (!VIndex) 3626 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3627 3628 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3629 .addDef(Dst) 3630 .addUse(VData); // vdata 3631 3632 if (IsCmpSwap) 3633 MIB.addReg(CmpVal); 3634 3635 MIB.addUse(RSrc) // rsrc 3636 .addUse(VIndex) // vindex 3637 .addUse(VOffset) // voffset 3638 .addUse(SOffset) // soffset 3639 .addImm(ImmOffset) // offset(imm) 3640 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3641 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3642 .addMemOperand(MMO); 3643 3644 MI.eraseFromParent(); 3645 return true; 3646 } 3647 3648 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3649 /// vector with s16 typed elements. 3650 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3651 SmallVectorImpl<Register> &PackedAddrs, 3652 int AddrIdx, int DimIdx, int EndIdx, 3653 int NumGradients) { 3654 const LLT S16 = LLT::scalar(16); 3655 const LLT V2S16 = LLT::vector(2, 16); 3656 3657 for (int I = AddrIdx; I < EndIdx; ++I) { 3658 MachineOperand &SrcOp = MI.getOperand(I); 3659 if (!SrcOp.isReg()) 3660 continue; // _L to _LZ may have eliminated this. 3661 3662 Register AddrReg = SrcOp.getReg(); 3663 3664 if (I < DimIdx) { 3665 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3666 PackedAddrs.push_back(AddrReg); 3667 } else { 3668 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3669 // derivatives dx/dh and dx/dv are packed with undef. 3670 if (((I + 1) >= EndIdx) || 3671 ((NumGradients / 2) % 2 == 1 && 3672 (I == DimIdx + (NumGradients / 2) - 1 || 3673 I == DimIdx + NumGradients - 1)) || 3674 // Check for _L to _LZ optimization 3675 !MI.getOperand(I + 1).isReg()) { 3676 PackedAddrs.push_back( 3677 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3678 .getReg(0)); 3679 } else { 3680 PackedAddrs.push_back( 3681 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3682 .getReg(0)); 3683 ++I; 3684 } 3685 } 3686 } 3687 } 3688 3689 /// Convert from separate vaddr components to a single vector address register, 3690 /// and replace the remaining operands with $noreg. 3691 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3692 int DimIdx, int NumVAddrs) { 3693 const LLT S32 = LLT::scalar(32); 3694 3695 SmallVector<Register, 8> AddrRegs; 3696 for (int I = 0; I != NumVAddrs; ++I) { 3697 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3698 if (SrcOp.isReg()) { 3699 AddrRegs.push_back(SrcOp.getReg()); 3700 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3701 } 3702 } 3703 3704 int NumAddrRegs = AddrRegs.size(); 3705 if (NumAddrRegs != 1) { 3706 // Round up to 8 elements for v5-v7 3707 // FIXME: Missing intermediate sized register classes and instructions. 
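    // For example, 5, 6 or 7 s32 address components are padded with undefs up
    // to 8 and built into a v8s32, while 2, 3 or 4 components are built into a
    // v2s32/v3s32/v4s32 directly.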
if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
      AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // The target must support the feature and the gradients need to be
      // 16 bit too.
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
        IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3878 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3879 PackEndIdx, NumGradients); 3880 3881 if (!IsA16) { 3882 // Add uncompressed address 3883 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3884 int AddrReg = MI.getOperand(I).getReg(); 3885 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3886 PackedRegs.push_back(AddrReg); 3887 } 3888 } 3889 3890 // See also below in the non-a16 branch 3891 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3892 3893 if (!UseNSA && PackedRegs.size() > 1) { 3894 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3895 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3896 PackedRegs[0] = Concat.getReg(0); 3897 PackedRegs.resize(1); 3898 } 3899 3900 const int NumPacked = PackedRegs.size(); 3901 for (int I = 0; I != NumVAddrs; ++I) { 3902 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3903 if (!SrcOp.isReg()) { 3904 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3905 continue; 3906 } 3907 3908 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3909 3910 if (I < NumPacked) 3911 SrcOp.setReg(PackedRegs[I]); 3912 else 3913 SrcOp.setReg(AMDGPU::NoRegister); 3914 } 3915 } 3916 } else { 3917 // If the register allocator cannot place the address registers contiguously 3918 // without introducing moves, then using the non-sequential address encoding 3919 // is always preferable, since it saves VALU instructions and is usually a 3920 // wash in terms of code size or even better. 3921 // 3922 // However, we currently have no way of hinting to the register allocator 3923 // that MIMG addresses should be placed contiguously when it is possible to 3924 // do so, so force non-NSA for the common 2-address case as a heuristic. 3925 // 3926 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3927 // allocation when possible. 3928 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3929 3930 if (!UseNSA && NumVAddrs > 1) 3931 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3932 } 3933 3934 int Flags = 0; 3935 if (IsA16) 3936 Flags |= 1; 3937 if (IsG16) 3938 Flags |= 2; 3939 MI.addOperand(MachineOperand::CreateImm(Flags)); 3940 3941 if (BaseOpcode->Store) { // No TFE for stores? 3942 // TODO: Handle dmask trim 3943 Register VData = MI.getOperand(1).getReg(); 3944 LLT Ty = MRI->getType(VData); 3945 if (!Ty.isVector() || Ty.getElementType() != S16) 3946 return true; 3947 3948 Register RepackedReg = handleD16VData(B, *MRI, VData); 3949 if (RepackedReg != VData) { 3950 MI.getOperand(1).setReg(RepackedReg); 3951 } 3952 3953 return true; 3954 } 3955 3956 Register DstReg = MI.getOperand(0).getReg(); 3957 LLT Ty = MRI->getType(DstReg); 3958 const LLT EltTy = Ty.getScalarType(); 3959 const bool IsD16 = Ty.getScalarType() == S16; 3960 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3961 3962 // Confirm that the return type is large enough for the dmask specified 3963 if (NumElts < DMaskLanes) 3964 return false; 3965 3966 if (NumElts > 4 || DMaskLanes > 4) 3967 return false; 3968 3969 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3970 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3971 3972 // The raw dword aligned data component of the load. 
The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
4064 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4065 B.buildBitcast(DstReg, ResultRegs[0]); 4066 return true; 4067 } 4068 4069 assert(Ty.isVector()); 4070 4071 if (IsD16) { 4072 // For packed D16 results with TFE enabled, all the data components are 4073 // S32. Cast back to the expected type. 4074 // 4075 // TODO: We don't really need to use load s32 elements. We would only need one 4076 // cast for the TFE result if a multiple of v2s16 was used. 4077 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4078 for (Register &Reg : ResultRegs) 4079 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4080 } else if (ST.hasUnpackedD16VMem()) { 4081 for (Register &Reg : ResultRegs) 4082 Reg = B.buildTrunc(S16, Reg).getReg(0); 4083 } 4084 } 4085 4086 auto padWithUndef = [&](LLT Ty, int NumElts) { 4087 if (NumElts == 0) 4088 return; 4089 Register Undef = B.buildUndef(Ty).getReg(0); 4090 for (int I = 0; I != NumElts; ++I) 4091 ResultRegs.push_back(Undef); 4092 }; 4093 4094 // Pad out any elements eliminated due to the dmask. 4095 LLT ResTy = MRI->getType(ResultRegs[0]); 4096 if (!ResTy.isVector()) { 4097 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4098 B.buildBuildVector(DstReg, ResultRegs); 4099 return true; 4100 } 4101 4102 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4103 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4104 4105 // Deal with the one annoying legal case. 4106 const LLT V3S16 = LLT::vector(3, 16); 4107 if (Ty == V3S16) { 4108 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4109 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4110 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4111 return true; 4112 } 4113 4114 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4115 B.buildConcatVectors(DstReg, ResultRegs); 4116 return true; 4117 } 4118 4119 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4120 MachineInstr &MI, MachineIRBuilder &B, 4121 GISelChangeObserver &Observer) const { 4122 Register Dst = MI.getOperand(0).getReg(); 4123 LLT Ty = B.getMRI()->getType(Dst); 4124 unsigned Size = Ty.getSizeInBits(); 4125 MachineFunction &MF = B.getMF(); 4126 4127 Observer.changingInstr(MI); 4128 4129 // FIXME: We don't really need this intermediate instruction. The intrinsic 4130 // should be fixed to have a memory operand. Since it's readnone, we're not 4131 // allowed to add one. 4132 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4133 MI.RemoveOperand(1); // Remove intrinsic ID 4134 4135 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4136 // TODO: Should this use datalayout alignment? 4137 const unsigned MemSize = (Size + 7) / 8; 4138 const Align MemAlign(4); 4139 MachineMemOperand *MMO = MF.getMachineMemOperand( 4140 MachinePointerInfo(), 4141 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4142 MachineMemOperand::MOInvariant, 4143 MemSize, MemAlign); 4144 MI.addMemOperand(MF, MMO); 4145 4146 // There are no 96-bit result scalar loads, but widening to 128-bit should 4147 // always be legal. We may need to restore this to a 96-bit result if it turns 4148 // out this needs to be converted to a vector load during RegBankSelect. 
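  // For example, an s96 result is widened to s128 and a <3 x s32> result is
  // padded to <4 x s32>; power-of-2 sizes are left unchanged.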
if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert the
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert a debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
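  // For amdgcn_if/else/loop, the G_BRCOND on the intrinsic's boolean result
  // (and the G_BR that follows it, if any) is rewritten into SI_IF / SI_ELSE /
  // SI_LOOP branching to the unconditional successor, plus an unconditional
  // branch to the old conditional target, i.e. the two branch destinations are
  // swapped relative to the input.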
4219 auto IntrID = MI.getIntrinsicID(); 4220 switch (IntrID) { 4221 case Intrinsic::amdgcn_if: 4222 case Intrinsic::amdgcn_else: { 4223 MachineInstr *Br = nullptr; 4224 MachineBasicBlock *UncondBrTarget = nullptr; 4225 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4226 const SIRegisterInfo *TRI 4227 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4228 4229 Register Def = MI.getOperand(1).getReg(); 4230 Register Use = MI.getOperand(3).getReg(); 4231 4232 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4233 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4234 if (IntrID == Intrinsic::amdgcn_if) { 4235 B.buildInstr(AMDGPU::SI_IF) 4236 .addDef(Def) 4237 .addUse(Use) 4238 .addMBB(UncondBrTarget); 4239 } else { 4240 B.buildInstr(AMDGPU::SI_ELSE) 4241 .addDef(Def) 4242 .addUse(Use) 4243 .addMBB(UncondBrTarget) 4244 .addImm(0); 4245 } 4246 4247 if (Br) { 4248 Br->getOperand(0).setMBB(CondBrTarget); 4249 } else { 4250 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4251 // since we're swapping branch targets it needs to be reinserted. 4252 // FIXME: IRTranslator should probably not do this 4253 B.buildBr(*CondBrTarget); 4254 } 4255 4256 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4257 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4258 MI.eraseFromParent(); 4259 BrCond->eraseFromParent(); 4260 return true; 4261 } 4262 4263 return false; 4264 } 4265 case Intrinsic::amdgcn_loop: { 4266 MachineInstr *Br = nullptr; 4267 MachineBasicBlock *UncondBrTarget = nullptr; 4268 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4269 const SIRegisterInfo *TRI 4270 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4271 4272 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4273 Register Reg = MI.getOperand(2).getReg(); 4274 4275 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4276 B.buildInstr(AMDGPU::SI_LOOP) 4277 .addUse(Reg) 4278 .addMBB(UncondBrTarget); 4279 4280 if (Br) 4281 Br->getOperand(0).setMBB(CondBrTarget); 4282 else 4283 B.buildBr(*CondBrTarget); 4284 4285 MI.eraseFromParent(); 4286 BrCond->eraseFromParent(); 4287 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4288 return true; 4289 } 4290 4291 return false; 4292 } 4293 case Intrinsic::amdgcn_kernarg_segment_ptr: 4294 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4295 // This only makes sense to call in a kernel, so just lower to null. 
4296 B.buildConstant(MI.getOperand(0).getReg(), 0); 4297 MI.eraseFromParent(); 4298 return true; 4299 } 4300 4301 return legalizePreloadedArgIntrin( 4302 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4303 case Intrinsic::amdgcn_implicitarg_ptr: 4304 return legalizeImplicitArgPtr(MI, MRI, B); 4305 case Intrinsic::amdgcn_workitem_id_x: 4306 return legalizePreloadedArgIntrin(MI, MRI, B, 4307 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4308 case Intrinsic::amdgcn_workitem_id_y: 4309 return legalizePreloadedArgIntrin(MI, MRI, B, 4310 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4311 case Intrinsic::amdgcn_workitem_id_z: 4312 return legalizePreloadedArgIntrin(MI, MRI, B, 4313 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4314 case Intrinsic::amdgcn_workgroup_id_x: 4315 return legalizePreloadedArgIntrin(MI, MRI, B, 4316 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4317 case Intrinsic::amdgcn_workgroup_id_y: 4318 return legalizePreloadedArgIntrin(MI, MRI, B, 4319 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4320 case Intrinsic::amdgcn_workgroup_id_z: 4321 return legalizePreloadedArgIntrin(MI, MRI, B, 4322 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4323 case Intrinsic::amdgcn_dispatch_ptr: 4324 return legalizePreloadedArgIntrin(MI, MRI, B, 4325 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4326 case Intrinsic::amdgcn_queue_ptr: 4327 return legalizePreloadedArgIntrin(MI, MRI, B, 4328 AMDGPUFunctionArgInfo::QUEUE_PTR); 4329 case Intrinsic::amdgcn_implicit_buffer_ptr: 4330 return legalizePreloadedArgIntrin( 4331 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4332 case Intrinsic::amdgcn_dispatch_id: 4333 return legalizePreloadedArgIntrin(MI, MRI, B, 4334 AMDGPUFunctionArgInfo::DISPATCH_ID); 4335 case Intrinsic::amdgcn_fdiv_fast: 4336 return legalizeFDIVFastIntrin(MI, MRI, B); 4337 case Intrinsic::amdgcn_is_shared: 4338 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4339 case Intrinsic::amdgcn_is_private: 4340 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4341 case Intrinsic::amdgcn_wavefrontsize: { 4342 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4343 MI.eraseFromParent(); 4344 return true; 4345 } 4346 case Intrinsic::amdgcn_s_buffer_load: 4347 return legalizeSBufferLoad(MI, B, Helper.Observer); 4348 case Intrinsic::amdgcn_raw_buffer_store: 4349 case Intrinsic::amdgcn_struct_buffer_store: 4350 return legalizeBufferStore(MI, MRI, B, false, false); 4351 case Intrinsic::amdgcn_raw_buffer_store_format: 4352 case Intrinsic::amdgcn_struct_buffer_store_format: 4353 return legalizeBufferStore(MI, MRI, B, false, true); 4354 case Intrinsic::amdgcn_raw_tbuffer_store: 4355 case Intrinsic::amdgcn_struct_tbuffer_store: 4356 return legalizeBufferStore(MI, MRI, B, true, true); 4357 case Intrinsic::amdgcn_raw_buffer_load: 4358 case Intrinsic::amdgcn_struct_buffer_load: 4359 return legalizeBufferLoad(MI, MRI, B, false, false); 4360 case Intrinsic::amdgcn_raw_buffer_load_format: 4361 case Intrinsic::amdgcn_struct_buffer_load_format: 4362 return legalizeBufferLoad(MI, MRI, B, true, false); 4363 case Intrinsic::amdgcn_raw_tbuffer_load: 4364 case Intrinsic::amdgcn_struct_tbuffer_load: 4365 return legalizeBufferLoad(MI, MRI, B, true, true); 4366 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4367 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4368 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4369 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4370 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4371 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4372 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4373 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4374 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4375 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4376 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4377 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4378 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4379 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4380 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4381 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4382 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4383 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4384 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4385 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4386 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4387 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4388 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4389 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4390 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4391 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4392 return legalizeBufferAtomic(MI, B, IntrID); 4393 case Intrinsic::amdgcn_atomic_inc: 4394 return legalizeAtomicIncDec(MI, B, true); 4395 case Intrinsic::amdgcn_atomic_dec: 4396 return legalizeAtomicIncDec(MI, B, false); 4397 case Intrinsic::trap: 4398 return legalizeTrapIntrinsic(MI, MRI, B); 4399 case Intrinsic::debugtrap: 4400 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4401 default: { 4402 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4403 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4404 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4405 return true; 4406 } 4407 } 4408 4409 return true; 4410 } 4411