//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx,
                          LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
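// Predicate form of isRegisterType above, for use in the rule definitions
// below.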
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
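  // All scalar truncates are legal; only the vector cases need adjustment
  // below.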
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
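    // For example, a <4 x s8> access is bitcast to a single s32 access, and an
    // <8 x s8> access becomes a <2 x s32> access.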
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
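          // For example, a <16 x s32> private access (32-bit limit) is broken
          // all the way down to s32 pieces, while an <8 x s64> global store
          // (128-bit limit) becomes <2 x s64> pieces.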
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
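    // Final catch-all: round odd scalar sizes up to a power of two and pad
    // sub-32-bit vectors out to the next 32-bit multiple.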
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      // TODO: Clamp the number of elements before resorting to stack lowering.
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
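    // The 32-bit constant address is just the low 32 bits of the 64-bit
    // constant pointer.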
1687 B.buildExtract(Dst, Src, 0); 1688 MI.eraseFromParent(); 1689 return true; 1690 } 1691 1692 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1693 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1694 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1695 1696 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1697 // another. Merge operands are required to be the same type, but creating an 1698 // extra ptrtoint would be kind of pointless. 1699 auto HighAddr = B.buildConstant( 1700 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1701 B.buildMerge(Dst, {Src, HighAddr}); 1702 MI.eraseFromParent(); 1703 return true; 1704 } 1705 1706 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1707 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1708 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1709 unsigned NullVal = TM.getNullPointerValue(DestAS); 1710 1711 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1712 auto FlatNull = B.buildConstant(SrcTy, 0); 1713 1714 // Extract low 32-bits of the pointer. 1715 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1716 1717 auto CmpRes = 1718 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1719 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1720 1721 MI.eraseFromParent(); 1722 return true; 1723 } 1724 1725 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1726 return false; 1727 1728 if (!ST.hasFlatAddressSpace()) 1729 return false; 1730 1731 auto SegmentNull = 1732 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1733 auto FlatNull = 1734 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1735 1736 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1737 if (!ApertureReg.isValid()) 1738 return false; 1739 1740 auto CmpRes = 1741 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1742 1743 // Coerce the type of the low half of the result so we can use merge_values. 1744 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1745 1746 // TODO: Should we allow mismatched types but matching sizes in merges to 1747 // avoid the ptrtoint? 1748 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1749 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1750 1751 MI.eraseFromParent(); 1752 return true; 1753 } 1754 1755 bool AMDGPULegalizerInfo::legalizeFrint( 1756 MachineInstr &MI, MachineRegisterInfo &MRI, 1757 MachineIRBuilder &B) const { 1758 Register Src = MI.getOperand(1).getReg(); 1759 LLT Ty = MRI.getType(Src); 1760 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1761 1762 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1763 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1764 1765 auto C1 = B.buildFConstant(Ty, C1Val); 1766 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1767 1768 // TODO: Should this propagate fast-math-flags? 
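  // This is the usual add/subtract-magic-constant trick: for |src| below the
  // 2^52 threshold, adding +/-2^52 (C1 with src's sign) pushes the fraction
  // bits out so the sum is rounded to an integer in the current rounding mode,
  // and subtracting it back yields rint(src). E.g. 2.5 + 2^52 rounds to
  // 2^52 + 2, so the difference is 2.0. Larger magnitudes are already integral
  // and are passed through unchanged by the final select.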
1769   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1770   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1771
1772   auto C2 = B.buildFConstant(Ty, C2Val);
1773   auto Fabs = B.buildFAbs(Ty, Src);
1774
1775   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1776   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1777   MI.eraseFromParent();
1778   return true;
1779 }
1780
1781 bool AMDGPULegalizerInfo::legalizeFceil(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784
1785   const LLT S1 = LLT::scalar(1);
1786   const LLT S64 = LLT::scalar(64);
1787
1788   Register Src = MI.getOperand(1).getReg();
1789   assert(MRI.getType(Src) == S64);
1790
1791   // result = trunc(src)
1792   // if (src > 0.0 && src != result)
1793   //   result += 1.0
1794
1795   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1796
1797   const auto Zero = B.buildFConstant(S64, 0.0);
1798   const auto One = B.buildFConstant(S64, 1.0);
1799   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1800   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1801   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1802   auto Add = B.buildSelect(S64, And, One, Zero);
1803
1804   // TODO: Should this propagate fast-math-flags?
1805   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1806   return true;
1807 }
1808
1809 static MachineInstrBuilder extractF64Exponent(Register Hi,
1810                                               MachineIRBuilder &B) {
1811   const unsigned FractBits = 52;
1812   const unsigned ExpBits = 11;
1813   LLT S32 = LLT::scalar(32);
1814
1815   auto Const0 = B.buildConstant(S32, FractBits - 32);
1816   auto Const1 = B.buildConstant(S32, ExpBits);
1817
1818   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1819     .addUse(Hi)
1820     .addUse(Const0.getReg(0))
1821     .addUse(Const1.getReg(0));
1822
1823   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1824 }
1825
1826 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1827   MachineInstr &MI, MachineRegisterInfo &MRI,
1828   MachineIRBuilder &B) const {
1829   const LLT S1 = LLT::scalar(1);
1830   const LLT S32 = LLT::scalar(32);
1831   const LLT S64 = LLT::scalar(64);
1832
1833   Register Src = MI.getOperand(1).getReg();
1834   assert(MRI.getType(Src) == S64);
1835
1836   // TODO: Should this use extract since the low half is unused?
1837   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1838   Register Hi = Unmerge.getReg(1);
1839
1840   // Extract the upper half, since this is where we will find the sign and
1841   // exponent.
1842   auto Exp = extractF64Exponent(Hi, B);
1843
1844   const unsigned FractBits = 52;
1845
1846   // Extract the sign bit.
1847   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1848   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1849
1850   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1851
1852   const auto Zero32 = B.buildConstant(S32, 0);
1853
1854   // Extend back to 64-bits.
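  // {Zero32, SignBit} below is +/-0.0 as a 64-bit pattern; it becomes the
  // result when the unbiased exponent is negative, i.e. when |src| < 1.0.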
1855 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1856 1857 auto Shr = B.buildAShr(S64, FractMask, Exp); 1858 auto Not = B.buildNot(S64, Shr); 1859 auto Tmp0 = B.buildAnd(S64, Src, Not); 1860 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1861 1862 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1863 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1864 1865 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1866 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1867 MI.eraseFromParent(); 1868 return true; 1869 } 1870 1871 bool AMDGPULegalizerInfo::legalizeITOFP( 1872 MachineInstr &MI, MachineRegisterInfo &MRI, 1873 MachineIRBuilder &B, bool Signed) const { 1874 1875 Register Dst = MI.getOperand(0).getReg(); 1876 Register Src = MI.getOperand(1).getReg(); 1877 1878 const LLT S64 = LLT::scalar(64); 1879 const LLT S32 = LLT::scalar(32); 1880 1881 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1882 1883 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1884 1885 auto CvtHi = Signed ? 1886 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1887 B.buildUITOFP(S64, Unmerge.getReg(1)); 1888 1889 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1890 1891 auto ThirtyTwo = B.buildConstant(S32, 32); 1892 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1893 .addUse(CvtHi.getReg(0)) 1894 .addUse(ThirtyTwo.getReg(0)); 1895 1896 // TODO: Should this propagate fast-math-flags? 1897 B.buildFAdd(Dst, LdExp, CvtLo); 1898 MI.eraseFromParent(); 1899 return true; 1900 } 1901 1902 // TODO: Copied from DAG implementation. Verify logic and document how this 1903 // actually works. 1904 bool AMDGPULegalizerInfo::legalizeFPTOI( 1905 MachineInstr &MI, MachineRegisterInfo &MRI, 1906 MachineIRBuilder &B, bool Signed) const { 1907 1908 Register Dst = MI.getOperand(0).getReg(); 1909 Register Src = MI.getOperand(1).getReg(); 1910 1911 const LLT S64 = LLT::scalar(64); 1912 const LLT S32 = LLT::scalar(32); 1913 1914 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1915 1916 unsigned Flags = MI.getFlags(); 1917 1918 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1919 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1920 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1921 1922 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1923 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1924 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1925 1926 auto Hi = Signed ? 
1927 B.buildFPTOSI(S32, FloorMul) : 1928 B.buildFPTOUI(S32, FloorMul); 1929 auto Lo = B.buildFPTOUI(S32, Fma); 1930 1931 B.buildMerge(Dst, { Lo, Hi }); 1932 MI.eraseFromParent(); 1933 1934 return true; 1935 } 1936 1937 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1938 MachineInstr &MI) const { 1939 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1940 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1941 1942 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1943 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1944 1945 // With ieee_mode disabled, the instructions have the correct behavior 1946 // already for G_FMINNUM/G_FMAXNUM 1947 if (!MFI->getMode().IEEE) 1948 return !IsIEEEOp; 1949 1950 if (IsIEEEOp) 1951 return true; 1952 1953 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1954 } 1955 1956 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1957 MachineInstr &MI, MachineRegisterInfo &MRI, 1958 MachineIRBuilder &B) const { 1959 // TODO: Should move some of this into LegalizerHelper. 1960 1961 // TODO: Promote dynamic indexing of s16 to s32 1962 1963 // FIXME: Artifact combiner probably should have replaced the truncated 1964 // constant before this, so we shouldn't need 1965 // getConstantVRegValWithLookThrough. 1966 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1967 MI.getOperand(2).getReg(), MRI); 1968 if (!IdxVal) // Dynamic case will be selected to register indexing. 1969 return true; 1970 1971 Register Dst = MI.getOperand(0).getReg(); 1972 Register Vec = MI.getOperand(1).getReg(); 1973 1974 LLT VecTy = MRI.getType(Vec); 1975 LLT EltTy = VecTy.getElementType(); 1976 assert(EltTy == MRI.getType(Dst)); 1977 1978 if (IdxVal->Value < VecTy.getNumElements()) 1979 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1980 else 1981 B.buildUndef(Dst); 1982 1983 MI.eraseFromParent(); 1984 return true; 1985 } 1986 1987 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1988 MachineInstr &MI, MachineRegisterInfo &MRI, 1989 MachineIRBuilder &B) const { 1990 // TODO: Should move some of this into LegalizerHelper. 1991 1992 // TODO: Promote dynamic indexing of s16 to s32 1993 1994 // FIXME: Artifact combiner probably should have replaced the truncated 1995 // constant before this, so we shouldn't need 1996 // getConstantVRegValWithLookThrough. 1997 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1998 MI.getOperand(3).getReg(), MRI); 1999 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2000 return true; 2001 2002 Register Dst = MI.getOperand(0).getReg(); 2003 Register Vec = MI.getOperand(1).getReg(); 2004 Register Ins = MI.getOperand(2).getReg(); 2005 2006 LLT VecTy = MRI.getType(Vec); 2007 LLT EltTy = VecTy.getElementType(); 2008 assert(EltTy == MRI.getType(Ins)); 2009 2010 if (IdxVal->Value < VecTy.getNumElements()) 2011 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2012 else 2013 B.buildUndef(Dst); 2014 2015 MI.eraseFromParent(); 2016 return true; 2017 } 2018 2019 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2020 MachineInstr &MI, MachineRegisterInfo &MRI, 2021 MachineIRBuilder &B) const { 2022 const LLT V2S16 = LLT::vector(2, 16); 2023 2024 Register Dst = MI.getOperand(0).getReg(); 2025 Register Src0 = MI.getOperand(1).getReg(); 2026 LLT DstTy = MRI.getType(Dst); 2027 LLT SrcTy = MRI.getType(Src0); 2028 2029 if (SrcTy == V2S16 && DstTy == V2S16 && 2030 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2031 return true; 2032 2033 MachineIRBuilder HelperBuilder(MI); 2034 GISelObserverWrapper DummyObserver; 2035 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2036 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2037 } 2038 2039 bool AMDGPULegalizerInfo::legalizeSinCos( 2040 MachineInstr &MI, MachineRegisterInfo &MRI, 2041 MachineIRBuilder &B) const { 2042 2043 Register DstReg = MI.getOperand(0).getReg(); 2044 Register SrcReg = MI.getOperand(1).getReg(); 2045 LLT Ty = MRI.getType(DstReg); 2046 unsigned Flags = MI.getFlags(); 2047 2048 Register TrigVal; 2049 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2050 if (ST.hasTrigReducedRange()) { 2051 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2052 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2053 .addUse(MulVal.getReg(0)) 2054 .setMIFlags(Flags).getReg(0); 2055 } else 2056 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2057 2058 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2059 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2060 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2061 .addUse(TrigVal) 2062 .setMIFlags(Flags); 2063 MI.eraseFromParent(); 2064 return true; 2065 } 2066 2067 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2068 MachineIRBuilder &B, 2069 const GlobalValue *GV, 2070 int64_t Offset, 2071 unsigned GAFlags) const { 2072 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2073 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2074 // to the following code sequence: 2075 // 2076 // For constant address space: 2077 // s_getpc_b64 s[0:1] 2078 // s_add_u32 s0, s0, $symbol 2079 // s_addc_u32 s1, s1, 0 2080 // 2081 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2082 // a fixup or relocation is emitted to replace $symbol with a literal 2083 // constant, which is a pc-relative offset from the encoding of the $symbol 2084 // operand to the global variable. 
2085 // 2086 // For global address space: 2087 // s_getpc_b64 s[0:1] 2088 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2089 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2090 // 2091 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2092 // fixups or relocations are emitted to replace $symbol@*@lo and 2093 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2094 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2095 // operand to the global variable. 2096 // 2097 // What we want here is an offset from the value returned by s_getpc 2098 // (which is the address of the s_add_u32 instruction) to the global 2099 // variable, but since the encoding of $symbol starts 4 bytes after the start 2100 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2101 // small. This requires us to add 4 to the global variable offset in order to 2102 // compute the correct address. 2103 2104 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2105 2106 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2107 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2108 2109 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2110 .addDef(PCReg); 2111 2112 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2113 if (GAFlags == SIInstrInfo::MO_NONE) 2114 MIB.addImm(0); 2115 else 2116 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2117 2118 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2119 2120 if (PtrTy.getSizeInBits() == 32) 2121 B.buildExtract(DstReg, PCReg, 0); 2122 return true; 2123 } 2124 2125 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2126 MachineInstr &MI, MachineRegisterInfo &MRI, 2127 MachineIRBuilder &B) const { 2128 Register DstReg = MI.getOperand(0).getReg(); 2129 LLT Ty = MRI.getType(DstReg); 2130 unsigned AS = Ty.getAddressSpace(); 2131 2132 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2133 MachineFunction &MF = B.getMF(); 2134 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2135 2136 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2137 if (!MFI->isEntryFunction()) { 2138 const Function &Fn = MF.getFunction(); 2139 DiagnosticInfoUnsupported BadLDSDecl( 2140 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2141 DS_Warning); 2142 Fn.getContext().diagnose(BadLDSDecl); 2143 2144 // We currently don't have a way to correctly allocate LDS objects that 2145 // aren't directly associated with a kernel. We do force inlining of 2146 // functions that use local objects. However, if these dead functions are 2147 // not eliminated, we don't want a compile time error. Just emit a warning 2148 // and a trap, since there should be no callable path here. 2149 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2150 B.buildUndef(DstReg); 2151 MI.eraseFromParent(); 2152 return true; 2153 } 2154 2155 // TODO: We could emit code to handle the initialization somewhere. 
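    // LDS globals that lack a real (non-undef) initializer are simply lowered
    // to a constant byte offset into the kernel's LDS block (allocateLDSGlobal
    // below); ones with an actual initializer are not supported and are
    // diagnosed instead.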
2156     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2157       const SITargetLowering *TLI = ST.getTargetLowering();
2158       if (!TLI->shouldUseLDSConstAddress(GV)) {
2159         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2160         return true; // Leave in place.
2161       }
2162
2163       B.buildConstant(
2164         DstReg,
2165         MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2166       MI.eraseFromParent();
2167       return true;
2168     }
2169
2170     const Function &Fn = MF.getFunction();
2171     DiagnosticInfoUnsupported BadInit(
2172       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2173     Fn.getContext().diagnose(BadInit);
2174     return true;
2175   }
2176
2177   const SITargetLowering *TLI = ST.getTargetLowering();
2178
2179   if (TLI->shouldEmitFixup(GV)) {
2180     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2181     MI.eraseFromParent();
2182     return true;
2183   }
2184
2185   if (TLI->shouldEmitPCReloc(GV)) {
2186     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2187     MI.eraseFromParent();
2188     return true;
2189   }
2190
2191   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2192   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2193
2194   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2195     MachinePointerInfo::getGOT(MF),
2196     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2197     MachineMemOperand::MOInvariant,
2198     8 /*Size*/, Align(8));
2199
2200   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2201
2202   if (Ty.getSizeInBits() == 32) {
2203     // Truncate if this is a 32-bit constant address.
2204     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2205     B.buildExtract(DstReg, Load, 0);
2206   } else
2207     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2208
2209   MI.eraseFromParent();
2210   return true;
2211 }
2212
2213 bool AMDGPULegalizerInfo::legalizeLoad(
2214   MachineInstr &MI, MachineRegisterInfo &MRI,
2215   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2216   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2217   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2218   Observer.changingInstr(MI);
2219   MI.getOperand(1).setReg(Cast.getReg(0));
2220   Observer.changedInstr(MI);
2221   return true;
2222 }
2223
2224 bool AMDGPULegalizerInfo::legalizeFMad(
2225   MachineInstr &MI, MachineRegisterInfo &MRI,
2226   MachineIRBuilder &B) const {
2227   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2228   assert(Ty.isScalar());
2229
2230   MachineFunction &MF = B.getMF();
2231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2232
2233   // TODO: Always legal with future ftz flag.
2234   // FIXME: Do we need just output?
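  // G_FMAD is only kept when the relevant denormals are flushed; the mad/mac
  // instructions it selects to do not produce denormal results, so with
  // denormals enabled it is expanded to mul + add by LegalizerHelper below.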
2235 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2236 return true; 2237 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2238 return true; 2239 2240 MachineIRBuilder HelperBuilder(MI); 2241 GISelObserverWrapper DummyObserver; 2242 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2243 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2244 } 2245 2246 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2247 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2248 Register DstReg = MI.getOperand(0).getReg(); 2249 Register PtrReg = MI.getOperand(1).getReg(); 2250 Register CmpVal = MI.getOperand(2).getReg(); 2251 Register NewVal = MI.getOperand(3).getReg(); 2252 2253 assert(SITargetLowering::isFlatGlobalAddrSpace( 2254 MRI.getType(PtrReg).getAddressSpace()) && 2255 "this should not have been custom lowered"); 2256 2257 LLT ValTy = MRI.getType(CmpVal); 2258 LLT VecTy = LLT::vector(2, ValTy); 2259 2260 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2261 2262 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2263 .addDef(DstReg) 2264 .addUse(PtrReg) 2265 .addUse(PackedVal) 2266 .setMemRefs(MI.memoperands()); 2267 2268 MI.eraseFromParent(); 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::legalizeFlog( 2273 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2274 Register Dst = MI.getOperand(0).getReg(); 2275 Register Src = MI.getOperand(1).getReg(); 2276 LLT Ty = B.getMRI()->getType(Dst); 2277 unsigned Flags = MI.getFlags(); 2278 2279 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2280 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2281 2282 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2283 MI.eraseFromParent(); 2284 return true; 2285 } 2286 2287 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2288 MachineIRBuilder &B) const { 2289 Register Dst = MI.getOperand(0).getReg(); 2290 Register Src = MI.getOperand(1).getReg(); 2291 unsigned Flags = MI.getFlags(); 2292 LLT Ty = B.getMRI()->getType(Dst); 2293 2294 auto K = B.buildFConstant(Ty, numbers::log2e); 2295 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2296 B.buildFExp2(Dst, Mul, Flags); 2297 MI.eraseFromParent(); 2298 return true; 2299 } 2300 2301 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2302 MachineIRBuilder &B) const { 2303 Register Dst = MI.getOperand(0).getReg(); 2304 Register Src0 = MI.getOperand(1).getReg(); 2305 Register Src1 = MI.getOperand(2).getReg(); 2306 unsigned Flags = MI.getFlags(); 2307 LLT Ty = B.getMRI()->getType(Dst); 2308 const LLT S16 = LLT::scalar(16); 2309 const LLT S32 = LLT::scalar(32); 2310 2311 if (Ty == S32) { 2312 auto Log = B.buildFLog2(S32, Src0, Flags); 2313 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2314 .addUse(Log.getReg(0)) 2315 .addUse(Src1) 2316 .setMIFlags(Flags); 2317 B.buildFExp2(Dst, Mul, Flags); 2318 } else if (Ty == S16) { 2319 // There's no f16 fmul_legacy, so we need to convert for it. 
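    // Same expansion as the f32 path: pow(x, y) = exp2(y * log2(x)), with the
    // multiply done as fmul_legacy (which treats 0 * anything as 0). For f16
    // the multiply is performed in f32 and the result truncated back.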
2320     auto Log = B.buildFLog2(S16, Src0, Flags);
2321     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2322     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2323     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2324       .addUse(Ext0.getReg(0))
2325       .addUse(Ext1.getReg(0))
2326       .setMIFlags(Flags);
2327
2328     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2329   } else
2330     return false;
2331
2332   MI.eraseFromParent();
2333   return true;
2334 }
2335
2336 // Find a source register, ignoring any possible source modifiers.
2337 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2338   Register ModSrc = OrigSrc;
2339   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2340     ModSrc = SrcFNeg->getOperand(1).getReg();
2341     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2342       ModSrc = SrcFAbs->getOperand(1).getReg();
2343   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2344     ModSrc = SrcFAbs->getOperand(1).getReg();
2345   return ModSrc;
2346 }
2347
2348 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2349                                          MachineRegisterInfo &MRI,
2350                                          MachineIRBuilder &B) const {
2351
2352   const LLT S1 = LLT::scalar(1);
2353   const LLT S64 = LLT::scalar(64);
2354   Register Dst = MI.getOperand(0).getReg();
2355   Register OrigSrc = MI.getOperand(1).getReg();
2356   unsigned Flags = MI.getFlags();
2357   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2358          "this should not have been custom lowered");
2359
2360   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2361   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2362   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2363   // V_FRACT bug is:
2364   //   fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2365   //
2366   // Convert floor(x) to (x - fract(x))
2367
2368   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2369     .addUse(OrigSrc)
2370     .setMIFlags(Flags);
2371
2372   // Give source modifier matching some assistance before obscuring a foldable
2373   // pattern.
2374
2375   // TODO: We can avoid the neg on the fract? The input sign to fract
2376   // shouldn't matter?
2377   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2378
2379   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2380
2381   Register Min = MRI.createGenericVirtualRegister(S64);
2382
2383   // We don't need to concern ourselves with the snan handling difference, so
2384   // use the one which will directly select.
2385   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2386   if (MFI->getMode().IEEE)
2387     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2388   else
2389     B.buildFMinNum(Min, Fract, Const, Flags);
2390
2391   Register CorrectedFract = Min;
2392   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2393     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2394     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2395   }
2396
2397   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2398   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2399
2400   MI.eraseFromParent();
2401   return true;
2402 }
2403
2404 // Turn an illegal packed v2s16 build vector into bit operations.
2405 // TODO: This should probably be a bitcast action in LegalizerHelper.
2406 bool AMDGPULegalizerInfo::legalizeBuildVector( 2407 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2408 Register Dst = MI.getOperand(0).getReg(); 2409 const LLT S32 = LLT::scalar(32); 2410 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2411 2412 Register Src0 = MI.getOperand(1).getReg(); 2413 Register Src1 = MI.getOperand(2).getReg(); 2414 assert(MRI.getType(Src0) == LLT::scalar(16)); 2415 2416 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2417 B.buildBitcast(Dst, Merge); 2418 2419 MI.eraseFromParent(); 2420 return true; 2421 } 2422 2423 // Return the use branch instruction, otherwise null if the usage is invalid. 2424 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2425 MachineRegisterInfo &MRI, 2426 MachineInstr *&Br, 2427 MachineBasicBlock *&UncondBrTarget) { 2428 Register CondDef = MI.getOperand(0).getReg(); 2429 if (!MRI.hasOneNonDBGUse(CondDef)) 2430 return nullptr; 2431 2432 MachineBasicBlock *Parent = MI.getParent(); 2433 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2434 if (UseMI.getParent() != Parent || 2435 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2436 return nullptr; 2437 2438 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2439 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2440 if (Next == Parent->end()) { 2441 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2442 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2443 return nullptr; 2444 UncondBrTarget = &*NextMBB; 2445 } else { 2446 if (Next->getOpcode() != AMDGPU::G_BR) 2447 return nullptr; 2448 Br = &*Next; 2449 UncondBrTarget = Br->getOperand(0).getMBB(); 2450 } 2451 2452 return &UseMI; 2453 } 2454 2455 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2456 MachineRegisterInfo &MRI, 2457 Register LiveIn, 2458 Register PhyReg) const { 2459 assert(PhyReg.isPhysical() && "Physical register expected"); 2460 2461 // Insert the live-in copy, if required, by defining destination virtual 2462 // register. 2463 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
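  // If the live-in vreg has no def yet, materialize the COPY from the physical
  // register at the top of the entry block, then restore the original
  // insertion point.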
2464   if (!MRI.getVRegDef(LiveIn)) {
2465     // FIXME: Should have scoped insert pt
2466     MachineBasicBlock &OrigInsBB = B.getMBB();
2467     auto OrigInsPt = B.getInsertPt();
2468
2469     MachineBasicBlock &EntryMBB = B.getMF().front();
2470     EntryMBB.addLiveIn(PhyReg);
2471     B.setInsertPt(EntryMBB, EntryMBB.begin());
2472     B.buildCopy(LiveIn, PhyReg);
2473
2474     B.setInsertPt(OrigInsBB, OrigInsPt);
2475   }
2476
2477   return LiveIn;
2478 }
2479
2480 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2481                                                 MachineRegisterInfo &MRI,
2482                                                 Register PhyReg, LLT Ty,
2483                                                 bool InsertLiveInCopy) const {
2484   assert(PhyReg.isPhysical() && "Physical register expected");
2485
2486   // Get or create the virtual live-in register.
2487   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2488   if (!LiveIn) {
2489     LiveIn = MRI.createGenericVirtualRegister(Ty);
2490     MRI.addLiveIn(PhyReg, LiveIn);
2491   }
2492
2493   // When the copy that is actually required goes from a virtual register to a
2494   // physical register (and will be inserted later), there is no need to insert
2495   // a live-in copy from the physical register to a virtual register here.
2496   if (!InsertLiveInCopy)
2497     return LiveIn;
2498
2499   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2500 }
2501
2502 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2503                                          const ArgDescriptor *Arg,
2504                                          const TargetRegisterClass *ArgRC,
2505                                          LLT ArgTy) const {
2506   MCRegister SrcReg = Arg->getRegister();
2507   assert(SrcReg.isPhysical() && "Physical register expected");
2508   assert(DstReg.isVirtual() && "Virtual register expected");
2509
2510   MachineRegisterInfo &MRI = *B.getMRI();
2511   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);
2512
2513   if (Arg->isMasked()) {
2514     // TODO: Should we try to emit this once in the entry block?
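    // The argument occupies a bitfield of the input register: shift it down by
    // the mask's trailing zero count, then mask out the remaining bits. For
    // example (hypothetical values), Mask = 0x3ff00000 gives Shift = 20, so
    // the result is (LiveIn >> 20) & 0x3ff.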
2515 const LLT S32 = LLT::scalar(32); 2516 const unsigned Mask = Arg->getMask(); 2517 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2518 2519 Register AndMaskSrc = LiveIn; 2520 2521 if (Shift != 0) { 2522 auto ShiftAmt = B.buildConstant(S32, Shift); 2523 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2524 } 2525 2526 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2527 } else { 2528 B.buildCopy(DstReg, LiveIn); 2529 } 2530 2531 return true; 2532 } 2533 2534 bool AMDGPULegalizerInfo::loadInputValue( 2535 Register DstReg, MachineIRBuilder &B, 2536 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2537 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2538 const ArgDescriptor *Arg; 2539 const TargetRegisterClass *ArgRC; 2540 LLT ArgTy; 2541 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2542 2543 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2544 return false; // TODO: Handle these 2545 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2546 } 2547 2548 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2549 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2550 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2551 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2552 return false; 2553 2554 MI.eraseFromParent(); 2555 return true; 2556 } 2557 2558 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2559 MachineRegisterInfo &MRI, 2560 MachineIRBuilder &B) const { 2561 Register Dst = MI.getOperand(0).getReg(); 2562 LLT DstTy = MRI.getType(Dst); 2563 LLT S16 = LLT::scalar(16); 2564 LLT S32 = LLT::scalar(32); 2565 LLT S64 = LLT::scalar(64); 2566 2567 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2568 return true; 2569 2570 if (DstTy == S16) 2571 return legalizeFDIV16(MI, MRI, B); 2572 if (DstTy == S32) 2573 return legalizeFDIV32(MI, MRI, B); 2574 if (DstTy == S64) 2575 return legalizeFDIV64(MI, MRI, B); 2576 2577 return false; 2578 } 2579 2580 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2581 Register DstReg, 2582 Register X, 2583 Register Y, 2584 bool IsDiv) const { 2585 const LLT S1 = LLT::scalar(1); 2586 const LLT S32 = LLT::scalar(32); 2587 2588 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2589 // algorithm used here. 2590 2591 // Initial estimate of inv(y). 2592 auto FloatY = B.buildUITOFP(S32, Y); 2593 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2594 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2595 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2596 auto Z = B.buildFPTOUI(S32, ScaledY); 2597 2598 // One round of UNR. 2599 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2600 auto NegYZ = B.buildMul(S32, NegY, Z); 2601 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2602 2603 // Quotient/remainder estimate. 2604 auto Q = B.buildUMulH(S32, X, Z); 2605 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2606 2607 // First quotient/remainder refinement. 2608 auto One = B.buildConstant(S32, 1); 2609 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2610 if (IsDiv) 2611 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2612 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2613 2614 // Second quotient/remainder refinement. 
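  // After the single Newton-Raphson style correction of Z above, the initial
  // quotient estimate can still be low by up to two, so a second conditional
  // fix-up follows the first one.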
2615   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2616   if (IsDiv)
2617     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2618   else
2619     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2620 }
2621
2622 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2623                                               MachineRegisterInfo &MRI,
2624                                               MachineIRBuilder &B) const {
2625   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2626   Register DstReg = MI.getOperand(0).getReg();
2627   Register Num = MI.getOperand(1).getReg();
2628   Register Den = MI.getOperand(2).getReg();
2629   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2630   MI.eraseFromParent();
2631   return true;
2632 }
2633
2634 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2635 //
2636 // Return lo, hi of result
2637 //
2638 // %cvt.lo = G_UITOFP Val.lo
2639 // %cvt.hi = G_UITOFP Val.hi
2640 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2641 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2642 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2643 // %mul2 = G_FMUL %mul1, 2**(-32)
2644 // %trunc = G_INTRINSIC_TRUNC %mul2
2645 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2646 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2647 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2648                                                        Register Val) {
2649   const LLT S32 = LLT::scalar(32);
2650   auto Unmerge = B.buildUnmerge(S32, Val);
2651
2652   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2653   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2654
2655   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2656                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2657
2658   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2659   auto Mul1 =
2660     B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2661
2662   // 2**(-32)
2663   auto Mul2 =
2664     B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2665   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2666
2667   // -(2**32)
2668   auto Mad2 = B.buildFMAD(S32, Trunc,
2669                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2670
2671   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2672   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2673
2674   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2675 }
2676
2677 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2678                                                   Register DstReg,
2679                                                   Register Numer,
2680                                                   Register Denom,
2681                                                   bool IsDiv) const {
2682   const LLT S32 = LLT::scalar(32);
2683   const LLT S64 = LLT::scalar(64);
2684   const LLT S1 = LLT::scalar(1);
2685   Register RcpLo, RcpHi;
2686
2687   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2688
2689   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2690
2691   auto Zero64 = B.buildConstant(S64, 0);
2692   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2693
2694   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2695   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2696
2697   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2698   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2699   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2700
2701   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2702   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2703   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2704   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2705
2706   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2707   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2708   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2709   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2710   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2711
2712   auto Zero32 =
B.buildConstant(S32, 0); 2713 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2714 auto Add2_HiC = 2715 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2716 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2717 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2718 2719 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2720 Register NumerLo = UnmergeNumer.getReg(0); 2721 Register NumerHi = UnmergeNumer.getReg(1); 2722 2723 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2724 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2725 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2726 Register Mul3_Lo = UnmergeMul3.getReg(0); 2727 Register Mul3_Hi = UnmergeMul3.getReg(1); 2728 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2729 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2730 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2731 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2732 2733 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2734 Register DenomLo = UnmergeDenom.getReg(0); 2735 Register DenomHi = UnmergeDenom.getReg(1); 2736 2737 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2738 auto C1 = B.buildSExt(S32, CmpHi); 2739 2740 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2741 auto C2 = B.buildSExt(S32, CmpLo); 2742 2743 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2744 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2745 2746 // TODO: Here and below portions of the code can be enclosed into if/endif. 2747 // Currently control flow is unconditional and we have 4 selects after 2748 // potential endif to substitute PHIs. 2749 2750 // if C3 != 0 ... 2751 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2752 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2753 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2754 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2755 2756 auto One64 = B.buildConstant(S64, 1); 2757 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2758 2759 auto C4 = 2760 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2761 auto C5 = 2762 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2763 auto C6 = B.buildSelect( 2764 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2765 2766 // if (C6 != 0) 2767 auto Add4 = B.buildAdd(S64, Add3, One64); 2768 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2769 2770 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2771 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2772 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2773 2774 // endif C6 2775 // endif C3 2776 2777 if (IsDiv) { 2778 auto Sel1 = B.buildSelect( 2779 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2780 B.buildSelect(DstReg, 2781 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2782 } else { 2783 auto Sel2 = B.buildSelect( 2784 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2785 B.buildSelect(DstReg, 2786 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2787 } 2788 } 2789 2790 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2791 MachineRegisterInfo &MRI, 2792 MachineIRBuilder &B) const { 2793 const LLT S64 = LLT::scalar(64); 2794 const LLT S32 = LLT::scalar(32); 2795 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2796 Register DstReg = MI.getOperand(0).getReg(); 2797 Register Num 
= MI.getOperand(1).getReg(); 2798 Register Den = MI.getOperand(2).getReg(); 2799 LLT Ty = MRI.getType(DstReg); 2800 2801 if (Ty == S32) 2802 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2803 else if (Ty == S64) 2804 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2805 else 2806 return false; 2807 2808 MI.eraseFromParent(); 2809 return true; 2810 2811 } 2812 2813 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2814 MachineRegisterInfo &MRI, 2815 MachineIRBuilder &B) const { 2816 const LLT S64 = LLT::scalar(64); 2817 const LLT S32 = LLT::scalar(32); 2818 2819 Register DstReg = MI.getOperand(0).getReg(); 2820 const LLT Ty = MRI.getType(DstReg); 2821 if (Ty != S32 && Ty != S64) 2822 return false; 2823 2824 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2825 2826 Register LHS = MI.getOperand(1).getReg(); 2827 Register RHS = MI.getOperand(2).getReg(); 2828 2829 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2830 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2831 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2832 2833 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2834 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2835 2836 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2837 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2838 2839 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2840 if (Ty == S32) 2841 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2842 else 2843 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2844 2845 Register Sign; 2846 if (IsDiv) 2847 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2848 else 2849 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2850 2851 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2852 B.buildSub(DstReg, UDivRem, Sign); 2853 2854 MI.eraseFromParent(); 2855 return true; 2856 } 2857 2858 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2859 MachineRegisterInfo &MRI, 2860 MachineIRBuilder &B) const { 2861 Register Res = MI.getOperand(0).getReg(); 2862 Register LHS = MI.getOperand(1).getReg(); 2863 Register RHS = MI.getOperand(2).getReg(); 2864 2865 uint16_t Flags = MI.getFlags(); 2866 2867 LLT ResTy = MRI.getType(Res); 2868 LLT S32 = LLT::scalar(32); 2869 LLT S64 = LLT::scalar(64); 2870 2871 const MachineFunction &MF = B.getMF(); 2872 bool Unsafe = 2873 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2874 2875 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2876 return false; 2877 2878 if (!Unsafe && ResTy == S32 && 2879 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2880 return false; 2881 2882 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2883 // 1 / x -> RCP(x) 2884 if (CLHS->isExactlyValue(1.0)) { 2885 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2886 .addUse(RHS) 2887 .setMIFlags(Flags); 2888 2889 MI.eraseFromParent(); 2890 return true; 2891 } 2892 2893 // -1 / x -> RCP( FNEG(x) ) 2894 if (CLHS->isExactlyValue(-1.0)) { 2895 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2896 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2897 .addUse(FNeg.getReg(0)) 2898 .setMIFlags(Flags); 2899 2900 MI.eraseFromParent(); 2901 return true; 2902 } 2903 } 2904 2905 // x / y -> x * (1.0 / y) 2906 if (Unsafe) { 2907 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2908 .addUse(RHS) 2909 .setMIFlags(Flags); 2910 B.buildFMul(Res, LHS, RCP, Flags); 2911 2912 MI.eraseFromParent(); 2913 return true; 2914 } 2915 2916 return false; 2917 } 2918 2919 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2920 MachineRegisterInfo &MRI, 2921 MachineIRBuilder &B) const { 2922 Register Res = MI.getOperand(0).getReg(); 2923 Register LHS = MI.getOperand(1).getReg(); 2924 Register RHS = MI.getOperand(2).getReg(); 2925 2926 uint16_t Flags = MI.getFlags(); 2927 2928 LLT S16 = LLT::scalar(16); 2929 LLT S32 = LLT::scalar(32); 2930 2931 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2932 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2933 2934 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2935 .addUse(RHSExt.getReg(0)) 2936 .setMIFlags(Flags); 2937 2938 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2939 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2940 2941 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2942 .addUse(RDst.getReg(0)) 2943 .addUse(RHS) 2944 .addUse(LHS) 2945 .setMIFlags(Flags); 2946 2947 MI.eraseFromParent(); 2948 return true; 2949 } 2950 2951 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2952 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2953 static void toggleSPDenormMode(bool Enable, 2954 MachineIRBuilder &B, 2955 const GCNSubtarget &ST, 2956 AMDGPU::SIModeRegisterDefaults Mode) { 2957 // Set SP denorm mode to this value. 2958 unsigned SPDenormMode = 2959 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2960 2961 if (ST.hasDenormModeInst()) { 2962 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2963 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2964 2965 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2966 B.buildInstr(AMDGPU::S_DENORM_MODE) 2967 .addImm(NewDenormModeValue); 2968 2969 } else { 2970 // Select FP32 bit field in mode register. 2971 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2972 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2973 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2974 2975 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2976 .addImm(SPDenormMode) 2977 .addImm(SPDenormModeBitField); 2978 } 2979 } 2980 2981 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2982 MachineRegisterInfo &MRI, 2983 MachineIRBuilder &B) const { 2984 Register Res = MI.getOperand(0).getReg(); 2985 Register LHS = MI.getOperand(1).getReg(); 2986 Register RHS = MI.getOperand(2).getReg(); 2987 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2988 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2989 2990 uint16_t Flags = MI.getFlags(); 2991 2992 LLT S32 = LLT::scalar(32); 2993 LLT S1 = LLT::scalar(1); 2994 2995 auto One = B.buildFConstant(S32, 1.0f); 2996 2997 auto DenominatorScaled = 2998 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2999 .addUse(LHS) 3000 .addUse(RHS) 3001 .addImm(0) 3002 .setMIFlags(Flags); 3003 auto NumeratorScaled = 3004 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3005 .addUse(LHS) 3006 .addUse(RHS) 3007 .addImm(1) 3008 .setMIFlags(Flags); 3009 3010 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3011 .addUse(DenominatorScaled.getReg(0)) 3012 .setMIFlags(Flags); 3013 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3014 3015 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3016 // aren't modeled as reading it. 
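  // The FMA refinement below depends on intermediate results not being flushed
  // to zero, so if the function's default mode flushes FP32 denormals they are
  // temporarily enabled around the core sequence and restored afterwards.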
3017 if (!Mode.allFP32Denormals()) 3018 toggleSPDenormMode(true, B, ST, Mode); 3019 3020 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3021 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3022 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3023 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3024 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3025 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3026 3027 if (!Mode.allFP32Denormals()) 3028 toggleSPDenormMode(false, B, ST, Mode); 3029 3030 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3031 .addUse(Fma4.getReg(0)) 3032 .addUse(Fma1.getReg(0)) 3033 .addUse(Fma3.getReg(0)) 3034 .addUse(NumeratorScaled.getReg(1)) 3035 .setMIFlags(Flags); 3036 3037 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3038 .addUse(Fmas.getReg(0)) 3039 .addUse(RHS) 3040 .addUse(LHS) 3041 .setMIFlags(Flags); 3042 3043 MI.eraseFromParent(); 3044 return true; 3045 } 3046 3047 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3048 MachineRegisterInfo &MRI, 3049 MachineIRBuilder &B) const { 3050 Register Res = MI.getOperand(0).getReg(); 3051 Register LHS = MI.getOperand(1).getReg(); 3052 Register RHS = MI.getOperand(2).getReg(); 3053 3054 uint16_t Flags = MI.getFlags(); 3055 3056 LLT S64 = LLT::scalar(64); 3057 LLT S1 = LLT::scalar(1); 3058 3059 auto One = B.buildFConstant(S64, 1.0); 3060 3061 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3062 .addUse(LHS) 3063 .addUse(RHS) 3064 .addImm(0) 3065 .setMIFlags(Flags); 3066 3067 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3068 3069 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3070 .addUse(DivScale0.getReg(0)) 3071 .setMIFlags(Flags); 3072 3073 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3074 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3075 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3076 3077 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3078 .addUse(LHS) 3079 .addUse(RHS) 3080 .addImm(1) 3081 .setMIFlags(Flags); 3082 3083 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3084 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3085 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3086 3087 Register Scale; 3088 if (!ST.hasUsableDivScaleConditionOutput()) { 3089 // Workaround a hardware bug on SI where the condition output from div_scale 3090 // is not usable. 
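  // Reconstruct the condition by hand: compare the high halves of the inputs
  // against the high halves of the two div_scale results to detect whether
  // scaling was applied, and XOR the compares to form the flag div_fmas
  // expects.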
3091 3092 LLT S32 = LLT::scalar(32); 3093 3094 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3095 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3096 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3097 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3098 3099 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3100 Scale1Unmerge.getReg(1)); 3101 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3102 Scale0Unmerge.getReg(1)); 3103 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3104 } else { 3105 Scale = DivScale1.getReg(1); 3106 } 3107 3108 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3109 .addUse(Fma4.getReg(0)) 3110 .addUse(Fma3.getReg(0)) 3111 .addUse(Mul.getReg(0)) 3112 .addUse(Scale) 3113 .setMIFlags(Flags); 3114 3115 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3116 .addUse(Fmas.getReg(0)) 3117 .addUse(RHS) 3118 .addUse(LHS) 3119 .setMIFlags(Flags); 3120 3121 MI.eraseFromParent(); 3122 return true; 3123 } 3124 3125 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3126 MachineRegisterInfo &MRI, 3127 MachineIRBuilder &B) const { 3128 Register Res = MI.getOperand(0).getReg(); 3129 Register LHS = MI.getOperand(2).getReg(); 3130 Register RHS = MI.getOperand(3).getReg(); 3131 uint16_t Flags = MI.getFlags(); 3132 3133 LLT S32 = LLT::scalar(32); 3134 LLT S1 = LLT::scalar(1); 3135 3136 auto Abs = B.buildFAbs(S32, RHS, Flags); 3137 const APFloat C0Val(1.0f); 3138 3139 auto C0 = B.buildConstant(S32, 0x6f800000); 3140 auto C1 = B.buildConstant(S32, 0x2f800000); 3141 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3142 3143 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3144 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3145 3146 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3147 3148 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3149 .addUse(Mul0.getReg(0)) 3150 .setMIFlags(Flags); 3151 3152 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3153 3154 B.buildFMul(Res, Sel, Mul1, Flags); 3155 3156 MI.eraseFromParent(); 3157 return true; 3158 } 3159 3160 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3161 MachineRegisterInfo &MRI, 3162 MachineIRBuilder &B) const { 3163 uint64_t Offset = 3164 ST.getTargetLowering()->getImplicitParameterOffset( 3165 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3166 LLT DstTy = MRI.getType(DstReg); 3167 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3168 3169 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3170 if (!loadInputValue(KernargPtrReg, B, 3171 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3172 return false; 3173 3174 // FIXME: This should be nuw 3175 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3176 return true; 3177 } 3178 3179 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3180 MachineRegisterInfo &MRI, 3181 MachineIRBuilder &B) const { 3182 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3183 if (!MFI->isEntryFunction()) { 3184 return legalizePreloadedArgIntrin(MI, MRI, B, 3185 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3186 } 3187 3188 Register DstReg = MI.getOperand(0).getReg(); 3189 if (!getImplicitArgPtr(DstReg, MRI, B)) 3190 return false; 3191 3192 MI.eraseFromParent(); 3193 return true; 3194 } 3195 3196 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3197 MachineRegisterInfo &MRI, 3198 MachineIRBuilder &B, 3199 unsigned AddrSpace) const { 3200 
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3201 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3202 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3203 MI.eraseFromParent(); 3204 return true; 3205 } 3206 3207 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3208 // offset (the offset that is included in bounds checking and swizzling, to be 3209 // split between the instruction's voffset and immoffset fields) and soffset 3210 // (the offset that is excluded from bounds checking and swizzling, to go in 3211 // the instruction's soffset field). This function takes the first kind of 3212 // offset and figures out how to split it between voffset and immoffset. 3213 std::tuple<Register, unsigned, unsigned> 3214 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3215 Register OrigOffset) const { 3216 const unsigned MaxImm = 4095; 3217 Register BaseReg; 3218 unsigned TotalConstOffset; 3219 MachineInstr *OffsetDef; 3220 const LLT S32 = LLT::scalar(32); 3221 3222 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3223 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3224 3225 unsigned ImmOffset = TotalConstOffset; 3226 3227 // If the immediate value is too big for the immoffset field, put the value 3228 // and -4096 into the immoffset field so that the value that is copied/added 3229 // for the voffset field is a multiple of 4096, and it stands more chance 3230 // of being CSEd with the copy/add for another similar load/store. 3231 // However, do not do that rounding down to a multiple of 4096 if that is a 3232 // negative number, as it appears to be illegal to have a negative offset 3233 // in the vgpr, even if adding the immediate offset makes it positive. 3234 unsigned Overflow = ImmOffset & ~MaxImm; 3235 ImmOffset -= Overflow; 3236 if ((int32_t)Overflow < 0) { 3237 Overflow += ImmOffset; 3238 ImmOffset = 0; 3239 } 3240 3241 if (Overflow != 0) { 3242 if (!BaseReg) { 3243 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3244 } else { 3245 auto OverflowVal = B.buildConstant(S32, Overflow); 3246 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3247 } 3248 } 3249 3250 if (!BaseReg) 3251 BaseReg = B.buildConstant(S32, 0).getReg(0); 3252 3253 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3254 } 3255 3256 /// Handle register layout difference for f16 images for some subtargets. 3257 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3258 MachineRegisterInfo &MRI, 3259 Register Reg) const { 3260 if (!ST.hasUnpackedD16VMem()) 3261 return Reg; 3262 3263 const LLT S16 = LLT::scalar(16); 3264 const LLT S32 = LLT::scalar(32); 3265 LLT StoreVT = MRI.getType(Reg); 3266 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3267 3268 auto Unmerge = B.buildUnmerge(S16, Reg); 3269 3270 SmallVector<Register, 4> WideRegs; 3271 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3272 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3273 3274 int NumElts = StoreVT.getNumElements(); 3275 3276 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3277 } 3278 3279 Register AMDGPULegalizerInfo::fixStoreSourceType( 3280 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3281 MachineRegisterInfo *MRI = B.getMRI(); 3282 LLT Ty = MRI->getType(VData); 3283 3284 const LLT S16 = LLT::scalar(16); 3285 3286 // Fixup illegal register types for i8 stores. 
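  // Sub-dword (s8/s16) sources are any-extended to 32 bits since the byte and
  // short buffer-store instructions consume a full 32-bit VGPR; packed 16-bit
  // vectors may additionally be repacked for subtargets with unpacked D16
  // memory instructions (handleD16VData).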
3287 if (Ty == LLT::scalar(8) || Ty == S16) { 3288 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3289 return AnyExt; 3290 } 3291 3292 if (Ty.isVector()) { 3293 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3294 if (IsFormat) 3295 return handleD16VData(B, *MRI, VData); 3296 } 3297 } 3298 3299 return VData; 3300 } 3301 3302 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3303 MachineRegisterInfo &MRI, 3304 MachineIRBuilder &B, 3305 bool IsTyped, 3306 bool IsFormat) const { 3307 Register VData = MI.getOperand(1).getReg(); 3308 LLT Ty = MRI.getType(VData); 3309 LLT EltTy = Ty.getScalarType(); 3310 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3311 const LLT S32 = LLT::scalar(32); 3312 3313 VData = fixStoreSourceType(B, VData, IsFormat); 3314 Register RSrc = MI.getOperand(2).getReg(); 3315 3316 MachineMemOperand *MMO = *MI.memoperands_begin(); 3317 const int MemSize = MMO->getSize(); 3318 3319 unsigned ImmOffset; 3320 unsigned TotalOffset; 3321 3322 // The typed intrinsics add an immediate after the registers. 3323 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3324 3325 // The struct intrinsic variants add one additional operand over raw. 3326 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3327 Register VIndex; 3328 int OpOffset = 0; 3329 if (HasVIndex) { 3330 VIndex = MI.getOperand(3).getReg(); 3331 OpOffset = 1; 3332 } 3333 3334 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3335 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3336 3337 unsigned Format = 0; 3338 if (IsTyped) { 3339 Format = MI.getOperand(5 + OpOffset).getImm(); 3340 ++OpOffset; 3341 } 3342 3343 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3344 3345 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3346 if (TotalOffset != 0) 3347 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3348 3349 unsigned Opc; 3350 if (IsTyped) { 3351 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3352 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3353 } else if (IsFormat) { 3354 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3355 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3356 } else { 3357 switch (MemSize) { 3358 case 1: 3359 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3360 break; 3361 case 2: 3362 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3363 break; 3364 default: 3365 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3366 break; 3367 } 3368 } 3369 3370 if (!VIndex) 3371 VIndex = B.buildConstant(S32, 0).getReg(0); 3372 3373 auto MIB = B.buildInstr(Opc) 3374 .addUse(VData) // vdata 3375 .addUse(RSrc) // rsrc 3376 .addUse(VIndex) // vindex 3377 .addUse(VOffset) // voffset 3378 .addUse(SOffset) // soffset 3379 .addImm(ImmOffset); // offset(imm) 3380 3381 if (IsTyped) 3382 MIB.addImm(Format); 3383 3384 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3385 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3386 .addMemOperand(MMO); 3387 3388 MI.eraseFromParent(); 3389 return true; 3390 } 3391 3392 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3393 MachineRegisterInfo &MRI, 3394 MachineIRBuilder &B, 3395 bool IsFormat, 3396 bool IsTyped) const { 3397 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
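// Operand layout after the result and intrinsic ID: rsrc, then vindex for the
// struct variants, then voffset and soffset, then a format immediate for the
// typed variants, and finally the auxiliary/cachepolicy immediate.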
3398 MachineMemOperand *MMO = *MI.memoperands_begin();
3399 const int MemSize = MMO->getSize();
3400 const LLT S32 = LLT::scalar(32);
3401
3402 Register Dst = MI.getOperand(0).getReg();
3403 Register RSrc = MI.getOperand(2).getReg();
3404
3405 // The typed intrinsics add an immediate after the registers.
3406 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3407
3408 // The struct intrinsic variants add one additional operand over raw.
3409 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3410 Register VIndex;
3411 int OpOffset = 0;
3412 if (HasVIndex) {
3413 VIndex = MI.getOperand(3).getReg();
3414 OpOffset = 1;
3415 }
3416
3417 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3418 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3419
3420 unsigned Format = 0;
3421 if (IsTyped) {
3422 Format = MI.getOperand(5 + OpOffset).getImm();
3423 ++OpOffset;
3424 }
3425
3426 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3427 unsigned ImmOffset;
3428 unsigned TotalOffset;
3429
3430 LLT Ty = MRI.getType(Dst);
3431 LLT EltTy = Ty.getScalarType();
3432 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3433 const bool Unpacked = ST.hasUnpackedD16VMem();
3434
3435 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3436 if (TotalOffset != 0)
3437 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3438
3439 unsigned Opc;
3440
3441 if (IsTyped) {
3442 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3443 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3444 } else if (IsFormat) {
3445 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3446 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3447 } else {
3448 switch (MemSize) {
3449 case 1:
3450 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3451 break;
3452 case 2:
3453 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3454 break;
3455 default:
3456 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3457 break;
3458 }
3459 }
3460
3461 Register LoadDstReg;
3462
3463 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3464 LLT UnpackedTy = Ty.changeElementSize(32);
3465
3466 if (IsExtLoad)
3467 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3468 else if (Unpacked && IsD16 && Ty.isVector())
3469 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3470 else
3471 LoadDstReg = Dst;
3472
3473 if (!VIndex)
3474 VIndex = B.buildConstant(S32, 0).getReg(0);
3475
3476 auto MIB = B.buildInstr(Opc)
3477 .addDef(LoadDstReg) // vdata
3478 .addUse(RSrc) // rsrc
3479 .addUse(VIndex) // vindex
3480 .addUse(VOffset) // voffset
3481 .addUse(SOffset) // soffset
3482 .addImm(ImmOffset); // offset(imm)
3483
3484 if (IsTyped)
3485 MIB.addImm(Format);
3486
3487 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3488 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3489 .addMemOperand(MMO);
3490
3491 if (LoadDstReg != Dst) {
3492 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3493
3494 // The load result was widened; narrow it back to the original result type.
3495 if (IsExtLoad) 3496 B.buildTrunc(Dst, LoadDstReg); 3497 else { 3498 // Repack to original 16-bit vector result 3499 // FIXME: G_TRUNC should work, but legalization currently fails 3500 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3501 SmallVector<Register, 4> Repack; 3502 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3503 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3504 B.buildMerge(Dst, Repack); 3505 } 3506 } 3507 3508 MI.eraseFromParent(); 3509 return true; 3510 } 3511 3512 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3513 MachineIRBuilder &B, 3514 bool IsInc) const { 3515 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3516 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3517 B.buildInstr(Opc) 3518 .addDef(MI.getOperand(0).getReg()) 3519 .addUse(MI.getOperand(2).getReg()) 3520 .addUse(MI.getOperand(3).getReg()) 3521 .cloneMemRefs(MI); 3522 MI.eraseFromParent(); 3523 return true; 3524 } 3525 3526 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3527 switch (IntrID) { 3528 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3529 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3531 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3532 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3533 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3534 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3535 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3536 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3537 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3538 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3539 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3540 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3541 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3542 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3543 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3544 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3545 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3546 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3547 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3548 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3549 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3550 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3551 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3552 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3553 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3554 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3555 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3556 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3557 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3558 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3559 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3560 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3561 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3562 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3563 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3564 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3565 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3566 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3567 default: 3568 llvm_unreachable("unhandled atomic opcode"); 3569 } 3570 } 3571 3572 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3573 MachineIRBuilder &B, 3574 Intrinsic::ID IID) const { 3575 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3576 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3577 3578 Register Dst = MI.getOperand(0).getReg(); 3579 Register VData = MI.getOperand(2).getReg(); 3580 3581 Register CmpVal; 
3582 int OpOffset = 0; 3583 3584 if (IsCmpSwap) { 3585 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3586 ++OpOffset; 3587 } 3588 3589 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3590 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3591 3592 // The struct intrinsic variants add one additional operand over raw. 3593 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3594 Register VIndex; 3595 if (HasVIndex) { 3596 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3597 ++OpOffset; 3598 } 3599 3600 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3601 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3602 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3603 3604 MachineMemOperand *MMO = *MI.memoperands_begin(); 3605 3606 unsigned ImmOffset; 3607 unsigned TotalOffset; 3608 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3609 if (TotalOffset != 0) 3610 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3611 3612 if (!VIndex) 3613 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3614 3615 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3616 .addDef(Dst) 3617 .addUse(VData); // vdata 3618 3619 if (IsCmpSwap) 3620 MIB.addReg(CmpVal); 3621 3622 MIB.addUse(RSrc) // rsrc 3623 .addUse(VIndex) // vindex 3624 .addUse(VOffset) // voffset 3625 .addUse(SOffset) // soffset 3626 .addImm(ImmOffset) // offset(imm) 3627 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3628 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3629 .addMemOperand(MMO); 3630 3631 MI.eraseFromParent(); 3632 return true; 3633 } 3634 3635 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3636 /// vector with s16 typed elements. 3637 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3638 SmallVectorImpl<Register> &PackedAddrs, 3639 int AddrIdx, int DimIdx, int EndIdx, 3640 int NumGradients) { 3641 const LLT S16 = LLT::scalar(16); 3642 const LLT V2S16 = LLT::vector(2, 16); 3643 3644 for (int I = AddrIdx; I < EndIdx; ++I) { 3645 MachineOperand &SrcOp = MI.getOperand(I); 3646 if (!SrcOp.isReg()) 3647 continue; // _L to _LZ may have eliminated this. 3648 3649 Register AddrReg = SrcOp.getReg(); 3650 3651 if (I < DimIdx) { 3652 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3653 PackedAddrs.push_back(AddrReg); 3654 } else { 3655 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3656 // derivatives dx/dh and dx/dv are packed with undef. 3657 if (((I + 1) >= EndIdx) || 3658 ((NumGradients / 2) % 2 == 1 && 3659 (I == DimIdx + (NumGradients / 2) - 1 || 3660 I == DimIdx + NumGradients - 1)) || 3661 // Check for _L to _LZ optimization 3662 !MI.getOperand(I + 1).isReg()) { 3663 PackedAddrs.push_back( 3664 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3665 .getReg(0)); 3666 } else { 3667 PackedAddrs.push_back( 3668 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3669 .getReg(0)); 3670 ++I; 3671 } 3672 } 3673 } 3674 } 3675 3676 /// Convert from separate vaddr components to a single vector address register, 3677 /// and replace the remaining operands with $noreg. 
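/// For example (illustrative MIR), three separate s32 vaddr operands x, y, z
/// become a single operand:
///   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %x:_(s32), %y:_(s32), %z:_(s32)
/// and the now-unused trailing vaddr operands are set to $noreg.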
3678 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3679 int DimIdx, int NumVAddrs) {
3680 const LLT S32 = LLT::scalar(32);
3681
3682 SmallVector<Register, 8> AddrRegs;
3683 for (int I = 0; I != NumVAddrs; ++I) {
3684 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3685 if (SrcOp.isReg()) {
3686 AddrRegs.push_back(SrcOp.getReg());
3687 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3688 }
3689 }
3690
3691 int NumAddrRegs = AddrRegs.size();
3692 if (NumAddrRegs != 1) {
3693 // Round up to 8 elements for v5-v7
3694 // FIXME: Missing intermediate sized register classes and instructions.
3695 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3696 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3697 auto Undef = B.buildUndef(S32);
3698 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3699 NumAddrRegs = RoundedNumRegs;
3700 }
3701
3702 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3703 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3704 }
3705
3706 for (int I = 1; I != NumVAddrs; ++I) {
3707 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3708 if (SrcOp.isReg())
3709 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3710 }
3711 }
3712
3713 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3714 ///
3715 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3716 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3717 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3718 /// registers.
3719 ///
3720 /// We don't want to directly select image instructions just yet, but also want
3721 /// to expose all register repacking to the legalizer/combiners. We also don't
3722 /// want a selected instruction entering RegBankSelect. In order to avoid
3723 /// defining a multitude of intermediate image instructions, directly hack on
3724 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3725 /// now unnecessary arguments with $noreg.
3726 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3727 MachineInstr &MI, MachineIRBuilder &B,
3728 GISelChangeObserver &Observer,
3729 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3730
3731 const int NumDefs = MI.getNumExplicitDefs();
3732 bool IsTFE = NumDefs == 2;
3733 // We are only processing the operands of d16 image operations on subtargets
3734 // that use the unpacked register layout, or need to repack the TFE result.
3735
3736 // TODO: Do we need to guard against already legalized intrinsics?
3737 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3738 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3739
3740 MachineRegisterInfo *MRI = B.getMRI();
3741 const LLT S32 = LLT::scalar(32);
3742 const LLT S16 = LLT::scalar(16);
3743 const LLT V2S16 = LLT::vector(2, 16);
3744
3745 // Index of first address argument
3746 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3747
3748 int NumVAddrs, NumGradients;
3749 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3750 const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3751 getDMaskIdx(BaseOpcode, NumDefs);
3752 unsigned DMask = 0;
3753
3754 // Check for 16-bit addresses and pack them if so.
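// The gradient components (if any) come before the coordinate components in
// the vaddr list, so the gradient element type is read at DimIdx and the
// coordinate element type at DimIdx + NumGradients.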
3755 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3756 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3757 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3758 const bool IsG16 = GradTy == S16;
3759 const bool IsA16 = AddrTy == S16;
3760
3761 int DMaskLanes = 0;
3762 if (!BaseOpcode->Atomic) {
3763 DMask = MI.getOperand(DMaskIdx).getImm();
3764 if (BaseOpcode->Gather4) {
3765 DMaskLanes = 4;
3766 } else if (DMask != 0) {
3767 DMaskLanes = countPopulation(DMask);
3768 } else if (!IsTFE && !BaseOpcode->Store) {
3769 // If dmask is 0, this is a no-op load. This can be eliminated.
3770 B.buildUndef(MI.getOperand(0));
3771 MI.eraseFromParent();
3772 return true;
3773 }
3774 }
3775
3776 Observer.changingInstr(MI);
3777 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3778
3779 unsigned NewOpcode = NumDefs == 0 ?
3780 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3781
3782 // Track that we legalized this
3783 MI.setDesc(B.getTII().get(NewOpcode));
3784
3785 // We expect to get an error flag since TFE is on and dmask is 0. Force
3786 // dmask to be at least 1, otherwise the instruction will fail.
3787 if (IsTFE && DMask == 0) {
3788 DMask = 0x1;
3789 DMaskLanes = 1;
3790 MI.getOperand(DMaskIdx).setImm(DMask);
3791 }
3792
3793 if (BaseOpcode->Atomic) {
3794 Register VData0 = MI.getOperand(2).getReg();
3795 LLT Ty = MRI->getType(VData0);
3796
3797 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3798 if (Ty.isVector())
3799 return false;
3800
3801 if (BaseOpcode->AtomicX2) {
3802 Register VData1 = MI.getOperand(3).getReg();
3803 // The two values are packed in one register.
3804 LLT PackedTy = LLT::vector(2, Ty);
3805 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3806 MI.getOperand(2).setReg(Concat.getReg(0));
3807 MI.getOperand(3).setReg(AMDGPU::NoRegister);
3808 }
3809 }
3810
3811 int CorrectedNumVAddrs = NumVAddrs;
3812
3813 // Optimize _L to _LZ when _L is zero
3814 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3815 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3816 const ConstantFP *ConstantLod;
3817 const int LodIdx = AddrIdx + NumVAddrs - 1;
3818
3819 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3820 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3821 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3822 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3823 LZMappingInfo->LZ, ImageDimIntr->Dim);
3824
3825 // The starting indexes should remain in the same place.
3826 --NumVAddrs;
3827 --CorrectedNumVAddrs;
3828
3829 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3830 static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3831 MI.RemoveOperand(LodIdx);
3832 }
3833 }
3834 }
3835
3836 // Optimize _mip away when 'lod' is zero
3837 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3838 int64_t ConstantLod;
3839 const int LodIdx = AddrIdx + NumVAddrs - 1;
3840
3841 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3842 if (ConstantLod == 0) {
3843 // TODO: Change intrinsic opcode and remove operand instead of replacing
3844 // it with 0, as the _L to _LZ handling is done above.
3845 MI.getOperand(LodIdx).ChangeToImmediate(0);
3846 --CorrectedNumVAddrs;
3847 }
3848 }
3849 }
3850
3851 // Rewrite the addressing register layout before doing anything else.
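// With A16, the 16-bit coordinates are packed pairwise into <2 x s16>
// registers; with G16 alone, only the gradient components are packed and the
// remaining 32-bit coordinates are appended unpacked.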
3852 if (IsA16 || IsG16) { 3853 if (IsA16) { 3854 // Target must support the feature and gradients need to be 16 bit too 3855 if (!ST.hasA16() || !IsG16) 3856 return false; 3857 } else if (!ST.hasG16()) 3858 return false; 3859 3860 if (NumVAddrs > 1) { 3861 SmallVector<Register, 4> PackedRegs; 3862 // Don't compress addresses for G16 3863 const int PackEndIdx = 3864 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3865 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3866 PackEndIdx, NumGradients); 3867 3868 if (!IsA16) { 3869 // Add uncompressed address 3870 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3871 int AddrReg = MI.getOperand(I).getReg(); 3872 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3873 PackedRegs.push_back(AddrReg); 3874 } 3875 } 3876 3877 // See also below in the non-a16 branch 3878 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3879 3880 if (!UseNSA && PackedRegs.size() > 1) { 3881 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3882 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3883 PackedRegs[0] = Concat.getReg(0); 3884 PackedRegs.resize(1); 3885 } 3886 3887 const int NumPacked = PackedRegs.size(); 3888 for (int I = 0; I != NumVAddrs; ++I) { 3889 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3890 if (!SrcOp.isReg()) { 3891 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3892 continue; 3893 } 3894 3895 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3896 3897 if (I < NumPacked) 3898 SrcOp.setReg(PackedRegs[I]); 3899 else 3900 SrcOp.setReg(AMDGPU::NoRegister); 3901 } 3902 } 3903 } else { 3904 // If the register allocator cannot place the address registers contiguously 3905 // without introducing moves, then using the non-sequential address encoding 3906 // is always preferable, since it saves VALU instructions and is usually a 3907 // wash in terms of code size or even better. 3908 // 3909 // However, we currently have no way of hinting to the register allocator 3910 // that MIMG addresses should be placed contiguously when it is possible to 3911 // do so, so force non-NSA for the common 2-address case as a heuristic. 3912 // 3913 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3914 // allocation when possible. 3915 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3916 3917 if (!UseNSA && NumVAddrs > 1) 3918 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3919 } 3920 3921 int Flags = 0; 3922 if (IsA16) 3923 Flags |= 1; 3924 if (IsG16) 3925 Flags |= 2; 3926 MI.addOperand(MachineOperand::CreateImm(Flags)); 3927 3928 if (BaseOpcode->Store) { // No TFE for stores? 3929 // TODO: Handle dmask trim 3930 Register VData = MI.getOperand(1).getReg(); 3931 LLT Ty = MRI->getType(VData); 3932 if (!Ty.isVector() || Ty.getElementType() != S16) 3933 return true; 3934 3935 Register RepackedReg = handleD16VData(B, *MRI, VData); 3936 if (RepackedReg != VData) { 3937 MI.getOperand(1).setReg(RepackedReg); 3938 } 3939 3940 return true; 3941 } 3942 3943 Register DstReg = MI.getOperand(0).getReg(); 3944 LLT Ty = MRI->getType(DstReg); 3945 const LLT EltTy = Ty.getScalarType(); 3946 const bool IsD16 = Ty.getScalarType() == S16; 3947 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3948
3949 // Confirm that the return type is large enough for the dmask specified
3950 if (NumElts < DMaskLanes)
3951 return false;
3952
3953 if (NumElts > 4 || DMaskLanes > 4)
3954 return false;
3955
3956 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3957 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3958
3959 // The raw dword-aligned data component of the load. The only legal cases
3960 // where this matters should be when using the packed D16 format, for
3961 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3962 LLT RoundedTy;
3963
3964 // S32 vector to cover all data, plus the TFE result element.
3965 LLT TFETy;
3966
3967 // Register type to use for each loaded component. Will be S32 or V2S16.
3968 LLT RegTy;
3969
3970 if (IsD16 && ST.hasUnpackedD16VMem()) {
3971 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3972 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3973 RegTy = S32;
3974 } else {
3975 unsigned EltSize = EltTy.getSizeInBits();
3976 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3977 unsigned RoundedSize = 32 * RoundedElts;
3978 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3979 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3980 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3981 }
3982
3983 // The return type does not need adjustment.
3984 // TODO: Should we change s16 case to s32 or <2 x s16>?
3985 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3986 return true;
3987
3988 Register Dst1Reg;
3989
3990 // Insert after the instruction.
3991 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3992
3993 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3994 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3995 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3996 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3997
3998 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3999
4000 MI.getOperand(0).setReg(NewResultReg);
4001
4002 // In the IR, TFE is supposed to be used with a 2 element struct return
4003 // type. The instruction really returns these two values in one contiguous
4004 // register, with one additional dword beyond the loaded data. Rewrite the
4005 // return type to use a single register result.
4006
4007 if (IsTFE) {
4008 Dst1Reg = MI.getOperand(1).getReg();
4009 if (MRI->getType(Dst1Reg) != S32)
4010 return false;
4011
4012 // TODO: Make sure the TFE operand bit is set.
4013 MI.RemoveOperand(1);
4014
4015 // Handle the easy case that requires no repack instructions.
4016 if (Ty == S32) {
4017 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4018 return true;
4019 }
4020 }
4021
4022 // Now figure out how to copy the new result register back into the old
4023 // result.
4024 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4025
4026 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4027
4028 if (ResultNumRegs == 1) {
4029 assert(!IsTFE);
4030 ResultRegs[0] = NewResultReg;
4031 } else {
4032 // We have to repack into a new vector of some kind.
4033 for (int I = 0; I != NumDataRegs; ++I)
4034 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4035 B.buildUnmerge(ResultRegs, NewResultReg);
4036
4037 // Drop the final TFE element to get the data part. The TFE result is
4038 // directly written to the right place already.
4039 if (IsTFE)
4040 ResultRegs.resize(NumDataRegs);
4041 }
4042
4043 // For an s16 scalar result, we form an s32 result with a truncate regardless
4044 // of packed vs. unpacked.
4045 if (IsD16 && !Ty.isVector()) {
4046 B.buildTrunc(DstReg, ResultRegs[0]);
4047 return true;
4048 }
4049
4050 // Avoid a build/concat_vector of 1 entry.
4051 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4052 B.buildBitcast(DstReg, ResultRegs[0]);
4053 return true;
4054 }
4055
4056 assert(Ty.isVector());
4057
4058 if (IsD16) {
4059 // For packed D16 results with TFE enabled, all the data components are
4060 // S32. Cast back to the expected type.
4061 //
4062 // TODO: We don't really need to load s32 elements. We would only need one
4063 // cast for the TFE result if a multiple of v2s16 was used.
4064 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4065 for (Register &Reg : ResultRegs)
4066 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4067 } else if (ST.hasUnpackedD16VMem()) {
4068 for (Register &Reg : ResultRegs)
4069 Reg = B.buildTrunc(S16, Reg).getReg(0);
4070 }
4071 }
4072
4073 auto padWithUndef = [&](LLT Ty, int NumElts) {
4074 if (NumElts == 0)
4075 return;
4076 Register Undef = B.buildUndef(Ty).getReg(0);
4077 for (int I = 0; I != NumElts; ++I)
4078 ResultRegs.push_back(Undef);
4079 };
4080
4081 // Pad out any elements eliminated due to the dmask.
4082 LLT ResTy = MRI->getType(ResultRegs[0]);
4083 if (!ResTy.isVector()) {
4084 padWithUndef(ResTy, NumElts - ResultRegs.size());
4085 B.buildBuildVector(DstReg, ResultRegs);
4086 return true;
4087 }
4088
4089 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4090 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4091
4092 // Deal with the one annoying legal case.
4093 const LLT V3S16 = LLT::vector(3, 16);
4094 if (Ty == V3S16) {
4095 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4096 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4097 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4098 return true;
4099 }
4100
4101 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4102 B.buildConcatVectors(DstReg, ResultRegs);
4103 return true;
4104 }
4105
4106 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4107 MachineInstr &MI, MachineIRBuilder &B,
4108 GISelChangeObserver &Observer) const {
4109 Register Dst = MI.getOperand(0).getReg();
4110 LLT Ty = B.getMRI()->getType(Dst);
4111 unsigned Size = Ty.getSizeInBits();
4112 MachineFunction &MF = B.getMF();
4113
4114 Observer.changingInstr(MI);
4115
4116 // FIXME: We don't really need this intermediate instruction. The intrinsic
4117 // should be fixed to have a memory operand. Since it's readnone, we're not
4118 // allowed to add one.
4119 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4120 MI.RemoveOperand(1); // Remove intrinsic ID
4121
4122 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4123 // TODO: Should this use datalayout alignment?
4124 const unsigned MemSize = (Size + 7) / 8;
4125 const Align MemAlign(4);
4126 MachineMemOperand *MMO = MF.getMachineMemOperand(
4127 MachinePointerInfo(),
4128 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4129 MachineMemOperand::MOInvariant,
4130 MemSize, MemAlign);
4131 MI.addMemOperand(MF, MMO);
4132
4133 // There are no 96-bit result scalar loads, but widening to 128-bit should
4134 // always be legal.
We may need to restore this to a 96-bit result if it turns
4135 // out this needs to be converted to a vector load during RegBankSelect.
4136 if (!isPowerOf2_32(Size)) {
4137 LegalizerHelper Helper(MF, *this, Observer, B);
4138
4139 if (Ty.isVector())
4140 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4141 else
4142 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4143 }
4144
4145 Observer.changedInstr(MI);
4146 return true;
4147 }
4148
4149 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4150 MachineRegisterInfo &MRI,
4151 MachineIRBuilder &B) const {
4152 // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4153 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4154 !ST.isTrapHandlerEnabled()) {
4155 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4156 } else {
4157 // Pass the queue pointer to the trap handler as input, and insert a trap instruction.
4158 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4159 MachineRegisterInfo &MRI = *B.getMRI();
4160 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4161 Register LiveIn = getLiveInRegister(
4162 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4163 /*InsertLiveInCopy=*/false);
4164 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4165 return false;
4166 B.buildCopy(SGPR01, LiveIn);
4167 B.buildInstr(AMDGPU::S_TRAP)
4168 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4169 .addReg(SGPR01, RegState::Implicit);
4170 }
4171
4172 MI.eraseFromParent();
4173 return true;
4174 }
4175
4176 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4177 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4178 // If this is a non-HSA path or the trap handler is disabled, report a warning
4179 // accordingly.
4180 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4181 !ST.isTrapHandlerEnabled()) {
4182 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4183 "debugtrap handler not supported",
4184 MI.getDebugLoc(), DS_Warning);
4185 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4186 Ctx.diagnose(NoTrap);
4187 } else {
4188 // Insert debug-trap instruction
4189 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4190 }
4191
4192 MI.eraseFromParent();
4193 return true;
4194 }
4195
4196 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4197 MachineInstr &MI) const {
4198 MachineIRBuilder &B = Helper.MIRBuilder;
4199 MachineRegisterInfo &MRI = *B.getMRI();
4200
4201 // Replace the G_BRCOND user with the exec manipulation and branch pseudos.
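// The amdgcn_if/else/loop intrinsics below are lowered to the SI_IF, SI_ELSE
// and SI_LOOP pseudos; each pseudo branches to the unconditional successor,
// and the original branch is retargeted to the conditional successor.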
4202 auto IntrID = MI.getIntrinsicID(); 4203 switch (IntrID) { 4204 case Intrinsic::amdgcn_if: 4205 case Intrinsic::amdgcn_else: { 4206 MachineInstr *Br = nullptr; 4207 MachineBasicBlock *UncondBrTarget = nullptr; 4208 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4209 const SIRegisterInfo *TRI 4210 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4211 4212 Register Def = MI.getOperand(1).getReg(); 4213 Register Use = MI.getOperand(3).getReg(); 4214 4215 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4216 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4217 if (IntrID == Intrinsic::amdgcn_if) { 4218 B.buildInstr(AMDGPU::SI_IF) 4219 .addDef(Def) 4220 .addUse(Use) 4221 .addMBB(UncondBrTarget); 4222 } else { 4223 B.buildInstr(AMDGPU::SI_ELSE) 4224 .addDef(Def) 4225 .addUse(Use) 4226 .addMBB(UncondBrTarget) 4227 .addImm(0); 4228 } 4229 4230 if (Br) { 4231 Br->getOperand(0).setMBB(CondBrTarget); 4232 } else { 4233 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4234 // since we're swapping branch targets it needs to be reinserted. 4235 // FIXME: IRTranslator should probably not do this 4236 B.buildBr(*CondBrTarget); 4237 } 4238 4239 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4240 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4241 MI.eraseFromParent(); 4242 BrCond->eraseFromParent(); 4243 return true; 4244 } 4245 4246 return false; 4247 } 4248 case Intrinsic::amdgcn_loop: { 4249 MachineInstr *Br = nullptr; 4250 MachineBasicBlock *UncondBrTarget = nullptr; 4251 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4252 const SIRegisterInfo *TRI 4253 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4254 4255 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4256 Register Reg = MI.getOperand(2).getReg(); 4257 4258 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4259 B.buildInstr(AMDGPU::SI_LOOP) 4260 .addUse(Reg) 4261 .addMBB(UncondBrTarget); 4262 4263 if (Br) 4264 Br->getOperand(0).setMBB(CondBrTarget); 4265 else 4266 B.buildBr(*CondBrTarget); 4267 4268 MI.eraseFromParent(); 4269 BrCond->eraseFromParent(); 4270 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4271 return true; 4272 } 4273 4274 return false; 4275 } 4276 case Intrinsic::amdgcn_kernarg_segment_ptr: 4277 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4278 // This only makes sense to call in a kernel, so just lower to null. 
4279 B.buildConstant(MI.getOperand(0).getReg(), 0); 4280 MI.eraseFromParent(); 4281 return true; 4282 } 4283 4284 return legalizePreloadedArgIntrin( 4285 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4286 case Intrinsic::amdgcn_implicitarg_ptr: 4287 return legalizeImplicitArgPtr(MI, MRI, B); 4288 case Intrinsic::amdgcn_workitem_id_x: 4289 return legalizePreloadedArgIntrin(MI, MRI, B, 4290 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4291 case Intrinsic::amdgcn_workitem_id_y: 4292 return legalizePreloadedArgIntrin(MI, MRI, B, 4293 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4294 case Intrinsic::amdgcn_workitem_id_z: 4295 return legalizePreloadedArgIntrin(MI, MRI, B, 4296 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4297 case Intrinsic::amdgcn_workgroup_id_x: 4298 return legalizePreloadedArgIntrin(MI, MRI, B, 4299 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4300 case Intrinsic::amdgcn_workgroup_id_y: 4301 return legalizePreloadedArgIntrin(MI, MRI, B, 4302 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4303 case Intrinsic::amdgcn_workgroup_id_z: 4304 return legalizePreloadedArgIntrin(MI, MRI, B, 4305 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4306 case Intrinsic::amdgcn_dispatch_ptr: 4307 return legalizePreloadedArgIntrin(MI, MRI, B, 4308 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4309 case Intrinsic::amdgcn_queue_ptr: 4310 return legalizePreloadedArgIntrin(MI, MRI, B, 4311 AMDGPUFunctionArgInfo::QUEUE_PTR); 4312 case Intrinsic::amdgcn_implicit_buffer_ptr: 4313 return legalizePreloadedArgIntrin( 4314 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4315 case Intrinsic::amdgcn_dispatch_id: 4316 return legalizePreloadedArgIntrin(MI, MRI, B, 4317 AMDGPUFunctionArgInfo::DISPATCH_ID); 4318 case Intrinsic::amdgcn_fdiv_fast: 4319 return legalizeFDIVFastIntrin(MI, MRI, B); 4320 case Intrinsic::amdgcn_is_shared: 4321 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4322 case Intrinsic::amdgcn_is_private: 4323 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4324 case Intrinsic::amdgcn_wavefrontsize: { 4325 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4326 MI.eraseFromParent(); 4327 return true; 4328 } 4329 case Intrinsic::amdgcn_s_buffer_load: 4330 return legalizeSBufferLoad(MI, B, Helper.Observer); 4331 case Intrinsic::amdgcn_raw_buffer_store: 4332 case Intrinsic::amdgcn_struct_buffer_store: 4333 return legalizeBufferStore(MI, MRI, B, false, false); 4334 case Intrinsic::amdgcn_raw_buffer_store_format: 4335 case Intrinsic::amdgcn_struct_buffer_store_format: 4336 return legalizeBufferStore(MI, MRI, B, false, true); 4337 case Intrinsic::amdgcn_raw_tbuffer_store: 4338 case Intrinsic::amdgcn_struct_tbuffer_store: 4339 return legalizeBufferStore(MI, MRI, B, true, true); 4340 case Intrinsic::amdgcn_raw_buffer_load: 4341 case Intrinsic::amdgcn_struct_buffer_load: 4342 return legalizeBufferLoad(MI, MRI, B, false, false); 4343 case Intrinsic::amdgcn_raw_buffer_load_format: 4344 case Intrinsic::amdgcn_struct_buffer_load_format: 4345 return legalizeBufferLoad(MI, MRI, B, true, false); 4346 case Intrinsic::amdgcn_raw_tbuffer_load: 4347 case Intrinsic::amdgcn_struct_tbuffer_load: 4348 return legalizeBufferLoad(MI, MRI, B, true, true); 4349 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4350 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4351 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4352 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4353 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4354 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4355 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4356 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4357 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4358 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4359 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4369 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4370 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4371 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4372 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4373 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4374 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4375 return legalizeBufferAtomic(MI, B, IntrID); 4376 case Intrinsic::amdgcn_atomic_inc: 4377 return legalizeAtomicIncDec(MI, B, true); 4378 case Intrinsic::amdgcn_atomic_dec: 4379 return legalizeAtomicIncDec(MI, B, false); 4380 case Intrinsic::trap: 4381 return legalizeTrapIntrinsic(MI, MRI, B); 4382 case Intrinsic::debugtrap: 4383 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4384 default: { 4385 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4386 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4387 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4388 return true; 4389 } 4390 } 4391 4392 return true; 4393 } 4394