//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
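// fewerEltsToSize64Vector example (illustrative): for <5 x s16> (80 bits) this
// picks 2 pieces and returns <3 x s16>; later rules continue legalizing the
// result.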

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
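// Register-type examples (illustrative): s32, s96, v2s16, v4s32 and v2s64 all
// qualify; v3s16 (48 bits) and v3s8 (24 bits) do not, since their total size
// is not a multiple of 32 bits.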

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
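
// Checks whether a G_LOAD / G_STORE (or extending load) fits what the selector
// can handle. For example (illustrative), an 8-bit or 16-bit memory access may
// only be extended into a 32-bit register; other size mismatches are rejected.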
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }
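
    // (Illustrative: without the clamp bit, an s16 G_UADDSAT is first widened
    // to s32 and then expanded by the generic lowering.)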

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();
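
  // (Illustrative: legalizeFMad keeps G_FMAD only when the function's floating
  // point / denormal mode allows the mad instructions; otherwise it is
  // expanded, roughly to fmul + fadd, by the generic lowering.)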

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
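    //
    // (Illustrative example: comparing two 64-bit global pointers is legal and
    // may produce either an s1 (VCC) or an s32 (SCC) result, chosen later by
    // RegBankSelect.)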
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
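      // (For example, a 160-bit access needs 5 dwords; 5 is not a power of
      // two, so the access is split.)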
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
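    //
    // (Illustrative example: a <6 x s16> access, 96 bits, is bitcast to
    // <3 x s32> by bitcastToRegisterType, which the selector already handles.)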
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
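      // (Illustrative: an s96 private store exceeds the 32-bit private limit
      // and is first narrowed to s64 here; the remaining pieces are then
      // re-legalized on later iterations.)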
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
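    // (Illustrative: a remaining <3 x s8> value, 24 bits, gains one element to
    // become <4 x s8>, 32 bits, via moreEltsToNext32Bit below.)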
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      // TODO: Clamp the number of elements before resorting to stack lowering.
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
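      // (e.g. extracting an s48 from an s96 is currently accepted by the rule
      // below.)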
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
1688 B.buildExtract(Dst, Src, 0); 1689 MI.eraseFromParent(); 1690 return true; 1691 } 1692 1693 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1694 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1695 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1696 1697 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1698 // another. Merge operands are required to be the same type, but creating an 1699 // extra ptrtoint would be kind of pointless. 1700 auto HighAddr = B.buildConstant( 1701 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1702 B.buildMerge(Dst, {Src, HighAddr}); 1703 MI.eraseFromParent(); 1704 return true; 1705 } 1706 1707 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1708 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1709 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1710 unsigned NullVal = TM.getNullPointerValue(DestAS); 1711 1712 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1713 auto FlatNull = B.buildConstant(SrcTy, 0); 1714 1715 // Extract low 32-bits of the pointer. 1716 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1717 1718 auto CmpRes = 1719 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1720 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1721 1722 MI.eraseFromParent(); 1723 return true; 1724 } 1725 1726 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1727 return false; 1728 1729 if (!ST.hasFlatAddressSpace()) 1730 return false; 1731 1732 auto SegmentNull = 1733 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1734 auto FlatNull = 1735 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1736 1737 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1738 if (!ApertureReg.isValid()) 1739 return false; 1740 1741 auto CmpRes = 1742 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1743 1744 // Coerce the type of the low half of the result so we can use merge_values. 1745 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1746 1747 // TODO: Should we allow mismatched types but matching sizes in merges to 1748 // avoid the ptrtoint? 1749 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1750 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1751 1752 MI.eraseFromParent(); 1753 return true; 1754 } 1755 1756 bool AMDGPULegalizerInfo::legalizeFrint( 1757 MachineInstr &MI, MachineRegisterInfo &MRI, 1758 MachineIRBuilder &B) const { 1759 Register Src = MI.getOperand(1).getReg(); 1760 LLT Ty = MRI.getType(Src); 1761 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1762 1763 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1764 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1765 1766 auto C1 = B.buildFConstant(Ty, C1Val); 1767 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1768 1769 // TODO: Should this propagate fast-math-flags? 
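  // The add/sub pair below is the usual 2^52 rounding trick: C1 is 2^52
  // (copysigned so it also works for negative inputs), and for |src| < 2^52
  // the sum has no fraction bits left, so the FP add itself rounds src to an
  // integer and the subtract recovers rint(src). Worked example: 3.7 + 2^52
  // rounds to 2^52 + 4, and subtracting 2^52 gives 4.0. Inputs with |src|
  // greater than C2 (about 2^52 - 0.5) are already integral and are passed
  // through unchanged by the final select.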
1770 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1771 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1772 1773 auto C2 = B.buildFConstant(Ty, C2Val); 1774 auto Fabs = B.buildFAbs(Ty, Src); 1775 1776 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1777 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1778 MI.eraseFromParent(); 1779 return true; 1780 } 1781 1782 bool AMDGPULegalizerInfo::legalizeFceil( 1783 MachineInstr &MI, MachineRegisterInfo &MRI, 1784 MachineIRBuilder &B) const { 1785 1786 const LLT S1 = LLT::scalar(1); 1787 const LLT S64 = LLT::scalar(64); 1788 1789 Register Src = MI.getOperand(1).getReg(); 1790 assert(MRI.getType(Src) == S64); 1791 1792 // result = trunc(src) 1793 // if (src > 0.0 && src != result) 1794 // result += 1.0 1795 1796 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1797 1798 const auto Zero = B.buildFConstant(S64, 0.0); 1799 const auto One = B.buildFConstant(S64, 1.0); 1800 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1801 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1802 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1803 auto Add = B.buildSelect(S64, And, One, Zero); 1804 1805 // TODO: Should this propagate fast-math-flags? 1806 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1807 return true; 1808 } 1809 1810 static MachineInstrBuilder extractF64Exponent(Register Hi, 1811 MachineIRBuilder &B) { 1812 const unsigned FractBits = 52; 1813 const unsigned ExpBits = 11; 1814 LLT S32 = LLT::scalar(32); 1815 1816 auto Const0 = B.buildConstant(S32, FractBits - 32); 1817 auto Const1 = B.buildConstant(S32, ExpBits); 1818 1819 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1820 .addUse(Hi) 1821 .addUse(Const0.getReg(0)) 1822 .addUse(Const1.getReg(0)); 1823 1824 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1825 } 1826 1827 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1828 MachineInstr &MI, MachineRegisterInfo &MRI, 1829 MachineIRBuilder &B) const { 1830 const LLT S1 = LLT::scalar(1); 1831 const LLT S32 = LLT::scalar(32); 1832 const LLT S64 = LLT::scalar(64); 1833 1834 Register Src = MI.getOperand(1).getReg(); 1835 assert(MRI.getType(Src) == S64); 1836 1837 // TODO: Should this use extract since the low half is unused? 1838 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1839 Register Hi = Unmerge.getReg(1); 1840 1841 // Extract the upper half, since this is where we will find the sign and 1842 // exponent. 1843 auto Exp = extractF64Exponent(Hi, B); 1844 1845 const unsigned FractBits = 52; 1846 1847 // Extract the sign bit. 1848 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1849 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1850 1851 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1852 1853 const auto Zero32 = B.buildConstant(S32, 0); 1854 1855 // Extend back to 64-bits.
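  // How the masking below truncates toward zero: shifting FractMask right by
  // the unbiased exponent leaves set bits only in the fraction positions that
  // lie below the binary point, so Src & ~Shr clears exactly the fractional
  // part. For example, with an exponent of 2 the low 50 fraction bits are
  // cleared. Exponents < 0 select a signed zero (SignBit64), and exponents
  // > 51 mean the value is already integral; both cases are handled by the
  // selects at the end.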
1856 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1857 1858 auto Shr = B.buildAShr(S64, FractMask, Exp); 1859 auto Not = B.buildNot(S64, Shr); 1860 auto Tmp0 = B.buildAnd(S64, Src, Not); 1861 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1862 1863 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1864 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1865 1866 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1867 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1868 MI.eraseFromParent(); 1869 return true; 1870 } 1871 1872 bool AMDGPULegalizerInfo::legalizeITOFP( 1873 MachineInstr &MI, MachineRegisterInfo &MRI, 1874 MachineIRBuilder &B, bool Signed) const { 1875 1876 Register Dst = MI.getOperand(0).getReg(); 1877 Register Src = MI.getOperand(1).getReg(); 1878 1879 const LLT S64 = LLT::scalar(64); 1880 const LLT S32 = LLT::scalar(32); 1881 1882 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1883 1884 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1885 1886 auto CvtHi = Signed ? 1887 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1888 B.buildUITOFP(S64, Unmerge.getReg(1)); 1889 1890 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1891 1892 auto ThirtyTwo = B.buildConstant(S32, 32); 1893 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1894 .addUse(CvtHi.getReg(0)) 1895 .addUse(ThirtyTwo.getReg(0)); 1896 1897 // TODO: Should this propagate fast-math-flags? 1898 B.buildFAdd(Dst, LdExp, CvtLo); 1899 MI.eraseFromParent(); 1900 return true; 1901 } 1902 1903 // TODO: Copied from DAG implementation. Verify logic and document how this 1904 // actually works. 1905 bool AMDGPULegalizerInfo::legalizeFPTOI( 1906 MachineInstr &MI, MachineRegisterInfo &MRI, 1907 MachineIRBuilder &B, bool Signed) const { 1908 1909 Register Dst = MI.getOperand(0).getReg(); 1910 Register Src = MI.getOperand(1).getReg(); 1911 1912 const LLT S64 = LLT::scalar(64); 1913 const LLT S32 = LLT::scalar(32); 1914 1915 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1916 1917 unsigned Flags = MI.getFlags(); 1918 1919 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1920 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1921 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1922 1923 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1924 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1925 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1926 1927 auto Hi = Signed ? 
1928 B.buildFPTOSI(S32, FloorMul) : 1929 B.buildFPTOUI(S32, FloorMul); 1930 auto Lo = B.buildFPTOUI(S32, Fma); 1931 1932 B.buildMerge(Dst, { Lo, Hi }); 1933 MI.eraseFromParent(); 1934 1935 return true; 1936 } 1937 1938 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1939 MachineInstr &MI) const { 1940 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1941 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1942 1943 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1944 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1945 1946 // With ieee_mode disabled, the instructions have the correct behavior 1947 // already for G_FMINNUM/G_FMAXNUM 1948 if (!MFI->getMode().IEEE) 1949 return !IsIEEEOp; 1950 1951 if (IsIEEEOp) 1952 return true; 1953 1954 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1955 } 1956 1957 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1958 MachineInstr &MI, MachineRegisterInfo &MRI, 1959 MachineIRBuilder &B) const { 1960 // TODO: Should move some of this into LegalizerHelper. 1961 1962 // TODO: Promote dynamic indexing of s16 to s32 1963 1964 // FIXME: Artifact combiner probably should have replaced the truncated 1965 // constant before this, so we shouldn't need 1966 // getConstantVRegValWithLookThrough. 1967 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1968 MI.getOperand(2).getReg(), MRI); 1969 if (!IdxVal) // Dynamic case will be selected to register indexing. 1970 return true; 1971 1972 Register Dst = MI.getOperand(0).getReg(); 1973 Register Vec = MI.getOperand(1).getReg(); 1974 1975 LLT VecTy = MRI.getType(Vec); 1976 LLT EltTy = VecTy.getElementType(); 1977 assert(EltTy == MRI.getType(Dst)); 1978 1979 if (IdxVal->Value < VecTy.getNumElements()) 1980 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1981 else 1982 B.buildUndef(Dst); 1983 1984 MI.eraseFromParent(); 1985 return true; 1986 } 1987 1988 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1989 MachineInstr &MI, MachineRegisterInfo &MRI, 1990 MachineIRBuilder &B) const { 1991 // TODO: Should move some of this into LegalizerHelper. 1992 1993 // TODO: Promote dynamic indexing of s16 to s32 1994 1995 // FIXME: Artifact combiner probably should have replaced the truncated 1996 // constant before this, so we shouldn't need 1997 // getConstantVRegValWithLookThrough. 1998 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1999 MI.getOperand(3).getReg(), MRI); 2000 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2001 return true; 2002 2003 Register Dst = MI.getOperand(0).getReg(); 2004 Register Vec = MI.getOperand(1).getReg(); 2005 Register Ins = MI.getOperand(2).getReg(); 2006 2007 LLT VecTy = MRI.getType(Vec); 2008 LLT EltTy = VecTy.getElementType(); 2009 assert(EltTy == MRI.getType(Ins)); 2010 2011 if (IdxVal->Value < VecTy.getNumElements()) 2012 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2013 else 2014 B.buildUndef(Dst); 2015 2016 MI.eraseFromParent(); 2017 return true; 2018 } 2019 2020 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2021 MachineInstr &MI, MachineRegisterInfo &MRI, 2022 MachineIRBuilder &B) const { 2023 const LLT V2S16 = LLT::vector(2, 16); 2024 2025 Register Dst = MI.getOperand(0).getReg(); 2026 Register Src0 = MI.getOperand(1).getReg(); 2027 LLT DstTy = MRI.getType(Dst); 2028 LLT SrcTy = MRI.getType(Src0); 2029 2030 if (SrcTy == V2S16 && DstTy == V2S16 && 2031 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2032 return true; 2033 2034 MachineIRBuilder HelperBuilder(MI); 2035 GISelObserverWrapper DummyObserver; 2036 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2037 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2038 } 2039 2040 bool AMDGPULegalizerInfo::legalizeSinCos( 2041 MachineInstr &MI, MachineRegisterInfo &MRI, 2042 MachineIRBuilder &B) const { 2043 2044 Register DstReg = MI.getOperand(0).getReg(); 2045 Register SrcReg = MI.getOperand(1).getReg(); 2046 LLT Ty = MRI.getType(DstReg); 2047 unsigned Flags = MI.getFlags(); 2048 2049 Register TrigVal; 2050 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2051 if (ST.hasTrigReducedRange()) { 2052 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2053 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2054 .addUse(MulVal.getReg(0)) 2055 .setMIFlags(Flags).getReg(0); 2056 } else 2057 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2058 2059 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2060 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2061 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2062 .addUse(TrigVal) 2063 .setMIFlags(Flags); 2064 MI.eraseFromParent(); 2065 return true; 2066 } 2067 2068 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2069 MachineIRBuilder &B, 2070 const GlobalValue *GV, 2071 int64_t Offset, 2072 unsigned GAFlags) const { 2073 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2074 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2075 // to the following code sequence: 2076 // 2077 // For constant address space: 2078 // s_getpc_b64 s[0:1] 2079 // s_add_u32 s0, s0, $symbol 2080 // s_addc_u32 s1, s1, 0 2081 // 2082 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2083 // a fixup or relocation is emitted to replace $symbol with a literal 2084 // constant, which is a pc-relative offset from the encoding of the $symbol 2085 // operand to the global variable. 
2086 // 2087 // For global address space: 2088 // s_getpc_b64 s[0:1] 2089 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2090 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2091 // 2092 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2093 // fixups or relocations are emitted to replace $symbol@*@lo and 2094 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2095 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2096 // operand to the global variable. 2097 // 2098 // What we want here is an offset from the value returned by s_getpc 2099 // (which is the address of the s_add_u32 instruction) to the global 2100 // variable, but since the encoding of $symbol starts 4 bytes after the start 2101 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2102 // small. This requires us to add 4 to the global variable offset in order to 2103 // compute the correct address. 2104 2105 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2106 2107 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2108 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2109 2110 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2111 .addDef(PCReg); 2112 2113 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2114 if (GAFlags == SIInstrInfo::MO_NONE) 2115 MIB.addImm(0); 2116 else 2117 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2118 2119 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2120 2121 if (PtrTy.getSizeInBits() == 32) 2122 B.buildExtract(DstReg, PCReg, 0); 2123 return true; 2124 } 2125 2126 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2127 MachineInstr &MI, MachineRegisterInfo &MRI, 2128 MachineIRBuilder &B) const { 2129 Register DstReg = MI.getOperand(0).getReg(); 2130 LLT Ty = MRI.getType(DstReg); 2131 unsigned AS = Ty.getAddressSpace(); 2132 2133 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2134 MachineFunction &MF = B.getMF(); 2135 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2136 2137 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2138 if (!MFI->isEntryFunction()) { 2139 const Function &Fn = MF.getFunction(); 2140 DiagnosticInfoUnsupported BadLDSDecl( 2141 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2142 DS_Warning); 2143 Fn.getContext().diagnose(BadLDSDecl); 2144 2145 // We currently don't have a way to correctly allocate LDS objects that 2146 // aren't directly associated with a kernel. We do force inlining of 2147 // functions that use local objects. However, if these dead functions are 2148 // not eliminated, we don't want a compile time error. Just emit a warning 2149 // and a trap, since there should be no callable path here. 2150 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2151 B.buildUndef(DstReg); 2152 MI.eraseFromParent(); 2153 return true; 2154 } 2155 2156 // TODO: We could emit code to handle the initialization somewhere. 
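    // Sketch of the two ways an LDS global without an initializer is handled
    // below: either the G_GLOBAL_VALUE is left in place tagged with
    // MO_ABS32_LO so selection materializes the absolute LDS address, or the
    // variable is assigned a static offset in the kernel's LDS block via
    // allocateLDSGlobal and the instruction is replaced with a G_CONSTANT of
    // that offset.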
2157 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2158 const SITargetLowering *TLI = ST.getTargetLowering(); 2159 if (!TLI->shouldUseLDSConstAddress(GV)) { 2160 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2161 return true; // Leave in place; 2162 } 2163 2164 B.buildConstant( 2165 DstReg, 2166 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2167 MI.eraseFromParent(); 2168 return true; 2169 } 2170 2171 const Function &Fn = MF.getFunction(); 2172 DiagnosticInfoUnsupported BadInit( 2173 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2174 Fn.getContext().diagnose(BadInit); 2175 return true; 2176 } 2177 2178 const SITargetLowering *TLI = ST.getTargetLowering(); 2179 2180 if (TLI->shouldEmitFixup(GV)) { 2181 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 if (TLI->shouldEmitPCReloc(GV)) { 2187 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2188 MI.eraseFromParent(); 2189 return true; 2190 } 2191 2192 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2193 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2194 2195 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2196 MachinePointerInfo::getGOT(MF), 2197 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2198 MachineMemOperand::MOInvariant, 2199 8 /*Size*/, Align(8)); 2200 2201 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2202 2203 if (Ty.getSizeInBits() == 32) { 2204 // Truncate if this is a 32-bit constant adrdess. 2205 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2206 B.buildExtract(DstReg, Load, 0); 2207 } else 2208 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2209 2210 MI.eraseFromParent(); 2211 return true; 2212 } 2213 2214 bool AMDGPULegalizerInfo::legalizeLoad( 2215 MachineInstr &MI, MachineRegisterInfo &MRI, 2216 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2217 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2218 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2219 Observer.changingInstr(MI); 2220 MI.getOperand(1).setReg(Cast.getReg(0)); 2221 Observer.changedInstr(MI); 2222 return true; 2223 } 2224 2225 bool AMDGPULegalizerInfo::legalizeFMad( 2226 MachineInstr &MI, MachineRegisterInfo &MRI, 2227 MachineIRBuilder &B) const { 2228 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2229 assert(Ty.isScalar()); 2230 2231 MachineFunction &MF = B.getMF(); 2232 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2233 2234 // TODO: Always legal with future ftz flag. 2235 // FIXME: Do we need just output? 
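  // Put differently: G_FMAD stays legal only when the relevant denormals are
  // flushed (f32 with FP32 denormals disabled, f16 with FP64/FP16 denormals
  // disabled); otherwise it is split into a separate multiply and add by
  // LegalizerHelper::lowerFMad below.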
2236 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2237 return true; 2238 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2239 return true; 2240 2241 MachineIRBuilder HelperBuilder(MI); 2242 GISelObserverWrapper DummyObserver; 2243 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2244 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2245 } 2246 2247 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2248 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2249 Register DstReg = MI.getOperand(0).getReg(); 2250 Register PtrReg = MI.getOperand(1).getReg(); 2251 Register CmpVal = MI.getOperand(2).getReg(); 2252 Register NewVal = MI.getOperand(3).getReg(); 2253 2254 assert(SITargetLowering::isFlatGlobalAddrSpace( 2255 MRI.getType(PtrReg).getAddressSpace()) && 2256 "this should not have been custom lowered"); 2257 2258 LLT ValTy = MRI.getType(CmpVal); 2259 LLT VecTy = LLT::vector(2, ValTy); 2260 2261 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2262 2263 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2264 .addDef(DstReg) 2265 .addUse(PtrReg) 2266 .addUse(PackedVal) 2267 .setMemRefs(MI.memoperands()); 2268 2269 MI.eraseFromParent(); 2270 return true; 2271 } 2272 2273 bool AMDGPULegalizerInfo::legalizeFlog( 2274 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2275 Register Dst = MI.getOperand(0).getReg(); 2276 Register Src = MI.getOperand(1).getReg(); 2277 LLT Ty = B.getMRI()->getType(Dst); 2278 unsigned Flags = MI.getFlags(); 2279 2280 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2281 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2282 2283 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2284 MI.eraseFromParent(); 2285 return true; 2286 } 2287 2288 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2289 MachineIRBuilder &B) const { 2290 Register Dst = MI.getOperand(0).getReg(); 2291 Register Src = MI.getOperand(1).getReg(); 2292 unsigned Flags = MI.getFlags(); 2293 LLT Ty = B.getMRI()->getType(Dst); 2294 2295 auto K = B.buildFConstant(Ty, numbers::log2e); 2296 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2297 B.buildFExp2(Dst, Mul, Flags); 2298 MI.eraseFromParent(); 2299 return true; 2300 } 2301 2302 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2303 MachineIRBuilder &B) const { 2304 Register Dst = MI.getOperand(0).getReg(); 2305 Register Src0 = MI.getOperand(1).getReg(); 2306 Register Src1 = MI.getOperand(2).getReg(); 2307 unsigned Flags = MI.getFlags(); 2308 LLT Ty = B.getMRI()->getType(Dst); 2309 const LLT S16 = LLT::scalar(16); 2310 const LLT S32 = LLT::scalar(32); 2311 2312 if (Ty == S32) { 2313 auto Log = B.buildFLog2(S32, Src0, Flags); 2314 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2315 .addUse(Log.getReg(0)) 2316 .addUse(Src1) 2317 .setMIFlags(Flags); 2318 B.buildFExp2(Dst, Mul, Flags); 2319 } else if (Ty == S16) { 2320 // There's no f16 fmul_legacy, so we need to convert for it. 
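    // Both branches use pow(x, y) = exp2(y * log2(x)); e.g. pow(8.0, 2.0)
    // becomes exp2(2 * 3) = 64. fmul_legacy (which returns 0 for 0 * anything)
    // only exists for f32, hence the fpext/fptrunc wrapping in this f16 path.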
2321 auto Log = B.buildFLog2(S16, Src0, Flags); 2322 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2323 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2324 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2325 .addUse(Ext0.getReg(0)) 2326 .addUse(Ext1.getReg(0)) 2327 .setMIFlags(Flags); 2328 2329 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2330 } else 2331 return false; 2332 2333 MI.eraseFromParent(); 2334 return true; 2335 } 2336 2337 // Find a source register, ignoring any possible source modifiers. 2338 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2339 Register ModSrc = OrigSrc; 2340 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2341 ModSrc = SrcFNeg->getOperand(1).getReg(); 2342 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2343 ModSrc = SrcFAbs->getOperand(1).getReg(); 2344 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2345 ModSrc = SrcFAbs->getOperand(1).getReg(); 2346 return ModSrc; 2347 } 2348 2349 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2350 MachineRegisterInfo &MRI, 2351 MachineIRBuilder &B) const { 2352 2353 const LLT S1 = LLT::scalar(1); 2354 const LLT S64 = LLT::scalar(64); 2355 Register Dst = MI.getOperand(0).getReg(); 2356 Register OrigSrc = MI.getOperand(1).getReg(); 2357 unsigned Flags = MI.getFlags(); 2358 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2359 "this should not have been custom lowered"); 2360 2361 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2362 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2363 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2364 // V_FRACT bug is: 2365 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2366 // 2367 // Convert floor(x) to (x - fract(x)) 2368 2369 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2370 .addUse(OrigSrc) 2371 .setMIFlags(Flags); 2372 2373 // Give source modifier matching some assistance before obscuring a foldable 2374 // pattern. 2375 2376 // TODO: We can avoid the neg on the fract? The input sign to fract 2377 // shouldn't matter? 2378 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2379 2380 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2381 2382 Register Min = MRI.createGenericVirtualRegister(S64); 2383 2384 // We don't need to concern ourselves with the snan handling difference, so 2385 // use the one which will directly select. 2386 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2387 if (MFI->getMode().IEEE) 2388 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2389 else 2390 B.buildFMinNum(Min, Fract, Const, Flags); 2391 2392 Register CorrectedFract = Min; 2393 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2394 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2395 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2396 } 2397 2398 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2399 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2400 2401 MI.eraseFromParent(); 2402 return true; 2403 } 2404 2405 // Turn an illegal packed v2s16 build vector into bit operations. 2406 // TODO: This should probably be a bitcast action in LegalizerHelper. 
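// Rough sketch of the rewrite performed below (MIR for illustration only):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)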
2407 bool AMDGPULegalizerInfo::legalizeBuildVector( 2408 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2409 Register Dst = MI.getOperand(0).getReg(); 2410 const LLT S32 = LLT::scalar(32); 2411 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2412 2413 Register Src0 = MI.getOperand(1).getReg(); 2414 Register Src1 = MI.getOperand(2).getReg(); 2415 assert(MRI.getType(Src0) == LLT::scalar(16)); 2416 2417 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2418 B.buildBitcast(Dst, Merge); 2419 2420 MI.eraseFromParent(); 2421 return true; 2422 } 2423 2424 // Return the use branch instruction, otherwise null if the usage is invalid. 2425 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2426 MachineRegisterInfo &MRI, 2427 MachineInstr *&Br, 2428 MachineBasicBlock *&UncondBrTarget) { 2429 Register CondDef = MI.getOperand(0).getReg(); 2430 if (!MRI.hasOneNonDBGUse(CondDef)) 2431 return nullptr; 2432 2433 MachineBasicBlock *Parent = MI.getParent(); 2434 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2435 if (UseMI.getParent() != Parent || 2436 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2437 return nullptr; 2438 2439 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2440 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2441 if (Next == Parent->end()) { 2442 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2443 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2444 return nullptr; 2445 UncondBrTarget = &*NextMBB; 2446 } else { 2447 if (Next->getOpcode() != AMDGPU::G_BR) 2448 return nullptr; 2449 Br = &*Next; 2450 UncondBrTarget = Br->getOperand(0).getMBB(); 2451 } 2452 2453 return &UseMI; 2454 } 2455 2456 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2457 MachineRegisterInfo &MRI, 2458 Register LiveIn, 2459 Register PhyReg) const { 2460 assert(PhyReg.isPhysical() && "Physical register expected"); 2461 2462 // Insert the live-in copy, if required, by defining destination virtual 2463 // register. 2464 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
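  // Net effect: the first query for a given preloaded argument creates one
  // COPY from the physical register at the top of the entry block (e.g. from
  // the SGPR pair carrying the queue pointer), and later queries reuse that
  // virtual register instead of emitting additional copies.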
2465 if (!MRI.getVRegDef(LiveIn)) { 2466 // FIXME: Should have scoped insert pt 2467 MachineBasicBlock &OrigInsBB = B.getMBB(); 2468 auto OrigInsPt = B.getInsertPt(); 2469 2470 MachineBasicBlock &EntryMBB = B.getMF().front(); 2471 EntryMBB.addLiveIn(PhyReg); 2472 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2473 B.buildCopy(LiveIn, PhyReg); 2474 2475 B.setInsertPt(OrigInsBB, OrigInsPt); 2476 } 2477 2478 return LiveIn; 2479 } 2480 2481 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2482 MachineRegisterInfo &MRI, 2483 Register PhyReg, LLT Ty, 2484 bool InsertLiveInCopy) const { 2485 assert(PhyReg.isPhysical() && "Physical register expected"); 2486 2487 // Get or create the virtual live-in register. 2488 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2489 if (!LiveIn) { 2490 LiveIn = MRI.createGenericVirtualRegister(Ty); 2491 MRI.addLiveIn(PhyReg, LiveIn); 2492 } 2493 2494 // When the actual copy required is from a virtual register to a physical 2495 // register (to be inserted later), inserting the live-in copy from the 2496 // physical register to the virtual register is not required. 2497 if (!InsertLiveInCopy) 2498 return LiveIn; 2499 2500 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2501 } 2502 2503 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2504 const ArgDescriptor *Arg, 2505 const TargetRegisterClass *ArgRC, 2506 LLT ArgTy) const { 2507 MCRegister SrcReg = Arg->getRegister(); 2508 assert(SrcReg.isPhysical() && "Physical register expected"); 2509 assert(DstReg.isVirtual() && "Virtual register expected"); 2510 2511 MachineRegisterInfo &MRI = *B.getMRI(); 2512 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy); 2513 2514 if (Arg->isMasked()) { 2515 // TODO: Should we try to emit this once in the entry block?
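    // Illustrative example (mask chosen for exposition): for an argument
    // packed into bits [10, 20) of the register, Mask is 0xffc00 and Shift is
    // 10, so the code below computes (LiveIn >> 10) & 0x3ff.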
2516 const LLT S32 = LLT::scalar(32); 2517 const unsigned Mask = Arg->getMask(); 2518 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2519 2520 Register AndMaskSrc = LiveIn; 2521 2522 if (Shift != 0) { 2523 auto ShiftAmt = B.buildConstant(S32, Shift); 2524 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2525 } 2526 2527 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2528 } else { 2529 B.buildCopy(DstReg, LiveIn); 2530 } 2531 2532 return true; 2533 } 2534 2535 bool AMDGPULegalizerInfo::loadInputValue( 2536 Register DstReg, MachineIRBuilder &B, 2537 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2538 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2539 const ArgDescriptor *Arg; 2540 const TargetRegisterClass *ArgRC; 2541 LLT ArgTy; 2542 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2543 2544 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2545 return false; // TODO: Handle these 2546 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2547 } 2548 2549 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2550 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2551 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2552 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2553 return false; 2554 2555 MI.eraseFromParent(); 2556 return true; 2557 } 2558 2559 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2560 MachineRegisterInfo &MRI, 2561 MachineIRBuilder &B) const { 2562 Register Dst = MI.getOperand(0).getReg(); 2563 LLT DstTy = MRI.getType(Dst); 2564 LLT S16 = LLT::scalar(16); 2565 LLT S32 = LLT::scalar(32); 2566 LLT S64 = LLT::scalar(64); 2567 2568 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2569 return true; 2570 2571 if (DstTy == S16) 2572 return legalizeFDIV16(MI, MRI, B); 2573 if (DstTy == S32) 2574 return legalizeFDIV32(MI, MRI, B); 2575 if (DstTy == S64) 2576 return legalizeFDIV64(MI, MRI, B); 2577 2578 return false; 2579 } 2580 2581 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2582 Register DstReg, 2583 Register X, 2584 Register Y, 2585 bool IsDiv) const { 2586 const LLT S1 = LLT::scalar(1); 2587 const LLT S32 = LLT::scalar(32); 2588 2589 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2590 // algorithm used here. 2591 2592 // Initial estimate of inv(y). 2593 auto FloatY = B.buildUITOFP(S32, Y); 2594 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2595 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2596 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2597 auto Z = B.buildFPTOUI(S32, ScaledY); 2598 2599 // One round of UNR. 2600 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2601 auto NegYZ = B.buildMul(S32, NegY, Z); 2602 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2603 2604 // Quotient/remainder estimate. 2605 auto Q = B.buildUMulH(S32, X, Z); 2606 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2607 2608 // First quotient/remainder refinement. 2609 auto One = B.buildConstant(S32, 1); 2610 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2611 if (IsDiv) 2612 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2613 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2614 2615 // Second quotient/remainder refinement. 
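  // After the single Newton-Raphson style refinement of Z above, the quotient
  // estimate Q should be no more than a couple of units below the exact
  // quotient, so two conditional "Q += 1; R -= Y" corrections are enough to
  // reach the exact result (see AMDGPUCodeGenPrepare::expandDivRem32).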
2616 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2617 if (IsDiv) 2618 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2619 else 2620 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2621 } 2622 2623 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2624 MachineRegisterInfo &MRI, 2625 MachineIRBuilder &B) const { 2626 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2627 Register DstReg = MI.getOperand(0).getReg(); 2628 Register Num = MI.getOperand(1).getReg(); 2629 Register Den = MI.getOperand(2).getReg(); 2630 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 2635 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2636 // 2637 // Return lo, hi of result 2638 // 2639 // %cvt.lo = G_UITOFP Val.lo 2640 // %cvt.hi = G_UITOFP Val.hi 2641 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2642 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2643 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2644 // %mul2 = G_FMUL %mul1, 2**(-32) 2645 // %trunc = G_INTRINSIC_TRUNC %mul2 2646 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2647 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2648 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2649 Register Val) { 2650 const LLT S32 = LLT::scalar(32); 2651 auto Unmerge = B.buildUnmerge(S32, Val); 2652 2653 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2654 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2655 2656 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2657 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2658 2659 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2660 auto Mul1 = 2661 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2662 2663 // 2**(-32) 2664 auto Mul2 = 2665 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2666 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2667 2668 // -(2**32) 2669 auto Mad2 = B.buildFMAD(S32, Trunc, 2670 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2671 2672 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2673 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2674 2675 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2676 } 2677 2678 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2679 Register DstReg, 2680 Register Numer, 2681 Register Denom, 2682 bool IsDiv) const { 2683 const LLT S32 = LLT::scalar(32); 2684 const LLT S64 = LLT::scalar(64); 2685 const LLT S1 = LLT::scalar(1); 2686 Register RcpLo, RcpHi; 2687 2688 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2689 2690 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2691 2692 auto Zero64 = B.buildConstant(S64, 0); 2693 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2694 2695 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2696 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2697 2698 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2699 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2700 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2701 2702 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2703 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2704 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2705 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2706 2707 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2708 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2709 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2710 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2711 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2712 2713 auto Zero32 = 
B.buildConstant(S32, 0); 2714 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2715 auto Add2_HiC = 2716 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2717 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2718 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2719 2720 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2721 Register NumerLo = UnmergeNumer.getReg(0); 2722 Register NumerHi = UnmergeNumer.getReg(1); 2723 2724 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2725 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2726 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2727 Register Mul3_Lo = UnmergeMul3.getReg(0); 2728 Register Mul3_Hi = UnmergeMul3.getReg(1); 2729 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2730 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2731 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2732 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2733 2734 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2735 Register DenomLo = UnmergeDenom.getReg(0); 2736 Register DenomHi = UnmergeDenom.getReg(1); 2737 2738 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2739 auto C1 = B.buildSExt(S32, CmpHi); 2740 2741 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2742 auto C2 = B.buildSExt(S32, CmpLo); 2743 2744 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2745 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2746 2747 // TODO: Here and below portions of the code can be enclosed into if/endif. 2748 // Currently control flow is unconditional and we have 4 selects after 2749 // potential endif to substitute PHIs. 2750 2751 // if C3 != 0 ... 2752 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2753 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2754 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2755 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2756 2757 auto One64 = B.buildConstant(S64, 1); 2758 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2759 2760 auto C4 = 2761 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2762 auto C5 = 2763 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2764 auto C6 = B.buildSelect( 2765 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2766 2767 // if (C6 != 0) 2768 auto Add4 = B.buildAdd(S64, Add3, One64); 2769 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2770 2771 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2772 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2773 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2774 2775 // endif C6 2776 // endif C3 2777 2778 if (IsDiv) { 2779 auto Sel1 = B.buildSelect( 2780 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2781 B.buildSelect(DstReg, 2782 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2783 } else { 2784 auto Sel2 = B.buildSelect( 2785 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2786 B.buildSelect(DstReg, 2787 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2788 } 2789 } 2790 2791 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2792 MachineRegisterInfo &MRI, 2793 MachineIRBuilder &B) const { 2794 const LLT S64 = LLT::scalar(64); 2795 const LLT S32 = LLT::scalar(32); 2796 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2797 Register DstReg = MI.getOperand(0).getReg(); 2798 Register Num 
= MI.getOperand(1).getReg(); 2799 Register Den = MI.getOperand(2).getReg(); 2800 LLT Ty = MRI.getType(DstReg); 2801 2802 if (Ty == S32) 2803 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2804 else if (Ty == S64) 2805 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2806 else 2807 return false; 2808 2809 MI.eraseFromParent(); 2810 return true; 2811 2812 } 2813 2814 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2815 MachineRegisterInfo &MRI, 2816 MachineIRBuilder &B) const { 2817 const LLT S64 = LLT::scalar(64); 2818 const LLT S32 = LLT::scalar(32); 2819 2820 Register DstReg = MI.getOperand(0).getReg(); 2821 const LLT Ty = MRI.getType(DstReg); 2822 if (Ty != S32 && Ty != S64) 2823 return false; 2824 2825 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2826 2827 Register LHS = MI.getOperand(1).getReg(); 2828 Register RHS = MI.getOperand(2).getReg(); 2829 2830 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2831 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2832 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2833 2834 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2835 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2836 2837 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2838 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2839 2840 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2841 if (Ty == S32) 2842 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2843 else 2844 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2845 2846 Register Sign; 2847 if (IsDiv) 2848 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2849 else 2850 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2851 2852 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2853 B.buildSub(DstReg, UDivRem, Sign); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2860 MachineRegisterInfo &MRI, 2861 MachineIRBuilder &B) const { 2862 Register Res = MI.getOperand(0).getReg(); 2863 Register LHS = MI.getOperand(1).getReg(); 2864 Register RHS = MI.getOperand(2).getReg(); 2865 2866 uint16_t Flags = MI.getFlags(); 2867 2868 LLT ResTy = MRI.getType(Res); 2869 LLT S32 = LLT::scalar(32); 2870 LLT S64 = LLT::scalar(64); 2871 2872 const MachineFunction &MF = B.getMF(); 2873 bool Unsafe = 2874 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2875 2876 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2877 return false; 2878 2879 if (!Unsafe && ResTy == S32 && 2880 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2881 return false; 2882 2883 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2884 // 1 / x -> RCP(x) 2885 if (CLHS->isExactlyValue(1.0)) { 2886 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2887 .addUse(RHS) 2888 .setMIFlags(Flags); 2889 2890 MI.eraseFromParent(); 2891 return true; 2892 } 2893 2894 // -1 / x -> RCP( FNEG(x) ) 2895 if (CLHS->isExactlyValue(-1.0)) { 2896 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2897 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2898 .addUse(FNeg.getReg(0)) 2899 .setMIFlags(Flags); 2900 2901 MI.eraseFromParent(); 2902 return true; 2903 } 2904 } 2905 2906 // x / y -> x * (1.0 / y) 2907 if (Unsafe) { 2908 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2909 .addUse(RHS) 2910 .setMIFlags(Flags); 2911 B.buildFMul(Res, LHS, RCP, Flags); 2912 2913 MI.eraseFromParent(); 2914 return true; 2915 } 2916 2917 return false; 2918 } 2919 2920 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2921 MachineRegisterInfo &MRI, 2922 MachineIRBuilder &B) const { 2923 Register Res = MI.getOperand(0).getReg(); 2924 Register LHS = MI.getOperand(1).getReg(); 2925 Register RHS = MI.getOperand(2).getReg(); 2926 2927 uint16_t Flags = MI.getFlags(); 2928 2929 LLT S16 = LLT::scalar(16); 2930 LLT S32 = LLT::scalar(32); 2931 2932 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2933 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2934 2935 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2936 .addUse(RHSExt.getReg(0)) 2937 .setMIFlags(Flags); 2938 2939 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2940 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2941 2942 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2943 .addUse(RDst.getReg(0)) 2944 .addUse(RHS) 2945 .addUse(LHS) 2946 .setMIFlags(Flags); 2947 2948 MI.eraseFromParent(); 2949 return true; 2950 } 2951 2952 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2953 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2954 static void toggleSPDenormMode(bool Enable, 2955 MachineIRBuilder &B, 2956 const GCNSubtarget &ST, 2957 AMDGPU::SIModeRegisterDefaults Mode) { 2958 // Set SP denorm mode to this value. 2959 unsigned SPDenormMode = 2960 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2961 2962 if (ST.hasDenormModeInst()) { 2963 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2964 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2965 2966 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2967 B.buildInstr(AMDGPU::S_DENORM_MODE) 2968 .addImm(NewDenormModeValue); 2969 2970 } else { 2971 // Select FP32 bit field in mode register. 2972 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2973 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2974 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2975 2976 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2977 .addImm(SPDenormMode) 2978 .addImm(SPDenormModeBitField); 2979 } 2980 } 2981 2982 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2983 MachineRegisterInfo &MRI, 2984 MachineIRBuilder &B) const { 2985 Register Res = MI.getOperand(0).getReg(); 2986 Register LHS = MI.getOperand(1).getReg(); 2987 Register RHS = MI.getOperand(2).getReg(); 2988 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2989 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2990 2991 uint16_t Flags = MI.getFlags(); 2992 2993 LLT S32 = LLT::scalar(32); 2994 LLT S1 = LLT::scalar(1); 2995 2996 auto One = B.buildFConstant(S32, 1.0f); 2997 2998 auto DenominatorScaled = 2999 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3000 .addUse(LHS) 3001 .addUse(RHS) 3002 .addImm(0) 3003 .setMIFlags(Flags); 3004 auto NumeratorScaled = 3005 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3006 .addUse(LHS) 3007 .addUse(RHS) 3008 .addImm(1) 3009 .setMIFlags(Flags); 3010 3011 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3012 .addUse(DenominatorScaled.getReg(0)) 3013 .setMIFlags(Flags); 3014 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3015 3016 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3017 // aren't modeled as reading it. 
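  // The refinement below relies on FP32 denormal results not being flushed to
  // stay accurate, so when the function's default mode flushes them the mode
  // is temporarily switched on around the FMA chain and restored before
  // div_fmas/div_fixup produce the final quotient.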
3018 if (!Mode.allFP32Denormals()) 3019 toggleSPDenormMode(true, B, ST, Mode); 3020 3021 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3022 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3023 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3024 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3025 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3026 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3027 3028 if (!Mode.allFP32Denormals()) 3029 toggleSPDenormMode(false, B, ST, Mode); 3030 3031 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3032 .addUse(Fma4.getReg(0)) 3033 .addUse(Fma1.getReg(0)) 3034 .addUse(Fma3.getReg(0)) 3035 .addUse(NumeratorScaled.getReg(1)) 3036 .setMIFlags(Flags); 3037 3038 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3039 .addUse(Fmas.getReg(0)) 3040 .addUse(RHS) 3041 .addUse(LHS) 3042 .setMIFlags(Flags); 3043 3044 MI.eraseFromParent(); 3045 return true; 3046 } 3047 3048 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3049 MachineRegisterInfo &MRI, 3050 MachineIRBuilder &B) const { 3051 Register Res = MI.getOperand(0).getReg(); 3052 Register LHS = MI.getOperand(1).getReg(); 3053 Register RHS = MI.getOperand(2).getReg(); 3054 3055 uint16_t Flags = MI.getFlags(); 3056 3057 LLT S64 = LLT::scalar(64); 3058 LLT S1 = LLT::scalar(1); 3059 3060 auto One = B.buildFConstant(S64, 1.0); 3061 3062 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3063 .addUse(LHS) 3064 .addUse(RHS) 3065 .addImm(0) 3066 .setMIFlags(Flags); 3067 3068 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3069 3070 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3071 .addUse(DivScale0.getReg(0)) 3072 .setMIFlags(Flags); 3073 3074 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3075 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3076 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3077 3078 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3079 .addUse(LHS) 3080 .addUse(RHS) 3081 .addImm(1) 3082 .setMIFlags(Flags); 3083 3084 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3085 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3086 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3087 3088 Register Scale; 3089 if (!ST.hasUsableDivScaleConditionOutput()) { 3090 // Workaround a hardware bug on SI where the condition output from div_scale 3091 // is not usable. 
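    // The replacement below compares the high 32 bits of the numerator and
    // denominator against the high halves of the two div_scale results to
    // work out which operand was actually scaled, and xors the two
    // comparisons to recover the condition that div_fmas expects.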
3092 3093 LLT S32 = LLT::scalar(32); 3094 3095 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3096 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3097 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3098 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3099 3100 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3101 Scale1Unmerge.getReg(1)); 3102 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3103 Scale0Unmerge.getReg(1)); 3104 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3105 } else { 3106 Scale = DivScale1.getReg(1); 3107 } 3108 3109 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3110 .addUse(Fma4.getReg(0)) 3111 .addUse(Fma3.getReg(0)) 3112 .addUse(Mul.getReg(0)) 3113 .addUse(Scale) 3114 .setMIFlags(Flags); 3115 3116 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3117 .addUse(Fmas.getReg(0)) 3118 .addUse(RHS) 3119 .addUse(LHS) 3120 .setMIFlags(Flags); 3121 3122 MI.eraseFromParent(); 3123 return true; 3124 } 3125 3126 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3127 MachineRegisterInfo &MRI, 3128 MachineIRBuilder &B) const { 3129 Register Res = MI.getOperand(0).getReg(); 3130 Register LHS = MI.getOperand(2).getReg(); 3131 Register RHS = MI.getOperand(3).getReg(); 3132 uint16_t Flags = MI.getFlags(); 3133 3134 LLT S32 = LLT::scalar(32); 3135 LLT S1 = LLT::scalar(1); 3136 3137 auto Abs = B.buildFAbs(S32, RHS, Flags); 3138 const APFloat C0Val(1.0f); 3139 3140 auto C0 = B.buildConstant(S32, 0x6f800000); 3141 auto C1 = B.buildConstant(S32, 0x2f800000); 3142 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3143 3144 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3145 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3146 3147 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3148 3149 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3150 .addUse(Mul0.getReg(0)) 3151 .setMIFlags(Flags); 3152 3153 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3154 3155 B.buildFMul(Res, Sel, Mul1, Flags); 3156 3157 MI.eraseFromParent(); 3158 return true; 3159 } 3160 3161 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3162 MachineRegisterInfo &MRI, 3163 MachineIRBuilder &B) const { 3164 uint64_t Offset = 3165 ST.getTargetLowering()->getImplicitParameterOffset( 3166 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3167 LLT DstTy = MRI.getType(DstReg); 3168 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3169 3170 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3171 if (!loadInputValue(KernargPtrReg, B, 3172 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3173 return false; 3174 3175 // FIXME: This should be nuw 3176 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3177 return true; 3178 } 3179 3180 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3181 MachineRegisterInfo &MRI, 3182 MachineIRBuilder &B) const { 3183 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3184 if (!MFI->isEntryFunction()) { 3185 return legalizePreloadedArgIntrin(MI, MRI, B, 3186 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3187 } 3188 3189 Register DstReg = MI.getOperand(0).getReg(); 3190 if (!getImplicitArgPtr(DstReg, MRI, B)) 3191 return false; 3192 3193 MI.eraseFromParent(); 3194 return true; 3195 } 3196 3197 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3198 MachineRegisterInfo &MRI, 3199 MachineIRBuilder &B, 3200 unsigned AddrSpace) const { 3201 
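  // A flat pointer lands in the LDS or scratch segment exactly when its high
  // 32 bits equal that segment's aperture base, so is.shared/is.private is
  // lowered to "extract bits [32, 64) of the pointer and compare against the
  // aperture register" below.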
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3202 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3203 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3209 // offset (the offset that is included in bounds checking and swizzling, to be 3210 // split between the instruction's voffset and immoffset fields) and soffset 3211 // (the offset that is excluded from bounds checking and swizzling, to go in 3212 // the instruction's soffset field). This function takes the first kind of 3213 // offset and figures out how to split it between voffset and immoffset. 3214 std::tuple<Register, unsigned, unsigned> 3215 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3216 Register OrigOffset) const { 3217 const unsigned MaxImm = 4095; 3218 Register BaseReg; 3219 unsigned TotalConstOffset; 3220 MachineInstr *OffsetDef; 3221 const LLT S32 = LLT::scalar(32); 3222 3223 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3224 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3225 3226 unsigned ImmOffset = TotalConstOffset; 3227 3228 // If the immediate value is too big for the immoffset field, put the value 3229 // and -4096 into the immoffset field so that the value that is copied/added 3230 // for the voffset field is a multiple of 4096, and it stands more chance 3231 // of being CSEd with the copy/add for another similar load/store. 3232 // However, do not do that rounding down to a multiple of 4096 if that is a 3233 // negative number, as it appears to be illegal to have a negative offset 3234 // in the vgpr, even if adding the immediate offset makes it positive. 3235 unsigned Overflow = ImmOffset & ~MaxImm; 3236 ImmOffset -= Overflow; 3237 if ((int32_t)Overflow < 0) { 3238 Overflow += ImmOffset; 3239 ImmOffset = 0; 3240 } 3241 3242 if (Overflow != 0) { 3243 if (!BaseReg) { 3244 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3245 } else { 3246 auto OverflowVal = B.buildConstant(S32, Overflow); 3247 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3248 } 3249 } 3250 3251 if (!BaseReg) 3252 BaseReg = B.buildConstant(S32, 0).getReg(0); 3253 3254 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3255 } 3256 3257 /// Handle register layout difference for f16 images for some subtargets. 3258 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3259 MachineRegisterInfo &MRI, 3260 Register Reg) const { 3261 if (!ST.hasUnpackedD16VMem()) 3262 return Reg; 3263 3264 const LLT S16 = LLT::scalar(16); 3265 const LLT S32 = LLT::scalar(32); 3266 LLT StoreVT = MRI.getType(Reg); 3267 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3268 3269 auto Unmerge = B.buildUnmerge(S16, Reg); 3270 3271 SmallVector<Register, 4> WideRegs; 3272 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3273 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3274 3275 int NumElts = StoreVT.getNumElements(); 3276 3277 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3278 } 3279 3280 Register AMDGPULegalizerInfo::fixStoreSourceType( 3281 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3282 MachineRegisterInfo *MRI = B.getMRI(); 3283 LLT Ty = MRI->getType(VData); 3284 3285 const LLT S16 = LLT::scalar(16); 3286 3287 // Fixup illegal register types for i8 stores. 
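  // e.g. an s8 or s16 buffer store source is widened here with G_ANYEXT to
  // s32; the byte/short buffer opcode chosen later still writes only the
  // original memory size.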
3288 if (Ty == LLT::scalar(8) || Ty == S16) { 3289 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3290 return AnyExt; 3291 } 3292 3293 if (Ty.isVector()) { 3294 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3295 if (IsFormat) 3296 return handleD16VData(B, *MRI, VData); 3297 } 3298 } 3299 3300 return VData; 3301 } 3302 3303 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3304 MachineRegisterInfo &MRI, 3305 MachineIRBuilder &B, 3306 bool IsTyped, 3307 bool IsFormat) const { 3308 Register VData = MI.getOperand(1).getReg(); 3309 LLT Ty = MRI.getType(VData); 3310 LLT EltTy = Ty.getScalarType(); 3311 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3312 const LLT S32 = LLT::scalar(32); 3313 3314 VData = fixStoreSourceType(B, VData, IsFormat); 3315 Register RSrc = MI.getOperand(2).getReg(); 3316 3317 MachineMemOperand *MMO = *MI.memoperands_begin(); 3318 const int MemSize = MMO->getSize(); 3319 3320 unsigned ImmOffset; 3321 unsigned TotalOffset; 3322 3323 // The typed intrinsics add an immediate after the registers. 3324 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3325 3326 // The struct intrinsic variants add one additional operand over raw. 3327 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3328 Register VIndex; 3329 int OpOffset = 0; 3330 if (HasVIndex) { 3331 VIndex = MI.getOperand(3).getReg(); 3332 OpOffset = 1; 3333 } 3334 3335 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3336 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3337 3338 unsigned Format = 0; 3339 if (IsTyped) { 3340 Format = MI.getOperand(5 + OpOffset).getImm(); 3341 ++OpOffset; 3342 } 3343 3344 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3345 3346 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3347 if (TotalOffset != 0) 3348 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3349 3350 unsigned Opc; 3351 if (IsTyped) { 3352 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3353 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3354 } else if (IsFormat) { 3355 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3356 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3357 } else { 3358 switch (MemSize) { 3359 case 1: 3360 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3361 break; 3362 case 2: 3363 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3364 break; 3365 default: 3366 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3367 break; 3368 } 3369 } 3370 3371 if (!VIndex) 3372 VIndex = B.buildConstant(S32, 0).getReg(0); 3373 3374 auto MIB = B.buildInstr(Opc) 3375 .addUse(VData) // vdata 3376 .addUse(RSrc) // rsrc 3377 .addUse(VIndex) // vindex 3378 .addUse(VOffset) // voffset 3379 .addUse(SOffset) // soffset 3380 .addImm(ImmOffset); // offset(imm) 3381 3382 if (IsTyped) 3383 MIB.addImm(Format); 3384 3385 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3386 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3387 .addMemOperand(MMO); 3388 3389 MI.eraseFromParent(); 3390 return true; 3391 } 3392 3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3394 MachineRegisterInfo &MRI, 3395 MachineIRBuilder &B, 3396 bool IsFormat, 3397 bool IsTyped) const { 3398 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3399 MachineMemOperand *MMO = *MI.memoperands_begin();
3400 const int MemSize = MMO->getSize();
3401 const LLT S32 = LLT::scalar(32);
3402
3403 Register Dst = MI.getOperand(0).getReg();
3404 Register RSrc = MI.getOperand(2).getReg();
3405
3406 // The typed intrinsics add an immediate after the registers.
3407 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3408
3409 // The struct intrinsic variants add one additional operand over raw.
3410 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3411 Register VIndex;
3412 int OpOffset = 0;
3413 if (HasVIndex) {
3414 VIndex = MI.getOperand(3).getReg();
3415 OpOffset = 1;
3416 }
3417
3418 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3419 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3420
3421 unsigned Format = 0;
3422 if (IsTyped) {
3423 Format = MI.getOperand(5 + OpOffset).getImm();
3424 ++OpOffset;
3425 }
3426
3427 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3428 unsigned ImmOffset;
3429 unsigned TotalOffset;
3430
3431 LLT Ty = MRI.getType(Dst);
3432 LLT EltTy = Ty.getScalarType();
3433 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3434 const bool Unpacked = ST.hasUnpackedD16VMem();
3435
3436 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3437 if (TotalOffset != 0)
3438 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3439
3440 unsigned Opc;
3441
3442 if (IsTyped) {
3443 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3444 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3445 } else if (IsFormat) {
3446 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3447 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3448 } else {
3449 switch (MemSize) {
3450 case 1:
3451 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3452 break;
3453 case 2:
3454 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3455 break;
3456 default:
3457 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3458 break;
3459 }
3460 }
3461
3462 Register LoadDstReg;
3463
3464 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3465 LLT UnpackedTy = Ty.changeElementSize(32);
3466
3467 if (IsExtLoad)
3468 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3469 else if (Unpacked && IsD16 && Ty.isVector())
3470 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3471 else
3472 LoadDstReg = Dst;
3473
3474 if (!VIndex)
3475 VIndex = B.buildConstant(S32, 0).getReg(0);
3476
3477 auto MIB = B.buildInstr(Opc)
3478 .addDef(LoadDstReg) // vdata
3479 .addUse(RSrc) // rsrc
3480 .addUse(VIndex) // vindex
3481 .addUse(VOffset) // voffset
3482 .addUse(SOffset) // soffset
3483 .addImm(ImmOffset); // offset(imm)
3484
3485 if (IsTyped)
3486 MIB.addImm(Format);
3487
3488 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3489 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3490 .addMemOperand(MMO);
3491
3492 if (LoadDstReg != Dst) {
3493 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3494
3495 // The result was widened; truncate or repack it back to the original type.
3496 if (IsExtLoad) 3497 B.buildTrunc(Dst, LoadDstReg); 3498 else { 3499 // Repack to original 16-bit vector result 3500 // FIXME: G_TRUNC should work, but legalization currently fails 3501 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3502 SmallVector<Register, 4> Repack; 3503 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3504 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3505 B.buildMerge(Dst, Repack); 3506 } 3507 } 3508 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3514 MachineIRBuilder &B, 3515 bool IsInc) const { 3516 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3517 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3518 B.buildInstr(Opc) 3519 .addDef(MI.getOperand(0).getReg()) 3520 .addUse(MI.getOperand(2).getReg()) 3521 .addUse(MI.getOperand(3).getReg()) 3522 .cloneMemRefs(MI); 3523 MI.eraseFromParent(); 3524 return true; 3525 } 3526 3527 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3528 switch (IntrID) { 3529 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3530 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3531 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3532 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3533 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3534 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3535 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3536 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3537 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3538 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3539 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3540 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3541 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3542 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3544 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3545 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3546 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3547 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3548 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3550 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3551 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3552 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3553 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3554 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3556 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3557 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3558 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3559 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3560 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3561 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3562 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3563 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3565 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3566 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3567 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3568 default: 3569 llvm_unreachable("unhandled atomic opcode"); 3570 } 3571 } 3572 3573 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3574 MachineIRBuilder &B, 3575 Intrinsic::ID IID) const { 3576 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3577 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3578 3579 Register Dst = MI.getOperand(0).getReg(); 3580 Register VData = MI.getOperand(2).getReg(); 3581 3582 Register CmpVal; 
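// Source operand layout handled below, after the intrinsic ID (illustrative):
// vdata, [cmp for cmpswap], rsrc, [vindex for the struct variants], voffset,
// soffset, cachepolicy.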
3583 int OpOffset = 0; 3584 3585 if (IsCmpSwap) { 3586 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3587 ++OpOffset; 3588 } 3589 3590 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3591 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3592 3593 // The struct intrinsic variants add one additional operand over raw. 3594 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3595 Register VIndex; 3596 if (HasVIndex) { 3597 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3598 ++OpOffset; 3599 } 3600 3601 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3602 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3603 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3604 3605 MachineMemOperand *MMO = *MI.memoperands_begin(); 3606 3607 unsigned ImmOffset; 3608 unsigned TotalOffset; 3609 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3610 if (TotalOffset != 0) 3611 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3612 3613 if (!VIndex) 3614 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3615 3616 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3617 .addDef(Dst) 3618 .addUse(VData); // vdata 3619 3620 if (IsCmpSwap) 3621 MIB.addReg(CmpVal); 3622 3623 MIB.addUse(RSrc) // rsrc 3624 .addUse(VIndex) // vindex 3625 .addUse(VOffset) // voffset 3626 .addUse(SOffset) // soffset 3627 .addImm(ImmOffset) // offset(imm) 3628 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3629 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3630 .addMemOperand(MMO); 3631 3632 MI.eraseFromParent(); 3633 return true; 3634 } 3635 3636 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3637 /// vector with s16 typed elements. 3638 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3639 SmallVectorImpl<Register> &PackedAddrs, 3640 int AddrIdx, int DimIdx, int EndIdx, 3641 int NumGradients) { 3642 const LLT S16 = LLT::scalar(16); 3643 const LLT V2S16 = LLT::vector(2, 16); 3644 3645 for (int I = AddrIdx; I < EndIdx; ++I) { 3646 MachineOperand &SrcOp = MI.getOperand(I); 3647 if (!SrcOp.isReg()) 3648 continue; // _L to _LZ may have eliminated this. 3649 3650 Register AddrReg = SrcOp.getReg(); 3651 3652 if (I < DimIdx) { 3653 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3654 PackedAddrs.push_back(AddrReg); 3655 } else { 3656 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3657 // derivatives dx/dh and dx/dv are packed with undef. 3658 if (((I + 1) >= EndIdx) || 3659 ((NumGradients / 2) % 2 == 1 && 3660 (I == DimIdx + (NumGradients / 2) - 1 || 3661 I == DimIdx + NumGradients - 1)) || 3662 // Check for _L to _LZ optimization 3663 !MI.getOperand(I + 1).isReg()) { 3664 PackedAddrs.push_back( 3665 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3666 .getReg(0)); 3667 } else { 3668 PackedAddrs.push_back( 3669 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3670 .getReg(0)); 3671 ++I; 3672 } 3673 } 3674 } 3675 } 3676 3677 /// Convert from separate vaddr components to a single vector address register, 3678 /// and replace the remaining operands with $noreg. 
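/// For example (illustrative), a three-component address goes from three s32
/// vaddr operands to a single <3 x s32> G_BUILD_VECTOR operand plus two
/// $noreg operands; five to seven live components are padded with undef up to
/// eight, since the intermediate register sizes are not available.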
3679 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3680 int DimIdx, int NumVAddrs) {
3681 const LLT S32 = LLT::scalar(32);
3682
3683 SmallVector<Register, 8> AddrRegs;
3684 for (int I = 0; I != NumVAddrs; ++I) {
3685 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3686 if (SrcOp.isReg()) {
3687 AddrRegs.push_back(SrcOp.getReg());
3688 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3689 }
3690 }
3691
3692 int NumAddrRegs = AddrRegs.size();
3693 if (NumAddrRegs != 1) {
3694 // Round up to 8 elements for v5-v7
3695 // FIXME: Missing intermediate sized register classes and instructions.
3696 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3697 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3698 auto Undef = B.buildUndef(S32);
3699 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3700 NumAddrRegs = RoundedNumRegs;
3701 }
3702
3703 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3704 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3705 }
3706
3707 for (int I = 1; I != NumVAddrs; ++I) {
3708 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3709 if (SrcOp.isReg())
3710 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3711 }
3712 }
3713
3714 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3715 ///
3716 /// Depending on the subtarget, loads and stores with 16-bit element data need
3717 /// to be rewritten to use the low half of 32-bit registers, or directly use a
3718 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
3719 /// registers.
3720 ///
3721 /// We don't want to directly select image instructions just yet, but we also
3722 /// want to expose all register repacking to the legalizer/combiners. We also
3723 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3724 /// defining a multitude of intermediate image instructions, directly hack on
3725 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3726 /// padding the now unnecessary arguments with $noreg.
3727 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3728 MachineInstr &MI, MachineIRBuilder &B,
3729 GISelChangeObserver &Observer,
3730 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3731
3732 const int NumDefs = MI.getNumExplicitDefs();
3733 bool IsTFE = NumDefs == 2;
3734 // We are only processing the operands of d16 image operations on subtargets
3735 // that use the unpacked register layout, or need to repack the TFE result.
3736
3737 // TODO: Do we need to guard against already legalized intrinsics?
3738 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3739 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3740
3741 MachineRegisterInfo *MRI = B.getMRI();
3742 const LLT S32 = LLT::scalar(32);
3743 const LLT S16 = LLT::scalar(16);
3744 const LLT V2S16 = LLT::vector(2, 16);
3745
3746 // Index of the first address argument.
3747 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3748
3749 int NumVAddrs, NumGradients;
3750 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3751 const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3752 getDMaskIdx(BaseOpcode, NumDefs);
3753 unsigned DMask = 0;
3754
3755 // Check for 16-bit addresses and pack them if so.
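// The gradient operands (if any) start at DimIdx and the coordinates follow
// at DimIdx + NumGradients, so inspecting the first operand of each group is
// enough to tell whether gradients (G16) and/or addresses (A16) are 16 bit.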
3756 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3757 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3758 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3759 const bool IsG16 = GradTy == S16;
3760 const bool IsA16 = AddrTy == S16;
3761
3762 int DMaskLanes = 0;
3763 if (!BaseOpcode->Atomic) {
3764 DMask = MI.getOperand(DMaskIdx).getImm();
3765 if (BaseOpcode->Gather4) {
3766 DMaskLanes = 4;
3767 } else if (DMask != 0) {
3768 DMaskLanes = countPopulation(DMask);
3769 } else if (!IsTFE && !BaseOpcode->Store) {
3770 // If dmask is 0, this is a no-op load. This can be eliminated.
3771 B.buildUndef(MI.getOperand(0));
3772 MI.eraseFromParent();
3773 return true;
3774 }
3775 }
3776
3777 Observer.changingInstr(MI);
3778 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3779
3780 unsigned NewOpcode = NumDefs == 0 ?
3781 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3782
3783 // Track that we legalized this.
3784 MI.setDesc(B.getTII().get(NewOpcode));
3785
3786 // We expect to get an error flag since TFC is on and dmask is 0. Force
3787 // dmask to be at least 1, otherwise the instruction will fail.
3788 if (IsTFE && DMask == 0) {
3789 DMask = 0x1;
3790 DMaskLanes = 1;
3791 MI.getOperand(DMaskIdx).setImm(DMask);
3792 }
3793
3794 if (BaseOpcode->Atomic) {
3795 Register VData0 = MI.getOperand(2).getReg();
3796 LLT Ty = MRI->getType(VData0);
3797
3798 // TODO: Allow atomic swap and bit ops for v2s16/v4s16.
3799 if (Ty.isVector())
3800 return false;
3801
3802 if (BaseOpcode->AtomicX2) {
3803 Register VData1 = MI.getOperand(3).getReg();
3804 // The two values are packed in one register.
3805 LLT PackedTy = LLT::vector(2, Ty);
3806 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3807 MI.getOperand(2).setReg(Concat.getReg(0));
3808 MI.getOperand(3).setReg(AMDGPU::NoRegister);
3809 }
3810 }
3811
3812 int CorrectedNumVAddrs = NumVAddrs;
3813
3814 // Optimize _L to _LZ when the LOD is zero.
3815 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3816 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3817 const ConstantFP *ConstantLod;
3818 const int LodIdx = AddrIdx + NumVAddrs - 1;
3819
3820 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3821 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3822 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3823 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3824 LZMappingInfo->LZ, ImageDimIntr->Dim);
3825
3826 // The starting indexes should remain in the same place.
3827 --NumVAddrs;
3828 --CorrectedNumVAddrs;
3829
3830 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3831 static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3832 MI.RemoveOperand(LodIdx);
3833 }
3834 }
3835 }
3836
3837 // Optimize _mip away when 'lod' is zero.
3838 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3839 int64_t ConstantLod;
3840 const int LodIdx = AddrIdx + NumVAddrs - 1;
3841
3842 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3843 if (ConstantLod == 0) {
3844 // TODO: Change the intrinsic opcode and remove the operand instead of
3845 // replacing it with 0, as the _L to _LZ handling is done above.
3846 MI.getOperand(LodIdx).ChangeToImmediate(0);
3847 --CorrectedNumVAddrs;
3848 }
3849 }
3850 }
3851
3852 // Rewrite the addressing register layout before doing anything else.
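// For example (illustrative), a 2D a16 sample packs its (s, t) coordinates
// into a single <2 x s16> build_vector; a trailing odd coordinate, or the
// last derivative of an odd-sized gradient group, is padded with undef.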
3853 if (IsA16 || IsG16) { 3854 if (IsA16) { 3855 // Target must support the feature and gradients need to be 16 bit too 3856 if (!ST.hasA16() || !IsG16) 3857 return false; 3858 } else if (!ST.hasG16()) 3859 return false; 3860 3861 if (NumVAddrs > 1) { 3862 SmallVector<Register, 4> PackedRegs; 3863 // Don't compress addresses for G16 3864 const int PackEndIdx = 3865 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3866 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3867 PackEndIdx, NumGradients); 3868 3869 if (!IsA16) { 3870 // Add uncompressed address 3871 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3872 int AddrReg = MI.getOperand(I).getReg(); 3873 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3874 PackedRegs.push_back(AddrReg); 3875 } 3876 } 3877 3878 // See also below in the non-a16 branch 3879 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3880 3881 if (!UseNSA && PackedRegs.size() > 1) { 3882 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3883 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3884 PackedRegs[0] = Concat.getReg(0); 3885 PackedRegs.resize(1); 3886 } 3887 3888 const int NumPacked = PackedRegs.size(); 3889 for (int I = 0; I != NumVAddrs; ++I) { 3890 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3891 if (!SrcOp.isReg()) { 3892 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3893 continue; 3894 } 3895 3896 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3897 3898 if (I < NumPacked) 3899 SrcOp.setReg(PackedRegs[I]); 3900 else 3901 SrcOp.setReg(AMDGPU::NoRegister); 3902 } 3903 } 3904 } else { 3905 // If the register allocator cannot place the address registers contiguously 3906 // without introducing moves, then using the non-sequential address encoding 3907 // is always preferable, since it saves VALU instructions and is usually a 3908 // wash in terms of code size or even better. 3909 // 3910 // However, we currently have no way of hinting to the register allocator 3911 // that MIMG addresses should be placed contiguously when it is possible to 3912 // do so, so force non-NSA for the common 2-address case as a heuristic. 3913 // 3914 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3915 // allocation when possible. 3916 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3917 3918 if (!UseNSA && NumVAddrs > 1) 3919 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3920 } 3921 3922 int Flags = 0; 3923 if (IsA16) 3924 Flags |= 1; 3925 if (IsG16) 3926 Flags |= 2; 3927 MI.addOperand(MachineOperand::CreateImm(Flags)); 3928 3929 if (BaseOpcode->Store) { // No TFE for stores? 3930 // TODO: Handle dmask trim 3931 Register VData = MI.getOperand(1).getReg(); 3932 LLT Ty = MRI->getType(VData); 3933 if (!Ty.isVector() || Ty.getElementType() != S16) 3934 return true; 3935 3936 Register RepackedReg = handleD16VData(B, *MRI, VData); 3937 if (RepackedReg != VData) { 3938 MI.getOperand(1).setReg(RepackedReg); 3939 } 3940 3941 return true; 3942 } 3943 3944 Register DstReg = MI.getOperand(0).getReg(); 3945 LLT Ty = MRI->getType(DstReg); 3946 const LLT EltTy = Ty.getScalarType(); 3947 const bool IsD16 = Ty.getScalarType() == S16; 3948 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3949
3950 // Confirm that the return type is large enough for the dmask specified.
3951 if (NumElts < DMaskLanes)
3952 return false;
3953
3954 if (NumElts > 4 || DMaskLanes > 4)
3955 return false;
3956
3957 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3958 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3959
3960 // The raw dword aligned data component of the load. The only legal cases
3961 // where this matters should be when using the packed D16 format, for
3962 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3963 LLT RoundedTy;
3964
3965 // S32 vector to cover all data, plus the TFE result element.
3966 LLT TFETy;
3967
3968 // Register type to use for each loaded component. Will be S32 or V2S16.
3969 LLT RegTy;
3970
3971 if (IsD16 && ST.hasUnpackedD16VMem()) {
3972 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3973 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3974 RegTy = S32;
3975 } else {
3976 unsigned EltSize = EltTy.getSizeInBits();
3977 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3978 unsigned RoundedSize = 32 * RoundedElts;
3979 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3980 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3981 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3982 }
3983
3984 // The return type does not need adjustment.
3985 // TODO: Should we change the s16 case to s32 or <2 x s16>?
3986 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3987 return true;
3988
3989 Register Dst1Reg;
3990
3991 // Insert after the instruction.
3992 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3993
3994 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3995 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3996 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3997 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3998
3999 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4000
4001 MI.getOperand(0).setReg(NewResultReg);
4002
4003 // In the IR, TFE is supposed to be used with a 2 element struct return
4004 // type. The instruction really returns these two values in one contiguous
4005 // register, with one additional dword beyond the loaded data. Rewrite the
4006 // return type to use a single register result.
4007
4008 if (IsTFE) {
4009 Dst1Reg = MI.getOperand(1).getReg();
4010 if (MRI->getType(Dst1Reg) != S32)
4011 return false;
4012
4013 // TODO: Make sure the TFE operand bit is set.
4014 MI.RemoveOperand(1);
4015
4016 // Handle the easy case that requires no repack instructions.
4017 if (Ty == S32) {
4018 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4019 return true;
4020 }
4021 }
4022
4023 // Now figure out how to copy the new result register back into the old
4024 // result.
4025 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4026
4027 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4028
4029 if (ResultNumRegs == 1) {
4030 assert(!IsTFE);
4031 ResultRegs[0] = NewResultReg;
4032 } else {
4033 // We have to repack into a new vector of some kind.
4034 for (int I = 0; I != NumDataRegs; ++I)
4035 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4036 B.buildUnmerge(ResultRegs, NewResultReg);
4037
4038 // Drop the final TFE element to get the data part. The TFE result is
4039 // directly written to the right place already.
4040 if (IsTFE) 4041 ResultRegs.resize(NumDataRegs); 4042 } 4043 4044 // For an s16 scalar result, we form an s32 result with a truncate regardless 4045 // of packed vs. unpacked. 4046 if (IsD16 && !Ty.isVector()) { 4047 B.buildTrunc(DstReg, ResultRegs[0]); 4048 return true; 4049 } 4050 4051 // Avoid a build/concat_vector of 1 entry. 4052 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4053 B.buildBitcast(DstReg, ResultRegs[0]); 4054 return true; 4055 } 4056 4057 assert(Ty.isVector()); 4058 4059 if (IsD16) { 4060 // For packed D16 results with TFE enabled, all the data components are 4061 // S32. Cast back to the expected type. 4062 // 4063 // TODO: We don't really need to use load s32 elements. We would only need one 4064 // cast for the TFE result if a multiple of v2s16 was used. 4065 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4066 for (Register &Reg : ResultRegs) 4067 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4068 } else if (ST.hasUnpackedD16VMem()) { 4069 for (Register &Reg : ResultRegs) 4070 Reg = B.buildTrunc(S16, Reg).getReg(0); 4071 } 4072 } 4073 4074 auto padWithUndef = [&](LLT Ty, int NumElts) { 4075 if (NumElts == 0) 4076 return; 4077 Register Undef = B.buildUndef(Ty).getReg(0); 4078 for (int I = 0; I != NumElts; ++I) 4079 ResultRegs.push_back(Undef); 4080 }; 4081 4082 // Pad out any elements eliminated due to the dmask. 4083 LLT ResTy = MRI->getType(ResultRegs[0]); 4084 if (!ResTy.isVector()) { 4085 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4086 B.buildBuildVector(DstReg, ResultRegs); 4087 return true; 4088 } 4089 4090 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4091 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4092 4093 // Deal with the one annoying legal case. 4094 const LLT V3S16 = LLT::vector(3, 16); 4095 if (Ty == V3S16) { 4096 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4097 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4098 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4099 return true; 4100 } 4101 4102 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4103 B.buildConcatVectors(DstReg, ResultRegs); 4104 return true; 4105 } 4106 4107 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4108 MachineInstr &MI, MachineIRBuilder &B, 4109 GISelChangeObserver &Observer) const { 4110 Register Dst = MI.getOperand(0).getReg(); 4111 LLT Ty = B.getMRI()->getType(Dst); 4112 unsigned Size = Ty.getSizeInBits(); 4113 MachineFunction &MF = B.getMF(); 4114 4115 Observer.changingInstr(MI); 4116 4117 // FIXME: We don't really need this intermediate instruction. The intrinsic 4118 // should be fixed to have a memory operand. Since it's readnone, we're not 4119 // allowed to add one. 4120 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4121 MI.RemoveOperand(1); // Remove intrinsic ID 4122 4123 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4124 // TODO: Should this use datalayout alignment? 4125 const unsigned MemSize = (Size + 7) / 8; 4126 const Align MemAlign(4); 4127 MachineMemOperand *MMO = MF.getMachineMemOperand( 4128 MachinePointerInfo(), 4129 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4130 MachineMemOperand::MOInvariant, 4131 MemSize, MemAlign); 4132 MI.addMemOperand(MF, MMO); 4133 4134 // There are no 96-bit result scalar loads, but widening to 128-bit should 4135 // always be legal. 
We may need to restore this to a 96-bit result if it turns
4136 // out this needs to be converted to a vector load during RegBankSelect.
4137 if (!isPowerOf2_32(Size)) {
4138 LegalizerHelper Helper(MF, *this, Observer, B);
4139
4140 if (Ty.isVector())
4141 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4142 else
4143 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4144 }
4145
4146 Observer.changedInstr(MI);
4147 return true;
4148 }
4149
4150 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4151 MachineRegisterInfo &MRI,
4152 MachineIRBuilder &B) const {
4153 // On a non-HSA path, or if the trap handler is disabled, just insert an s_endpgm instruction.
4154 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4155 !ST.isTrapHandlerEnabled()) {
4156 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4157 } else {
4158 // Pass the queue pointer to the trap handler as input, and insert the trap instruction.
4159 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4160 MachineRegisterInfo &MRI = *B.getMRI();
4161 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4162 Register LiveIn = getLiveInRegister(
4163 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4164 /*InsertLiveInCopy=*/false);
4165 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4166 return false;
4167 B.buildCopy(SGPR01, LiveIn);
4168 B.buildInstr(AMDGPU::S_TRAP)
4169 .addImm(GCNSubtarget::TrapIDLLVMTrap)
4170 .addReg(SGPR01, RegState::Implicit);
4171 }
4172
4173 MI.eraseFromParent();
4174 return true;
4175 }
4176
4177 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4178 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4179 // On a non-HSA path, or if the trap handler is disabled, report a warning
4180 // accordingly.
4181 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4182 !ST.isTrapHandlerEnabled()) {
4183 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4184 "debugtrap handler not supported",
4185 MI.getDebugLoc(), DS_Warning);
4186 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4187 Ctx.diagnose(NoTrap);
4188 } else {
4189 // Insert the debug-trap instruction.
4190 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4191 }
4192
4193 MI.eraseFromParent();
4194 return true;
4195 }
4196
4197 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4198 MachineInstr &MI) const {
4199 MachineIRBuilder &B = Helper.MIRBuilder;
4200 MachineRegisterInfo &MRI = *B.getMRI();
4201
4202 // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
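// For amdgcn.if, for example, the G_BRCOND on the intrinsic's boolean result
// becomes (sketch, not exact MIR syntax):
//   %token = SI_IF %cond, %uncond_target   ; updates exec
//   G_BR %cond_target
// where %uncond_target/%cond_target correspond to UncondBrTarget and
// CondBrTarget below; SI_ELSE and SI_LOOP are handled analogously.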
4203 auto IntrID = MI.getIntrinsicID(); 4204 switch (IntrID) { 4205 case Intrinsic::amdgcn_if: 4206 case Intrinsic::amdgcn_else: { 4207 MachineInstr *Br = nullptr; 4208 MachineBasicBlock *UncondBrTarget = nullptr; 4209 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4210 const SIRegisterInfo *TRI 4211 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4212 4213 Register Def = MI.getOperand(1).getReg(); 4214 Register Use = MI.getOperand(3).getReg(); 4215 4216 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4217 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4218 if (IntrID == Intrinsic::amdgcn_if) { 4219 B.buildInstr(AMDGPU::SI_IF) 4220 .addDef(Def) 4221 .addUse(Use) 4222 .addMBB(UncondBrTarget); 4223 } else { 4224 B.buildInstr(AMDGPU::SI_ELSE) 4225 .addDef(Def) 4226 .addUse(Use) 4227 .addMBB(UncondBrTarget) 4228 .addImm(0); 4229 } 4230 4231 if (Br) { 4232 Br->getOperand(0).setMBB(CondBrTarget); 4233 } else { 4234 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4235 // since we're swapping branch targets it needs to be reinserted. 4236 // FIXME: IRTranslator should probably not do this 4237 B.buildBr(*CondBrTarget); 4238 } 4239 4240 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4241 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4242 MI.eraseFromParent(); 4243 BrCond->eraseFromParent(); 4244 return true; 4245 } 4246 4247 return false; 4248 } 4249 case Intrinsic::amdgcn_loop: { 4250 MachineInstr *Br = nullptr; 4251 MachineBasicBlock *UncondBrTarget = nullptr; 4252 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4253 const SIRegisterInfo *TRI 4254 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4255 4256 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4257 Register Reg = MI.getOperand(2).getReg(); 4258 4259 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4260 B.buildInstr(AMDGPU::SI_LOOP) 4261 .addUse(Reg) 4262 .addMBB(UncondBrTarget); 4263 4264 if (Br) 4265 Br->getOperand(0).setMBB(CondBrTarget); 4266 else 4267 B.buildBr(*CondBrTarget); 4268 4269 MI.eraseFromParent(); 4270 BrCond->eraseFromParent(); 4271 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4272 return true; 4273 } 4274 4275 return false; 4276 } 4277 case Intrinsic::amdgcn_kernarg_segment_ptr: 4278 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4279 // This only makes sense to call in a kernel, so just lower to null. 
4280 B.buildConstant(MI.getOperand(0).getReg(), 0); 4281 MI.eraseFromParent(); 4282 return true; 4283 } 4284 4285 return legalizePreloadedArgIntrin( 4286 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4287 case Intrinsic::amdgcn_implicitarg_ptr: 4288 return legalizeImplicitArgPtr(MI, MRI, B); 4289 case Intrinsic::amdgcn_workitem_id_x: 4290 return legalizePreloadedArgIntrin(MI, MRI, B, 4291 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4292 case Intrinsic::amdgcn_workitem_id_y: 4293 return legalizePreloadedArgIntrin(MI, MRI, B, 4294 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4295 case Intrinsic::amdgcn_workitem_id_z: 4296 return legalizePreloadedArgIntrin(MI, MRI, B, 4297 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4298 case Intrinsic::amdgcn_workgroup_id_x: 4299 return legalizePreloadedArgIntrin(MI, MRI, B, 4300 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4301 case Intrinsic::amdgcn_workgroup_id_y: 4302 return legalizePreloadedArgIntrin(MI, MRI, B, 4303 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4304 case Intrinsic::amdgcn_workgroup_id_z: 4305 return legalizePreloadedArgIntrin(MI, MRI, B, 4306 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4307 case Intrinsic::amdgcn_dispatch_ptr: 4308 return legalizePreloadedArgIntrin(MI, MRI, B, 4309 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4310 case Intrinsic::amdgcn_queue_ptr: 4311 return legalizePreloadedArgIntrin(MI, MRI, B, 4312 AMDGPUFunctionArgInfo::QUEUE_PTR); 4313 case Intrinsic::amdgcn_implicit_buffer_ptr: 4314 return legalizePreloadedArgIntrin( 4315 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4316 case Intrinsic::amdgcn_dispatch_id: 4317 return legalizePreloadedArgIntrin(MI, MRI, B, 4318 AMDGPUFunctionArgInfo::DISPATCH_ID); 4319 case Intrinsic::amdgcn_fdiv_fast: 4320 return legalizeFDIVFastIntrin(MI, MRI, B); 4321 case Intrinsic::amdgcn_is_shared: 4322 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4323 case Intrinsic::amdgcn_is_private: 4324 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4325 case Intrinsic::amdgcn_wavefrontsize: { 4326 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4327 MI.eraseFromParent(); 4328 return true; 4329 } 4330 case Intrinsic::amdgcn_s_buffer_load: 4331 return legalizeSBufferLoad(MI, B, Helper.Observer); 4332 case Intrinsic::amdgcn_raw_buffer_store: 4333 case Intrinsic::amdgcn_struct_buffer_store: 4334 return legalizeBufferStore(MI, MRI, B, false, false); 4335 case Intrinsic::amdgcn_raw_buffer_store_format: 4336 case Intrinsic::amdgcn_struct_buffer_store_format: 4337 return legalizeBufferStore(MI, MRI, B, false, true); 4338 case Intrinsic::amdgcn_raw_tbuffer_store: 4339 case Intrinsic::amdgcn_struct_tbuffer_store: 4340 return legalizeBufferStore(MI, MRI, B, true, true); 4341 case Intrinsic::amdgcn_raw_buffer_load: 4342 case Intrinsic::amdgcn_struct_buffer_load: 4343 return legalizeBufferLoad(MI, MRI, B, false, false); 4344 case Intrinsic::amdgcn_raw_buffer_load_format: 4345 case Intrinsic::amdgcn_struct_buffer_load_format: 4346 return legalizeBufferLoad(MI, MRI, B, true, false); 4347 case Intrinsic::amdgcn_raw_tbuffer_load: 4348 case Intrinsic::amdgcn_struct_tbuffer_load: 4349 return legalizeBufferLoad(MI, MRI, B, true, true); 4350 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4351 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4352 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4353 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4354 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4355 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4356 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4357 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4358 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4359 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4360 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4361 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4362 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4363 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4364 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4365 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4366 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4367 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4368 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4369 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4370 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4371 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4372 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4373 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4374 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4375 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4376 return legalizeBufferAtomic(MI, B, IntrID); 4377 case Intrinsic::amdgcn_atomic_inc: 4378 return legalizeAtomicIncDec(MI, B, true); 4379 case Intrinsic::amdgcn_atomic_dec: 4380 return legalizeAtomicIncDec(MI, B, false); 4381 case Intrinsic::trap: 4382 return legalizeTrapIntrinsic(MI, MRI, B); 4383 case Intrinsic::debugtrap: 4384 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4385 default: { 4386 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4387 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4388 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4389 return true; 4390 } 4391 } 4392 4393 return true; 4394 } 4395