//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx,
                          LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
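// For example, s32, s64, v2s16, v4s16, v2s32 and v8s32 all qualify as register
// types here, while v3s8 (24 bits) and s48 do not.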
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .clampScalar(0, S32, S256) 419 .widenScalarToNextPow2(0, 32) 420 .clampMaxNumElements(0, S32, 16) 421 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 422 .legalIf(isPointer(0)); 423 424 if (ST.hasVOP3PInsts()) { 425 assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp"); 426 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 427 .legalFor({S32, S16, V2S16}) 428 .clampScalar(0, S16, S32) 429 .clampMaxNumElements(0, S16, 2) 430 .scalarize(0) 431 .widenScalarToNextPow2(0, 32); 432 433 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 434 .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul 435 .minScalar(0, S16) 436 .clampMaxNumElements(0, S16, 2) 437 .scalarize(0) 438 .widenScalarToNextPow2(0, 32) 439 .lower(); 440 } else if (ST.has16BitInsts()) { 441 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 442 .legalFor({S32, S16}) 443 .clampScalar(0, S16, S32) 444 .scalarize(0) 445 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 446 447 assert(ST.hasIntClamp() && "all targets with 16-bit should support clamp"); 448 449 // Technically the saturating operations require clamp bit support, but this 450 // was introduced at the same time as 16-bit operations. 451 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 452 .lowerFor({S32, S16}) // FIXME: legal with clamp modifier 453 .minScalar(0, S16) 454 .scalarize(0) 455 .widenScalarToNextPow2(0, 16) 456 .lower(); 457 458 // We're just lowering this, but it helps get a better result to try to 459 // coerce to the desired type first. 460 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 461 .minScalar(0, S16) 462 .scalarize(0) 463 .lower(); 464 } else { 465 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 466 .legalFor({S32}) 467 .clampScalar(0, S32, S32) 468 .scalarize(0); 469 470 if (ST.hasIntClamp()) { 471 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 472 .lowerFor({S32}) // FIXME: legal with clamp modifier. 473 .scalarize(0) 474 .minScalarOrElt(0, S32) 475 .lower(); 476 } else { 477 // Clamp bit support was added in VI, along with 16-bit operations. 
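      // Note: without the clamp bit these saturating operations are simply
      // expanded by the generic lower() step, in terms of ordinary add/sub
      // plus compare-and-select, rather than mapping to clamped instructions.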
478 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 479 .minScalar(0, S32) 480 .scalarize(0) 481 .lower(); 482 } 483 484 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 485 .minScalar(0, S32) 486 .scalarize(0) 487 .lower(); 488 } 489 490 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 491 .customFor({S32, S64}) 492 .clampScalar(0, S32, S64) 493 .widenScalarToNextPow2(0, 32) 494 .scalarize(0); 495 496 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 497 .legalFor({S32}) 498 .clampScalar(0, S32, S32) 499 .scalarize(0); 500 501 // Report legal for any types we can handle anywhere. For the cases only legal 502 // on the SALU, RegBankSelect will be able to re-legalize. 503 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 504 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 505 .clampScalar(0, S32, S64) 506 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 507 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 508 .widenScalarToNextPow2(0) 509 .scalarize(0); 510 511 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 512 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 513 .legalFor({{S32, S1}, {S32, S32}}) 514 .minScalar(0, S32) 515 // TODO: .scalarize(0) 516 .lower(); 517 518 getActionDefinitionsBuilder(G_BITCAST) 519 // Don't worry about the size constraint. 520 .legalIf(all(isRegisterType(0), isRegisterType(1))) 521 .lower(); 522 523 524 getActionDefinitionsBuilder(G_CONSTANT) 525 .legalFor({S1, S32, S64, S16, GlobalPtr, 526 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 527 .clampScalar(0, S32, S64) 528 .widenScalarToNextPow2(0) 529 .legalIf(isPointer(0)); 530 531 getActionDefinitionsBuilder(G_FCONSTANT) 532 .legalFor({S32, S64, S16}) 533 .clampScalar(0, S16, S64); 534 535 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 536 .legalIf(isRegisterType(0)) 537 // s1 and s16 are special cases because they have legal operations on 538 // them, but don't really occupy registers in the normal way. 539 .legalFor({S1, S16}) 540 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 541 .clampScalarOrElt(0, S32, MaxScalar) 542 .widenScalarToNextPow2(0, 32) 543 .clampMaxNumElements(0, S32, 16); 544 545 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 546 547 // If the amount is divergent, we have to do a wave reduction to get the 548 // maximum value, so this is expanded during RegBankSelect. 
549 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 550 .legalFor({{PrivatePtr, S32}}); 551 552 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 553 .unsupportedFor({PrivatePtr}) 554 .custom(); 555 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 556 557 auto &FPOpActions = getActionDefinitionsBuilder( 558 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 559 .legalFor({S32, S64}); 560 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 561 .customFor({S32, S64}); 562 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 563 .customFor({S32, S64}); 564 565 if (ST.has16BitInsts()) { 566 if (ST.hasVOP3PInsts()) 567 FPOpActions.legalFor({S16, V2S16}); 568 else 569 FPOpActions.legalFor({S16}); 570 571 TrigActions.customFor({S16}); 572 FDIVActions.customFor({S16}); 573 } 574 575 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 576 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 577 578 if (ST.hasVOP3PInsts()) { 579 MinNumMaxNum.customFor(FPTypesPK16) 580 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 581 .clampMaxNumElements(0, S16, 2) 582 .clampScalar(0, S16, S64) 583 .scalarize(0); 584 } else if (ST.has16BitInsts()) { 585 MinNumMaxNum.customFor(FPTypes16) 586 .clampScalar(0, S16, S64) 587 .scalarize(0); 588 } else { 589 MinNumMaxNum.customFor(FPTypesBase) 590 .clampScalar(0, S32, S64) 591 .scalarize(0); 592 } 593 594 if (ST.hasVOP3PInsts()) 595 FPOpActions.clampMaxNumElements(0, S16, 2); 596 597 FPOpActions 598 .scalarize(0) 599 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 600 601 TrigActions 602 .scalarize(0) 603 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 604 605 FDIVActions 606 .scalarize(0) 607 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 608 609 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 610 .legalFor(FPTypesPK16) 611 .clampMaxNumElements(0, S16, 2) 612 .scalarize(0) 613 .clampScalar(0, S16, S64); 614 615 if (ST.has16BitInsts()) { 616 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 617 .legalFor({S32, S64, S16}) 618 .scalarize(0) 619 .clampScalar(0, S16, S64); 620 } else { 621 getActionDefinitionsBuilder(G_FSQRT) 622 .legalFor({S32, S64}) 623 .scalarize(0) 624 .clampScalar(0, S32, S64); 625 626 if (ST.hasFractBug()) { 627 getActionDefinitionsBuilder(G_FFLOOR) 628 .customFor({S64}) 629 .legalFor({S32, S64}) 630 .scalarize(0) 631 .clampScalar(0, S32, S64); 632 } else { 633 getActionDefinitionsBuilder(G_FFLOOR) 634 .legalFor({S32, S64}) 635 .scalarize(0) 636 .clampScalar(0, S32, S64); 637 } 638 } 639 640 getActionDefinitionsBuilder(G_FPTRUNC) 641 .legalFor({{S32, S64}, {S16, S32}}) 642 .scalarize(0) 643 .lower(); 644 645 getActionDefinitionsBuilder(G_FPEXT) 646 .legalFor({{S64, S32}, {S32, S16}}) 647 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 648 .scalarize(0); 649 650 getActionDefinitionsBuilder(G_FSUB) 651 // Use actual fsub instruction 652 .legalFor({S32}) 653 // Must use fadd + fneg 654 .lowerFor({S64, S16, V2S16}) 655 .scalarize(0) 656 .clampScalar(0, S32, S64); 657 658 // Whether this is legal depends on the floating point mode for the function. 659 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 660 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 661 FMad.customFor({S32, S16}); 662 else if (ST.hasMadMacF32Insts()) 663 FMad.customFor({S32}); 664 else if (ST.hasMadF16()) 665 FMad.customFor({S16}); 666 FMad.scalarize(0) 667 .lower(); 668 669 // TODO: Do we need to clamp maximum bitwidth? 
670 getActionDefinitionsBuilder(G_TRUNC) 671 .legalIf(isScalar(0)) 672 .legalFor({{V2S16, V2S32}}) 673 .clampMaxNumElements(0, S16, 2) 674 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 675 // situations (like an invalid implicit use), we don't want to infinite loop 676 // in the legalizer. 677 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 678 .alwaysLegal(); 679 680 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 681 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 682 {S32, S1}, {S64, S1}, {S16, S1}}) 683 .scalarize(0) 684 .clampScalar(0, S32, S64) 685 .widenScalarToNextPow2(1, 32); 686 687 // TODO: Split s1->s64 during regbankselect for VALU. 688 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 689 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 690 .lowerFor({{S32, S64}}) 691 .lowerIf(typeIs(1, S1)) 692 .customFor({{S64, S64}}); 693 if (ST.has16BitInsts()) 694 IToFP.legalFor({{S16, S16}}); 695 IToFP.clampScalar(1, S32, S64) 696 .minScalar(0, S32) 697 .scalarize(0) 698 .widenScalarToNextPow2(1); 699 700 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 701 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 702 .customFor({{S64, S64}}) 703 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 704 if (ST.has16BitInsts()) 705 FPToI.legalFor({{S16, S16}}); 706 else 707 FPToI.minScalar(1, S32); 708 709 FPToI.minScalar(0, S32) 710 .scalarize(0) 711 .lower(); 712 713 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 714 .scalarize(0) 715 .lower(); 716 717 if (ST.has16BitInsts()) { 718 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 719 .legalFor({S16, S32, S64}) 720 .clampScalar(0, S16, S64) 721 .scalarize(0); 722 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 723 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 724 .legalFor({S32, S64}) 725 .clampScalar(0, S32, S64) 726 .scalarize(0); 727 } else { 728 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 729 .legalFor({S32}) 730 .customFor({S64}) 731 .clampScalar(0, S32, S64) 732 .scalarize(0); 733 } 734 735 // FIXME: Clamp offset operand. 736 getActionDefinitionsBuilder(G_PTR_ADD) 737 .legalIf(isPointer(0)) 738 .scalarize(0); 739 740 getActionDefinitionsBuilder(G_PTRMASK) 741 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 742 .scalarSameSizeAs(1, 0) 743 .scalarize(0); 744 745 auto &CmpBuilder = 746 getActionDefinitionsBuilder(G_ICMP) 747 // The compare output type differs based on the register bank of the output, 748 // so make both s1 and s32 legal. 749 // 750 // Scalar compares producing output in scc will be promoted to s32, as that 751 // is the allocatable register type that will be needed for the copy from 752 // scc. This will be promoted during RegBankSelect, and we assume something 753 // before that won't try to use s32 result types. 754 // 755 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 756 // bank. 757 .legalForCartesianProduct( 758 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 759 .legalForCartesianProduct( 760 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 761 if (ST.has16BitInsts()) { 762 CmpBuilder.legalFor({{S1, S16}}); 763 } 764 765 CmpBuilder 766 .widenScalarToNextPow2(1) 767 .clampScalar(1, S32, S64) 768 .scalarize(0) 769 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 770 771 getActionDefinitionsBuilder(G_FCMP) 772 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? 
FPTypes16 : FPTypesBase) 773 .widenScalarToNextPow2(1) 774 .clampScalar(1, S32, S64) 775 .scalarize(0); 776 777 // FIXME: fpow has a selection pattern that should move to custom lowering. 778 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 779 if (ST.has16BitInsts()) 780 Exp2Ops.legalFor({S32, S16}); 781 else 782 Exp2Ops.legalFor({S32}); 783 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 784 Exp2Ops.scalarize(0); 785 786 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 787 if (ST.has16BitInsts()) 788 ExpOps.customFor({{S32}, {S16}}); 789 else 790 ExpOps.customFor({S32}); 791 ExpOps.clampScalar(0, MinScalarFPTy, S32) 792 .scalarize(0); 793 794 getActionDefinitionsBuilder(G_FPOWI) 795 .clampScalar(0, MinScalarFPTy, S32) 796 .lower(); 797 798 // The 64-bit versions produce 32-bit results, but only on the SALU. 799 getActionDefinitionsBuilder(G_CTPOP) 800 .legalFor({{S32, S32}, {S32, S64}}) 801 .clampScalar(0, S32, S32) 802 .clampScalar(1, S32, S64) 803 .scalarize(0) 804 .widenScalarToNextPow2(0, 32) 805 .widenScalarToNextPow2(1, 32); 806 807 // The hardware instructions return a different result on 0 than the generic 808 // instructions expect. The hardware produces -1, but these produce the 809 // bitwidth. 810 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 811 .scalarize(0) 812 .clampScalar(0, S32, S32) 813 .clampScalar(1, S32, S64) 814 .widenScalarToNextPow2(0, 32) 815 .widenScalarToNextPow2(1, 32) 816 .lower(); 817 818 // The 64-bit versions produce 32-bit results, but only on the SALU. 819 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 820 .legalFor({{S32, S32}, {S32, S64}}) 821 .clampScalar(0, S32, S32) 822 .clampScalar(1, S32, S64) 823 .scalarize(0) 824 .widenScalarToNextPow2(0, 32) 825 .widenScalarToNextPow2(1, 32); 826 827 getActionDefinitionsBuilder(G_BITREVERSE) 828 .legalFor({S32}) 829 .clampScalar(0, S32, S32) 830 .scalarize(0); 831 832 if (ST.has16BitInsts()) { 833 getActionDefinitionsBuilder(G_BSWAP) 834 .legalFor({S16, S32, V2S16}) 835 .clampMaxNumElements(0, S16, 2) 836 // FIXME: Fixing non-power-of-2 before clamp is workaround for 837 // narrowScalar limitation. 838 .widenScalarToNextPow2(0) 839 .clampScalar(0, S16, S32) 840 .scalarize(0); 841 842 if (ST.hasVOP3PInsts()) { 843 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 844 .legalFor({S32, S16, V2S16}) 845 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 846 .clampMaxNumElements(0, S16, 2) 847 .minScalar(0, S16) 848 .widenScalarToNextPow2(0) 849 .scalarize(0) 850 .lower(); 851 } else { 852 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 853 .legalFor({S32, S16}) 854 .widenScalarToNextPow2(0) 855 .minScalar(0, S16) 856 .scalarize(0) 857 .lower(); 858 } 859 } else { 860 // TODO: Should have same legality without v_perm_b32 861 getActionDefinitionsBuilder(G_BSWAP) 862 .legalFor({S32}) 863 .lowerIf(scalarNarrowerThan(0, 32)) 864 // FIXME: Fixing non-power-of-2 before clamp is workaround for 865 // narrowScalar limitation. 
866 .widenScalarToNextPow2(0) 867 .maxScalar(0, S32) 868 .scalarize(0) 869 .lower(); 870 871 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 872 .legalFor({S32}) 873 .minScalar(0, S32) 874 .widenScalarToNextPow2(0) 875 .scalarize(0) 876 .lower(); 877 } 878 879 getActionDefinitionsBuilder(G_INTTOPTR) 880 // List the common cases 881 .legalForCartesianProduct(AddrSpaces64, {S64}) 882 .legalForCartesianProduct(AddrSpaces32, {S32}) 883 .scalarize(0) 884 // Accept any address space as long as the size matches 885 .legalIf(sameSize(0, 1)) 886 .widenScalarIf(smallerThan(1, 0), 887 [](const LegalityQuery &Query) { 888 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 889 }) 890 .narrowScalarIf(largerThan(1, 0), 891 [](const LegalityQuery &Query) { 892 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 893 }); 894 895 getActionDefinitionsBuilder(G_PTRTOINT) 896 // List the common cases 897 .legalForCartesianProduct(AddrSpaces64, {S64}) 898 .legalForCartesianProduct(AddrSpaces32, {S32}) 899 .scalarize(0) 900 // Accept any address space as long as the size matches 901 .legalIf(sameSize(0, 1)) 902 .widenScalarIf(smallerThan(0, 1), 903 [](const LegalityQuery &Query) { 904 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 905 }) 906 .narrowScalarIf( 907 largerThan(0, 1), 908 [](const LegalityQuery &Query) { 909 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 910 }); 911 912 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 913 .scalarize(0) 914 .custom(); 915 916 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 917 bool IsLoad) -> bool { 918 const LLT DstTy = Query.Types[0]; 919 920 // Split vector extloads. 921 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 922 unsigned Align = Query.MMODescrs[0].AlignInBits; 923 924 if (MemSize < DstTy.getSizeInBits()) 925 MemSize = std::max(MemSize, Align); 926 927 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 928 return true; 929 930 const LLT PtrTy = Query.Types[1]; 931 unsigned AS = PtrTy.getAddressSpace(); 932 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 933 return true; 934 935 // Catch weird sized loads that don't evenly divide into the access sizes 936 // TODO: May be able to widen depending on alignment etc. 937 unsigned NumRegs = (MemSize + 31) / 32; 938 if (NumRegs == 3) { 939 if (!ST.hasDwordx3LoadStores()) 940 return true; 941 } else { 942 // If the alignment allows, these should have been widened. 943 if (!isPowerOf2_32(NumRegs)) 944 return true; 945 } 946 947 if (Align < MemSize) { 948 const SITargetLowering *TLI = ST.getTargetLowering(); 949 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 950 } 951 952 return false; 953 }; 954 955 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 956 unsigned Opc) -> bool { 957 unsigned Size = Query.Types[0].getSizeInBits(); 958 if (isPowerOf2_32(Size)) 959 return false; 960 961 if (Size == 96 && ST.hasDwordx3LoadStores()) 962 return false; 963 964 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 965 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 966 return false; 967 968 unsigned Align = Query.MMODescrs[0].AlignInBits; 969 unsigned RoundedSize = NextPowerOf2(Size); 970 return (Align >= RoundedSize); 971 }; 972 973 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 974 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 975 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 976 977 // TODO: Refine based on subtargets which support unaligned access or 128-bit 978 // LDS 979 // TODO: Unsupported flat for SI. 980 981 for (unsigned Op : {G_LOAD, G_STORE}) { 982 const bool IsStore = Op == G_STORE; 983 984 auto &Actions = getActionDefinitionsBuilder(Op); 985 // Explicitly list some common cases. 986 // TODO: Does this help compile time at all? 987 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 988 {V2S32, GlobalPtr, 64, GlobalAlign32}, 989 {V4S32, GlobalPtr, 128, GlobalAlign32}, 990 {S64, GlobalPtr, 64, GlobalAlign32}, 991 {V2S64, GlobalPtr, 128, GlobalAlign32}, 992 {V2S16, GlobalPtr, 32, GlobalAlign32}, 993 {S32, GlobalPtr, 8, GlobalAlign8}, 994 {S32, GlobalPtr, 16, GlobalAlign16}, 995 996 {S32, LocalPtr, 32, 32}, 997 {S64, LocalPtr, 64, 32}, 998 {V2S32, LocalPtr, 64, 32}, 999 {S32, LocalPtr, 8, 8}, 1000 {S32, LocalPtr, 16, 16}, 1001 {V2S16, LocalPtr, 32, 32}, 1002 1003 {S32, PrivatePtr, 32, 32}, 1004 {S32, PrivatePtr, 8, 8}, 1005 {S32, PrivatePtr, 16, 16}, 1006 {V2S16, PrivatePtr, 32, 32}, 1007 1008 {S32, ConstantPtr, 32, GlobalAlign32}, 1009 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1010 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1011 {S64, ConstantPtr, 64, GlobalAlign32}, 1012 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1013 Actions.legalIf( 1014 [=](const LegalityQuery &Query) -> bool { 1015 return isLoadStoreLegal(ST, Query, Op); 1016 }); 1017 1018 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1019 // 64-bits. 1020 // 1021 // TODO: Should generalize bitcast action into coerce, which will also cover 1022 // inserting addrspacecasts. 1023 Actions.customIf(typeIs(1, Constant32Ptr)); 1024 1025 // Turn any illegal element vectors into something easier to deal 1026 // with. These will ultimately produce 32-bit scalar shifts to extract the 1027 // parts anyway. 1028 // 1029 // For odd 16-bit element vectors, prefer to split those into pieces with 1030 // 16-bit vector parts. 1031 Actions.bitcastIf( 1032 [=](const LegalityQuery &Query) -> bool { 1033 const LLT Ty = Query.Types[0]; 1034 const unsigned Size = Ty.getSizeInBits(); 1035 1036 if (Size != Query.MMODescrs[0].SizeInBits) 1037 return Size <= 32 && Ty.isVector(); 1038 1039 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 1040 return true; 1041 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 1042 !isRegisterVectorElementType(Ty.getElementType()); 1043 }, bitcastToRegisterType(0)); 1044 1045 Actions 1046 .customIf(typeIs(1, Constant32Ptr)) 1047 // Widen suitably aligned loads by loading extra elements. 1048 .moreElementsIf([=](const LegalityQuery &Query) { 1049 const LLT Ty = Query.Types[0]; 1050 return Op == G_LOAD && Ty.isVector() && 1051 shouldWidenLoadResult(Query, Op); 1052 }, moreElementsToNextPow2(0)) 1053 .widenScalarIf([=](const LegalityQuery &Query) { 1054 const LLT Ty = Query.Types[0]; 1055 return Op == G_LOAD && !Ty.isVector() && 1056 shouldWidenLoadResult(Query, Op); 1057 }, widenScalarOrEltToNextPow2(0)) 1058 .narrowScalarIf( 1059 [=](const LegalityQuery &Query) -> bool { 1060 return !Query.Types[0].isVector() && 1061 needToSplitMemOp(Query, Op == G_LOAD); 1062 }, 1063 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1064 const LLT DstTy = Query.Types[0]; 1065 const LLT PtrTy = Query.Types[1]; 1066 1067 const unsigned DstSize = DstTy.getSizeInBits(); 1068 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1069 1070 // Split extloads. 
1071 if (DstSize > MemSize) 1072 return std::make_pair(0, LLT::scalar(MemSize)); 1073 1074 if (!isPowerOf2_32(DstSize)) { 1075 // We're probably decomposing an odd sized store. Try to split 1076 // to the widest type. TODO: Account for alignment. As-is it 1077 // should be OK, since the new parts will be further legalized. 1078 unsigned FloorSize = PowerOf2Floor(DstSize); 1079 return std::make_pair(0, LLT::scalar(FloorSize)); 1080 } 1081 1082 if (DstSize > 32 && (DstSize % 32 != 0)) { 1083 // FIXME: Need a way to specify non-extload of larger size if 1084 // suitably aligned. 1085 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1086 } 1087 1088 unsigned MaxSize = maxSizeForAddrSpace(ST, 1089 PtrTy.getAddressSpace(), 1090 Op == G_LOAD); 1091 if (MemSize > MaxSize) 1092 return std::make_pair(0, LLT::scalar(MaxSize)); 1093 1094 unsigned Align = Query.MMODescrs[0].AlignInBits; 1095 return std::make_pair(0, LLT::scalar(Align)); 1096 }) 1097 .fewerElementsIf( 1098 [=](const LegalityQuery &Query) -> bool { 1099 return Query.Types[0].isVector() && 1100 needToSplitMemOp(Query, Op == G_LOAD); 1101 }, 1102 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1103 const LLT DstTy = Query.Types[0]; 1104 const LLT PtrTy = Query.Types[1]; 1105 1106 LLT EltTy = DstTy.getElementType(); 1107 unsigned MaxSize = maxSizeForAddrSpace(ST, 1108 PtrTy.getAddressSpace(), 1109 Op == G_LOAD); 1110 1111 // FIXME: Handle widened to power of 2 results better. This ends 1112 // up scalarizing. 1113 // FIXME: 3 element stores scalarized on SI 1114 1115 // Split if it's too large for the address space. 1116 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1117 unsigned NumElts = DstTy.getNumElements(); 1118 unsigned EltSize = EltTy.getSizeInBits(); 1119 1120 if (MaxSize % EltSize == 0) { 1121 return std::make_pair( 1122 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1123 } 1124 1125 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1126 1127 // FIXME: Refine when odd breakdowns handled 1128 // The scalars will need to be re-legalized. 1129 if (NumPieces == 1 || NumPieces >= NumElts || 1130 NumElts % NumPieces != 0) 1131 return std::make_pair(0, EltTy); 1132 1133 return std::make_pair(0, 1134 LLT::vector(NumElts / NumPieces, EltTy)); 1135 } 1136 1137 // FIXME: We could probably handle weird extending loads better. 1138 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1139 if (DstTy.getSizeInBits() > MemSize) 1140 return std::make_pair(0, EltTy); 1141 1142 unsigned EltSize = EltTy.getSizeInBits(); 1143 unsigned DstSize = DstTy.getSizeInBits(); 1144 if (!isPowerOf2_32(DstSize)) { 1145 // We're probably decomposing an odd sized store. Try to split 1146 // to the widest type. TODO: Account for alignment. As-is it 1147 // should be OK, since the new parts will be further legalized. 1148 unsigned FloorSize = PowerOf2Floor(DstSize); 1149 return std::make_pair( 1150 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1151 } 1152 1153 // Need to split because of alignment. 1154 unsigned Align = Query.MMODescrs[0].AlignInBits; 1155 if (EltSize > Align && 1156 (EltSize / Align < DstTy.getNumElements())) { 1157 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1158 } 1159 1160 // May need relegalization for the scalars. 1161 return std::make_pair(0, EltTy); 1162 }) 1163 .minScalar(0, S32); 1164 1165 if (IsStore) 1166 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1167 1168 // TODO: Need a bitcast lower option? 
1169 Actions 1170 .widenScalarToNextPow2(0) 1171 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1172 } 1173 1174 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1175 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1176 {S32, GlobalPtr, 16, 2 * 8}, 1177 {S32, LocalPtr, 8, 8}, 1178 {S32, LocalPtr, 16, 16}, 1179 {S32, PrivatePtr, 8, 8}, 1180 {S32, PrivatePtr, 16, 16}, 1181 {S32, ConstantPtr, 8, 8}, 1182 {S32, ConstantPtr, 16, 2 * 8}}); 1183 if (ST.hasFlatAddressSpace()) { 1184 ExtLoads.legalForTypesWithMemDesc( 1185 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1186 } 1187 1188 ExtLoads.clampScalar(0, S32, S32) 1189 .widenScalarToNextPow2(0) 1190 .unsupportedIfMemSizeNotPow2() 1191 .lower(); 1192 1193 auto &Atomics = getActionDefinitionsBuilder( 1194 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1195 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1196 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1197 G_ATOMICRMW_UMIN}) 1198 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1199 {S64, GlobalPtr}, {S64, LocalPtr}}); 1200 if (ST.hasFlatAddressSpace()) { 1201 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1202 } 1203 1204 if (ST.hasLDSFPAtomics()) { 1205 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1206 .legalFor({{S32, LocalPtr}}); 1207 } 1208 1209 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1210 // demarshalling 1211 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1212 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1213 {S32, FlatPtr}, {S64, FlatPtr}}) 1214 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1215 {S32, RegionPtr}, {S64, RegionPtr}}); 1216 // TODO: Pointer types, any 32-bit or 64-bit vector 1217 1218 // Condition should be s32 for scalar, s1 for vector. 1219 getActionDefinitionsBuilder(G_SELECT) 1220 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1221 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1222 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1223 .clampScalar(0, S16, S64) 1224 .scalarize(1) 1225 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1226 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1227 .clampMaxNumElements(0, S32, 2) 1228 .clampMaxNumElements(0, LocalPtr, 2) 1229 .clampMaxNumElements(0, PrivatePtr, 2) 1230 .scalarize(0) 1231 .widenScalarToNextPow2(0) 1232 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1233 1234 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1235 // be more flexible with the shift amount type. 1236 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1237 .legalFor({{S32, S32}, {S64, S32}}); 1238 if (ST.has16BitInsts()) { 1239 if (ST.hasVOP3PInsts()) { 1240 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1241 .clampMaxNumElements(0, S16, 2); 1242 } else 1243 Shifts.legalFor({{S16, S16}}); 1244 1245 // TODO: Support 16-bit shift amounts for all types 1246 Shifts.widenScalarIf( 1247 [=](const LegalityQuery &Query) { 1248 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1249 // 32-bit amount. 
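        // For example, an s16 value shifted by an s8 amount gets the amount
        // widened to s16 here.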
1250 const LLT ValTy = Query.Types[0]; 1251 const LLT AmountTy = Query.Types[1]; 1252 return ValTy.getSizeInBits() <= 16 && 1253 AmountTy.getSizeInBits() < 16; 1254 }, changeTo(1, S16)); 1255 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1256 Shifts.clampScalar(1, S32, S32); 1257 Shifts.clampScalar(0, S16, S64); 1258 Shifts.widenScalarToNextPow2(0, 16); 1259 } else { 1260 // Make sure we legalize the shift amount type first, as the general 1261 // expansion for the shifted type will produce much worse code if it hasn't 1262 // been truncated already. 1263 Shifts.clampScalar(1, S32, S32); 1264 Shifts.clampScalar(0, S32, S64); 1265 Shifts.widenScalarToNextPow2(0, 32); 1266 } 1267 Shifts.scalarize(0); 1268 1269 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1270 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1271 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1272 unsigned IdxTypeIdx = 2; 1273 1274 getActionDefinitionsBuilder(Op) 1275 .customIf([=](const LegalityQuery &Query) { 1276 const LLT EltTy = Query.Types[EltTypeIdx]; 1277 const LLT VecTy = Query.Types[VecTypeIdx]; 1278 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1279 return (EltTy.getSizeInBits() == 16 || 1280 EltTy.getSizeInBits() % 32 == 0) && 1281 VecTy.getSizeInBits() % 32 == 0 && 1282 VecTy.getSizeInBits() <= MaxRegisterSize && 1283 IdxTy.getSizeInBits() == 32; 1284 }) 1285 .clampScalar(EltTypeIdx, S32, S64) 1286 .clampScalar(VecTypeIdx, S32, S64) 1287 .clampScalar(IdxTypeIdx, S32, S32); 1288 } 1289 1290 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1291 .unsupportedIf([=](const LegalityQuery &Query) { 1292 const LLT &EltTy = Query.Types[1].getElementType(); 1293 return Query.Types[0] != EltTy; 1294 }); 1295 1296 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1297 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1298 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1299 1300 // FIXME: Doesn't handle extract of illegal sizes. 1301 getActionDefinitionsBuilder(Op) 1302 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1303 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
        return (BigTy.getSizeInBits() % 32 == 0) &&
               (LitTy.getSizeInBits() % 16 == 0);
      })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= MaxRegisterSize;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
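    // (The generic lowering expands G_SEXT_INREG into a left shift followed
    // by an arithmetic right shift, which is why shift legality matters here.)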
1476 SextInReg.lowerFor({{S32}, {S64}}); 1477 } 1478 1479 SextInReg 1480 .scalarize(0) 1481 .clampScalar(0, S32, S64) 1482 .lower(); 1483 1484 getActionDefinitionsBuilder(G_FSHR) 1485 .legalFor({{S32, S32}}) 1486 .scalarize(0) 1487 .lower(); 1488 1489 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1490 .legalFor({S64}); 1491 1492 getActionDefinitionsBuilder({ 1493 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1494 G_FCOPYSIGN, 1495 1496 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1497 G_READ_REGISTER, 1498 G_WRITE_REGISTER, 1499 1500 G_SADDO, G_SSUBO, 1501 1502 // TODO: Implement 1503 G_FMINIMUM, G_FMAXIMUM, 1504 G_FSHL 1505 }).lower(); 1506 1507 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1508 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1509 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1510 .unsupported(); 1511 1512 computeTables(); 1513 verify(*ST.getInstrInfo()); 1514 } 1515 1516 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1517 MachineInstr &MI) const { 1518 MachineIRBuilder &B = Helper.MIRBuilder; 1519 MachineRegisterInfo &MRI = *B.getMRI(); 1520 GISelChangeObserver &Observer = Helper.Observer; 1521 1522 switch (MI.getOpcode()) { 1523 case TargetOpcode::G_ADDRSPACE_CAST: 1524 return legalizeAddrSpaceCast(MI, MRI, B); 1525 case TargetOpcode::G_FRINT: 1526 return legalizeFrint(MI, MRI, B); 1527 case TargetOpcode::G_FCEIL: 1528 return legalizeFceil(MI, MRI, B); 1529 case TargetOpcode::G_INTRINSIC_TRUNC: 1530 return legalizeIntrinsicTrunc(MI, MRI, B); 1531 case TargetOpcode::G_SITOFP: 1532 return legalizeITOFP(MI, MRI, B, true); 1533 case TargetOpcode::G_UITOFP: 1534 return legalizeITOFP(MI, MRI, B, false); 1535 case TargetOpcode::G_FPTOSI: 1536 return legalizeFPTOI(MI, MRI, B, true); 1537 case TargetOpcode::G_FPTOUI: 1538 return legalizeFPTOI(MI, MRI, B, false); 1539 case TargetOpcode::G_FMINNUM: 1540 case TargetOpcode::G_FMAXNUM: 1541 case TargetOpcode::G_FMINNUM_IEEE: 1542 case TargetOpcode::G_FMAXNUM_IEEE: 1543 return legalizeMinNumMaxNum(Helper, MI); 1544 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1545 return legalizeExtractVectorElt(MI, MRI, B); 1546 case TargetOpcode::G_INSERT_VECTOR_ELT: 1547 return legalizeInsertVectorElt(MI, MRI, B); 1548 case TargetOpcode::G_SHUFFLE_VECTOR: 1549 return legalizeShuffleVector(MI, MRI, B); 1550 case TargetOpcode::G_FSIN: 1551 case TargetOpcode::G_FCOS: 1552 return legalizeSinCos(MI, MRI, B); 1553 case TargetOpcode::G_GLOBAL_VALUE: 1554 return legalizeGlobalValue(MI, MRI, B); 1555 case TargetOpcode::G_LOAD: 1556 return legalizeLoad(MI, MRI, B, Observer); 1557 case TargetOpcode::G_FMAD: 1558 return legalizeFMad(MI, MRI, B); 1559 case TargetOpcode::G_FDIV: 1560 return legalizeFDIV(MI, MRI, B); 1561 case TargetOpcode::G_UDIV: 1562 case TargetOpcode::G_UREM: 1563 return legalizeUDIV_UREM(MI, MRI, B); 1564 case TargetOpcode::G_SDIV: 1565 case TargetOpcode::G_SREM: 1566 return legalizeSDIV_SREM(MI, MRI, B); 1567 case TargetOpcode::G_ATOMIC_CMPXCHG: 1568 return legalizeAtomicCmpXChg(MI, MRI, B); 1569 case TargetOpcode::G_FLOG: 1570 return legalizeFlog(MI, B, numbers::ln2f); 1571 case TargetOpcode::G_FLOG10: 1572 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1573 case TargetOpcode::G_FEXP: 1574 return legalizeFExp(MI, B); 1575 case TargetOpcode::G_FPOW: 1576 return legalizeFPow(MI, B); 1577 case TargetOpcode::G_FFLOOR: 1578 return legalizeFFloor(MI, MRI, B); 1579 case TargetOpcode::G_BUILD_VECTOR: 1580 return legalizeBuildVector(MI, MRI, B); 1581 default: 1582 return false; 1583 } 1584 1585 
llvm_unreachable("expected switch to return"); 1586 } 1587 1588 Register AMDGPULegalizerInfo::getSegmentAperture( 1589 unsigned AS, 1590 MachineRegisterInfo &MRI, 1591 MachineIRBuilder &B) const { 1592 MachineFunction &MF = B.getMF(); 1593 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1594 const LLT S32 = LLT::scalar(32); 1595 1596 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1597 1598 if (ST.hasApertureRegs()) { 1599 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1600 // getreg. 1601 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1602 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1603 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1604 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1605 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1606 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1607 unsigned Encoding = 1608 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1609 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1610 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1611 1612 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1613 1614 B.buildInstr(AMDGPU::S_GETREG_B32) 1615 .addDef(GetReg) 1616 .addImm(Encoding); 1617 MRI.setType(GetReg, S32); 1618 1619 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1620 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1621 } 1622 1623 Register QueuePtr = MRI.createGenericVirtualRegister( 1624 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1625 1626 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1627 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1628 return Register(); 1629 1630 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1631 // private_segment_aperture_base_hi. 1632 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1633 1634 // TODO: can we be smarter about machine pointer info? 1635 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1636 MachineMemOperand *MMO = MF.getMachineMemOperand( 1637 PtrInfo, 1638 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1639 MachineMemOperand::MOInvariant, 1640 4, commonAlignment(Align(64), StructOffset)); 1641 1642 Register LoadAddr; 1643 1644 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1645 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1646 } 1647 1648 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1649 MachineInstr &MI, MachineRegisterInfo &MRI, 1650 MachineIRBuilder &B) const { 1651 MachineFunction &MF = B.getMF(); 1652 1653 const LLT S32 = LLT::scalar(32); 1654 Register Dst = MI.getOperand(0).getReg(); 1655 Register Src = MI.getOperand(1).getReg(); 1656 1657 LLT DstTy = MRI.getType(Dst); 1658 LLT SrcTy = MRI.getType(Src); 1659 unsigned DestAS = DstTy.getAddressSpace(); 1660 unsigned SrcAS = SrcTy.getAddressSpace(); 1661 1662 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1663 // vector element. 1664 assert(!DstTy.isVector()); 1665 1666 const AMDGPUTargetMachine &TM 1667 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1668 1669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1670 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1671 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1672 return true; 1673 } 1674 1675 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1676 // Truncate. 
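    // The 32-bit constant address space pointer is just the low 32 bits of
    // the 64-bit pointer, so extracting at bit offset 0 is sufficient.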
1677 B.buildExtract(Dst, Src, 0); 1678 MI.eraseFromParent(); 1679 return true; 1680 } 1681 1682 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1683 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1684 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1685 1686 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1687 // another. Merge operands are required to be the same type, but creating an 1688 // extra ptrtoint would be kind of pointless. 1689 auto HighAddr = B.buildConstant( 1690 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1691 B.buildMerge(Dst, {Src, HighAddr}); 1692 MI.eraseFromParent(); 1693 return true; 1694 } 1695 1696 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1697 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1698 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1699 unsigned NullVal = TM.getNullPointerValue(DestAS); 1700 1701 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1702 auto FlatNull = B.buildConstant(SrcTy, 0); 1703 1704 // Extract low 32-bits of the pointer. 1705 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1706 1707 auto CmpRes = 1708 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1709 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1710 1711 MI.eraseFromParent(); 1712 return true; 1713 } 1714 1715 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1716 return false; 1717 1718 if (!ST.hasFlatAddressSpace()) 1719 return false; 1720 1721 auto SegmentNull = 1722 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1723 auto FlatNull = 1724 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1725 1726 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1727 if (!ApertureReg.isValid()) 1728 return false; 1729 1730 auto CmpRes = 1731 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1732 1733 // Coerce the type of the low half of the result so we can use merge_values. 1734 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1735 1736 // TODO: Should we allow mismatched types but matching sizes in merges to 1737 // avoid the ptrtoint? 1738 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1739 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1740 1741 MI.eraseFromParent(); 1742 return true; 1743 } 1744 1745 bool AMDGPULegalizerInfo::legalizeFrint( 1746 MachineInstr &MI, MachineRegisterInfo &MRI, 1747 MachineIRBuilder &B) const { 1748 Register Src = MI.getOperand(1).getReg(); 1749 LLT Ty = MRI.getType(Src); 1750 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1751 1752 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1753 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1754 1755 auto C1 = B.buildFConstant(Ty, C1Val); 1756 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1757 1758 // TODO: Should this propagate fast-math-flags? 
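  // Adding a copy-signed 2^52 and then subtracting it rounds to the nearest
  // integer, since a double of magnitude 2^52 has no fraction bits left.
  // Inputs whose magnitude exceeds C2 (just below 2^52) are already integral,
  // so the select below returns them unchanged.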
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
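  // SignBit64 is a signed zero carrying only the sign of the input. Shifting
  // FractMask right by the unbiased exponent leaves set bits exactly where the
  // fractional part lives, so clearing those bits in Src truncates toward
  // zero; exponents below 0 select the signed zero, and exponents above 51
  // mean Src is already integral.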
1845 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1846 1847 auto Shr = B.buildAShr(S64, FractMask, Exp); 1848 auto Not = B.buildNot(S64, Shr); 1849 auto Tmp0 = B.buildAnd(S64, Src, Not); 1850 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1851 1852 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1853 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1854 1855 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1856 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1857 MI.eraseFromParent(); 1858 return true; 1859 } 1860 1861 bool AMDGPULegalizerInfo::legalizeITOFP( 1862 MachineInstr &MI, MachineRegisterInfo &MRI, 1863 MachineIRBuilder &B, bool Signed) const { 1864 1865 Register Dst = MI.getOperand(0).getReg(); 1866 Register Src = MI.getOperand(1).getReg(); 1867 1868 const LLT S64 = LLT::scalar(64); 1869 const LLT S32 = LLT::scalar(32); 1870 1871 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1872 1873 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1874 1875 auto CvtHi = Signed ? 1876 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1877 B.buildUITOFP(S64, Unmerge.getReg(1)); 1878 1879 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1880 1881 auto ThirtyTwo = B.buildConstant(S32, 32); 1882 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1883 .addUse(CvtHi.getReg(0)) 1884 .addUse(ThirtyTwo.getReg(0)); 1885 1886 // TODO: Should this propagate fast-math-flags? 1887 B.buildFAdd(Dst, LdExp, CvtLo); 1888 MI.eraseFromParent(); 1889 return true; 1890 } 1891 1892 // TODO: Copied from DAG implementation. Verify logic and document how this 1893 // actually works. 1894 bool AMDGPULegalizerInfo::legalizeFPTOI( 1895 MachineInstr &MI, MachineRegisterInfo &MRI, 1896 MachineIRBuilder &B, bool Signed) const { 1897 1898 Register Dst = MI.getOperand(0).getReg(); 1899 Register Src = MI.getOperand(1).getReg(); 1900 1901 const LLT S64 = LLT::scalar(64); 1902 const LLT S32 = LLT::scalar(32); 1903 1904 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1905 1906 unsigned Flags = MI.getFlags(); 1907 1908 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1909 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1910 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1911 1912 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1913 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1914 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1915 1916 auto Hi = Signed ? 
1917 B.buildFPTOSI(S32, FloorMul) : 1918 B.buildFPTOUI(S32, FloorMul); 1919 auto Lo = B.buildFPTOUI(S32, Fma); 1920 1921 B.buildMerge(Dst, { Lo, Hi }); 1922 MI.eraseFromParent(); 1923 1924 return true; 1925 } 1926 1927 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1928 MachineInstr &MI) const { 1929 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1930 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1931 1932 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1933 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1934 1935 // With ieee_mode disabled, the instructions have the correct behavior 1936 // already for G_FMINNUM/G_FMAXNUM 1937 if (!MFI->getMode().IEEE) 1938 return !IsIEEEOp; 1939 1940 if (IsIEEEOp) 1941 return true; 1942 1943 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 // TODO: Should move some of this into LegalizerHelper. 1950 1951 // TODO: Promote dynamic indexing of s16 to s32 1952 1953 // FIXME: Artifact combiner probably should have replaced the truncated 1954 // constant before this, so we shouldn't need 1955 // getConstantVRegValWithLookThrough. 1956 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1957 MI.getOperand(2).getReg(), MRI); 1958 if (!IdxVal) // Dynamic case will be selected to register indexing. 1959 return true; 1960 1961 Register Dst = MI.getOperand(0).getReg(); 1962 Register Vec = MI.getOperand(1).getReg(); 1963 1964 LLT VecTy = MRI.getType(Vec); 1965 LLT EltTy = VecTy.getElementType(); 1966 assert(EltTy == MRI.getType(Dst)); 1967 1968 if (IdxVal->Value < VecTy.getNumElements()) 1969 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1970 else 1971 B.buildUndef(Dst); 1972 1973 MI.eraseFromParent(); 1974 return true; 1975 } 1976 1977 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1978 MachineInstr &MI, MachineRegisterInfo &MRI, 1979 MachineIRBuilder &B) const { 1980 // TODO: Should move some of this into LegalizerHelper. 1981 1982 // TODO: Promote dynamic indexing of s16 to s32 1983 1984 // FIXME: Artifact combiner probably should have replaced the truncated 1985 // constant before this, so we shouldn't need 1986 // getConstantVRegValWithLookThrough. 1987 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1988 MI.getOperand(3).getReg(), MRI); 1989 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1990 return true; 1991 1992 Register Dst = MI.getOperand(0).getReg(); 1993 Register Vec = MI.getOperand(1).getReg(); 1994 Register Ins = MI.getOperand(2).getReg(); 1995 1996 LLT VecTy = MRI.getType(Vec); 1997 LLT EltTy = VecTy.getElementType(); 1998 assert(EltTy == MRI.getType(Ins)); 1999 2000 if (IdxVal->Value < VecTy.getNumElements()) 2001 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2002 else 2003 B.buildUndef(Dst); 2004 2005 MI.eraseFromParent(); 2006 return true; 2007 } 2008 2009 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2010 MachineInstr &MI, MachineRegisterInfo &MRI, 2011 MachineIRBuilder &B) const { 2012 const LLT V2S16 = LLT::vector(2, 16); 2013 2014 Register Dst = MI.getOperand(0).getReg(); 2015 Register Src0 = MI.getOperand(1).getReg(); 2016 LLT DstTy = MRI.getType(Dst); 2017 LLT SrcTy = MRI.getType(Src0); 2018 2019 if (SrcTy == V2S16 && DstTy == V2S16 && 2020 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2021 return true; 2022 2023 MachineIRBuilder HelperBuilder(MI); 2024 GISelObserverWrapper DummyObserver; 2025 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2026 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2027 } 2028 2029 bool AMDGPULegalizerInfo::legalizeSinCos( 2030 MachineInstr &MI, MachineRegisterInfo &MRI, 2031 MachineIRBuilder &B) const { 2032 2033 Register DstReg = MI.getOperand(0).getReg(); 2034 Register SrcReg = MI.getOperand(1).getReg(); 2035 LLT Ty = MRI.getType(DstReg); 2036 unsigned Flags = MI.getFlags(); 2037 2038 Register TrigVal; 2039 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2040 if (ST.hasTrigReducedRange()) { 2041 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2042 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2043 .addUse(MulVal.getReg(0)) 2044 .setMIFlags(Flags).getReg(0); 2045 } else 2046 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2047 2048 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2049 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2050 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2051 .addUse(TrigVal) 2052 .setMIFlags(Flags); 2053 MI.eraseFromParent(); 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2058 MachineIRBuilder &B, 2059 const GlobalValue *GV, 2060 int64_t Offset, 2061 unsigned GAFlags) const { 2062 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2063 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2064 // to the following code sequence: 2065 // 2066 // For constant address space: 2067 // s_getpc_b64 s[0:1] 2068 // s_add_u32 s0, s0, $symbol 2069 // s_addc_u32 s1, s1, 0 2070 // 2071 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2072 // a fixup or relocation is emitted to replace $symbol with a literal 2073 // constant, which is a pc-relative offset from the encoding of the $symbol 2074 // operand to the global variable. 
2075 // 2076 // For global address space: 2077 // s_getpc_b64 s[0:1] 2078 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2079 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2080 // 2081 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2082 // fixups or relocations are emitted to replace $symbol@*@lo and 2083 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2084 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2085 // operand to the global variable. 2086 // 2087 // What we want here is an offset from the value returned by s_getpc 2088 // (which is the address of the s_add_u32 instruction) to the global 2089 // variable, but since the encoding of $symbol starts 4 bytes after the start 2090 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2091 // small. This requires us to add 4 to the global variable offset in order to 2092 // compute the correct address. 2093 2094 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2095 2096 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2097 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2098 2099 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2100 .addDef(PCReg); 2101 2102 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2103 if (GAFlags == SIInstrInfo::MO_NONE) 2104 MIB.addImm(0); 2105 else 2106 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2107 2108 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2109 2110 if (PtrTy.getSizeInBits() == 32) 2111 B.buildExtract(DstReg, PCReg, 0); 2112 return true; 2113 } 2114 2115 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2116 MachineInstr &MI, MachineRegisterInfo &MRI, 2117 MachineIRBuilder &B) const { 2118 Register DstReg = MI.getOperand(0).getReg(); 2119 LLT Ty = MRI.getType(DstReg); 2120 unsigned AS = Ty.getAddressSpace(); 2121 2122 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2123 MachineFunction &MF = B.getMF(); 2124 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2125 2126 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2127 if (!MFI->isEntryFunction()) { 2128 const Function &Fn = MF.getFunction(); 2129 DiagnosticInfoUnsupported BadLDSDecl( 2130 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2131 DS_Warning); 2132 Fn.getContext().diagnose(BadLDSDecl); 2133 2134 // We currently don't have a way to correctly allocate LDS objects that 2135 // aren't directly associated with a kernel. We do force inlining of 2136 // functions that use local objects. However, if these dead functions are 2137 // not eliminated, we don't want a compile time error. Just emit a warning 2138 // and a trap, since there should be no callable path here. 2139 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2140 B.buildUndef(DstReg); 2141 MI.eraseFromParent(); 2142 return true; 2143 } 2144 2145 // TODO: We could emit code to handle the initialization somewhere. 
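    // A global without a meaningful initializer can be assigned a fixed
    // offset in the kernel's LDS block now (see allocateLDSGlobal below), so
    // its address folds to a plain constant unless the target opts to keep the
    // symbolic address (MO_ABS32_LO).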
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
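  // Keep G_FMAD legal only when denormals are flushed for the result type
  // (the mad/mac instructions flush denormals); otherwise expand it to a
  // separate multiply and add via the generic lowering below.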
2225 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2226 return true; 2227 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2228 return true; 2229 2230 MachineIRBuilder HelperBuilder(MI); 2231 GISelObserverWrapper DummyObserver; 2232 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2233 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2234 } 2235 2236 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2237 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2238 Register DstReg = MI.getOperand(0).getReg(); 2239 Register PtrReg = MI.getOperand(1).getReg(); 2240 Register CmpVal = MI.getOperand(2).getReg(); 2241 Register NewVal = MI.getOperand(3).getReg(); 2242 2243 assert(SITargetLowering::isFlatGlobalAddrSpace( 2244 MRI.getType(PtrReg).getAddressSpace()) && 2245 "this should not have been custom lowered"); 2246 2247 LLT ValTy = MRI.getType(CmpVal); 2248 LLT VecTy = LLT::vector(2, ValTy); 2249 2250 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2251 2252 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2253 .addDef(DstReg) 2254 .addUse(PtrReg) 2255 .addUse(PackedVal) 2256 .setMemRefs(MI.memoperands()); 2257 2258 MI.eraseFromParent(); 2259 return true; 2260 } 2261 2262 bool AMDGPULegalizerInfo::legalizeFlog( 2263 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2264 Register Dst = MI.getOperand(0).getReg(); 2265 Register Src = MI.getOperand(1).getReg(); 2266 LLT Ty = B.getMRI()->getType(Dst); 2267 unsigned Flags = MI.getFlags(); 2268 2269 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2270 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2271 2272 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2273 MI.eraseFromParent(); 2274 return true; 2275 } 2276 2277 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2278 MachineIRBuilder &B) const { 2279 Register Dst = MI.getOperand(0).getReg(); 2280 Register Src = MI.getOperand(1).getReg(); 2281 unsigned Flags = MI.getFlags(); 2282 LLT Ty = B.getMRI()->getType(Dst); 2283 2284 auto K = B.buildFConstant(Ty, numbers::log2e); 2285 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2286 B.buildFExp2(Dst, Mul, Flags); 2287 MI.eraseFromParent(); 2288 return true; 2289 } 2290 2291 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2292 MachineIRBuilder &B) const { 2293 Register Dst = MI.getOperand(0).getReg(); 2294 Register Src0 = MI.getOperand(1).getReg(); 2295 Register Src1 = MI.getOperand(2).getReg(); 2296 unsigned Flags = MI.getFlags(); 2297 LLT Ty = B.getMRI()->getType(Dst); 2298 const LLT S16 = LLT::scalar(16); 2299 const LLT S32 = LLT::scalar(32); 2300 2301 if (Ty == S32) { 2302 auto Log = B.buildFLog2(S32, Src0, Flags); 2303 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2304 .addUse(Log.getReg(0)) 2305 .addUse(Src1) 2306 .setMIFlags(Flags); 2307 B.buildFExp2(Dst, Mul, Flags); 2308 } else if (Ty == S16) { 2309 // There's no f16 fmul_legacy, so we need to convert for it. 
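    // As in the f32 path, pow(x, y) is expanded as exp2(y * log2(x)); the
    // multiply is done in f32 through fmul_legacy and truncated back to f16.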
2310 auto Log = B.buildFLog2(S16, Src0, Flags); 2311 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2312 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2313 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2314 .addUse(Ext0.getReg(0)) 2315 .addUse(Ext1.getReg(0)) 2316 .setMIFlags(Flags); 2317 2318 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2319 } else 2320 return false; 2321 2322 MI.eraseFromParent(); 2323 return true; 2324 } 2325 2326 // Find a source register, ignoring any possible source modifiers. 2327 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2328 Register ModSrc = OrigSrc; 2329 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2330 ModSrc = SrcFNeg->getOperand(1).getReg(); 2331 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2332 ModSrc = SrcFAbs->getOperand(1).getReg(); 2333 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2334 ModSrc = SrcFAbs->getOperand(1).getReg(); 2335 return ModSrc; 2336 } 2337 2338 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2339 MachineRegisterInfo &MRI, 2340 MachineIRBuilder &B) const { 2341 2342 const LLT S1 = LLT::scalar(1); 2343 const LLT S64 = LLT::scalar(64); 2344 Register Dst = MI.getOperand(0).getReg(); 2345 Register OrigSrc = MI.getOperand(1).getReg(); 2346 unsigned Flags = MI.getFlags(); 2347 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2348 "this should not have been custom lowered"); 2349 2350 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2351 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2352 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2353 // V_FRACT bug is: 2354 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2355 // 2356 // Convert floor(x) to (x - fract(x)) 2357 2358 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2359 .addUse(OrigSrc) 2360 .setMIFlags(Flags); 2361 2362 // Give source modifier matching some assistance before obscuring a foldable 2363 // pattern. 2364 2365 // TODO: We can avoid the neg on the fract? The input sign to fract 2366 // shouldn't matter? 2367 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2368 2369 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2370 2371 Register Min = MRI.createGenericVirtualRegister(S64); 2372 2373 // We don't need to concern ourselves with the snan handling difference, so 2374 // use the one which will directly select. 2375 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2376 if (MFI->getMode().IEEE) 2377 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2378 else 2379 B.buildFMinNum(Min, Fract, Const, Flags); 2380 2381 Register CorrectedFract = Min; 2382 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2383 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2384 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2385 } 2386 2387 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2388 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2389 2390 MI.eraseFromParent(); 2391 return true; 2392 } 2393 2394 // Turn an illegal packed v2s16 build vector into bit operations. 2395 // TODO: This should probably be a bitcast action in LegalizerHelper. 
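// The two s16 sources are merged into a single s32, which has the same bit
// layout as the <2 x s16> result, and then bitcast to the destination type.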
2396 bool AMDGPULegalizerInfo::legalizeBuildVector( 2397 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2398 Register Dst = MI.getOperand(0).getReg(); 2399 const LLT S32 = LLT::scalar(32); 2400 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2401 2402 Register Src0 = MI.getOperand(1).getReg(); 2403 Register Src1 = MI.getOperand(2).getReg(); 2404 assert(MRI.getType(Src0) == LLT::scalar(16)); 2405 2406 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2407 B.buildBitcast(Dst, Merge); 2408 2409 MI.eraseFromParent(); 2410 return true; 2411 } 2412 2413 // Return the use branch instruction, otherwise null if the usage is invalid. 2414 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2415 MachineRegisterInfo &MRI, 2416 MachineInstr *&Br, 2417 MachineBasicBlock *&UncondBrTarget) { 2418 Register CondDef = MI.getOperand(0).getReg(); 2419 if (!MRI.hasOneNonDBGUse(CondDef)) 2420 return nullptr; 2421 2422 MachineBasicBlock *Parent = MI.getParent(); 2423 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2424 if (UseMI.getParent() != Parent || 2425 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2426 return nullptr; 2427 2428 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2429 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2430 if (Next == Parent->end()) { 2431 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2432 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2433 return nullptr; 2434 UncondBrTarget = &*NextMBB; 2435 } else { 2436 if (Next->getOpcode() != AMDGPU::G_BR) 2437 return nullptr; 2438 Br = &*Next; 2439 UncondBrTarget = Br->getOperand(0).getMBB(); 2440 } 2441 2442 return &UseMI; 2443 } 2444 2445 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2446 MachineRegisterInfo &MRI, 2447 Register LiveIn, 2448 Register PhyReg) const { 2449 assert(PhyReg.isPhysical() && "Physical register expected"); 2450 2451 // Insert the live-in copy, if required, by defining destination virtual 2452 // register. 2453 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
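  // Only materialize the copy in the entry block if the live-in vreg has no
  // def yet; later queries for the same physical register reuse it.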
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  LLT ArgTy;
  std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
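    // A masked argument is packed together with other values in one register;
    // extract the field by shifting it down to bit 0 and masking off the
    // remaining high bits.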
2522 const LLT S32 = LLT::scalar(32); 2523 const unsigned Mask = Arg->getMask(); 2524 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2525 2526 Register AndMaskSrc = LiveIn; 2527 2528 if (Shift != 0) { 2529 auto ShiftAmt = B.buildConstant(S32, Shift); 2530 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2531 } 2532 2533 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2534 } else { 2535 B.buildCopy(DstReg, LiveIn); 2536 } 2537 2538 return true; 2539 } 2540 2541 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2542 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2543 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2544 2545 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2546 if (!Arg) 2547 return false; 2548 2549 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2550 return false; 2551 2552 MI.eraseFromParent(); 2553 return true; 2554 } 2555 2556 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2557 MachineRegisterInfo &MRI, 2558 MachineIRBuilder &B) const { 2559 Register Dst = MI.getOperand(0).getReg(); 2560 LLT DstTy = MRI.getType(Dst); 2561 LLT S16 = LLT::scalar(16); 2562 LLT S32 = LLT::scalar(32); 2563 LLT S64 = LLT::scalar(64); 2564 2565 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2566 return true; 2567 2568 if (DstTy == S16) 2569 return legalizeFDIV16(MI, MRI, B); 2570 if (DstTy == S32) 2571 return legalizeFDIV32(MI, MRI, B); 2572 if (DstTy == S64) 2573 return legalizeFDIV64(MI, MRI, B); 2574 2575 return false; 2576 } 2577 2578 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2579 Register DstReg, 2580 Register X, 2581 Register Y, 2582 bool IsDiv) const { 2583 const LLT S1 = LLT::scalar(1); 2584 const LLT S32 = LLT::scalar(32); 2585 2586 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2587 // algorithm used here. 2588 2589 // Initial estimate of inv(y). 2590 auto FloatY = B.buildUITOFP(S32, Y); 2591 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2592 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2593 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2594 auto Z = B.buildFPTOUI(S32, ScaledY); 2595 2596 // One round of UNR. 2597 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2598 auto NegYZ = B.buildMul(S32, NegY, Z); 2599 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2600 2601 // Quotient/remainder estimate. 2602 auto Q = B.buildUMulH(S32, X, Z); 2603 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2604 2605 // First quotient/remainder refinement. 2606 auto One = B.buildConstant(S32, 1); 2607 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2608 if (IsDiv) 2609 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2610 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2611 2612 // Second quotient/remainder refinement. 
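  // The remainder may still be no smaller than the divisor after the first
  // fixup, so the same correction is applied once more to reach the exact
  // quotient/remainder.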
2613 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2614 if (IsDiv) 2615 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2616 else 2617 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2618 } 2619 2620 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2621 MachineRegisterInfo &MRI, 2622 MachineIRBuilder &B) const { 2623 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2624 Register DstReg = MI.getOperand(0).getReg(); 2625 Register Num = MI.getOperand(1).getReg(); 2626 Register Den = MI.getOperand(2).getReg(); 2627 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2628 MI.eraseFromParent(); 2629 return true; 2630 } 2631 2632 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2633 // 2634 // Return lo, hi of result 2635 // 2636 // %cvt.lo = G_UITOFP Val.lo 2637 // %cvt.hi = G_UITOFP Val.hi 2638 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2639 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2640 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2641 // %mul2 = G_FMUL %mul1, 2**(-32) 2642 // %trunc = G_INTRINSIC_TRUNC %mul2 2643 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2644 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2645 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2646 Register Val) { 2647 const LLT S32 = LLT::scalar(32); 2648 auto Unmerge = B.buildUnmerge(S32, Val); 2649 2650 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2651 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2652 2653 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2654 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2655 2656 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2657 auto Mul1 = 2658 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2659 2660 // 2**(-32) 2661 auto Mul2 = 2662 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2663 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2664 2665 // -(2**32) 2666 auto Mad2 = B.buildFMAD(S32, Trunc, 2667 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2668 2669 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2670 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2671 2672 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2673 } 2674 2675 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2676 Register DstReg, 2677 Register Numer, 2678 Register Denom, 2679 bool IsDiv) const { 2680 const LLT S32 = LLT::scalar(32); 2681 const LLT S64 = LLT::scalar(64); 2682 const LLT S1 = LLT::scalar(1); 2683 Register RcpLo, RcpHi; 2684 2685 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2686 2687 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2688 2689 auto Zero64 = B.buildConstant(S64, 0); 2690 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2691 2692 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2693 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2694 2695 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2696 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2697 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2698 2699 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2700 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2701 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2702 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2703 2704 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2705 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2706 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2707 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2708 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2709 2710 auto Zero32 = 
B.buildConstant(S32, 0); 2711 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2712 auto Add2_HiC = 2713 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2714 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2715 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2716 2717 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2718 Register NumerLo = UnmergeNumer.getReg(0); 2719 Register NumerHi = UnmergeNumer.getReg(1); 2720 2721 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2722 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2723 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2724 Register Mul3_Lo = UnmergeMul3.getReg(0); 2725 Register Mul3_Hi = UnmergeMul3.getReg(1); 2726 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2727 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2728 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2729 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2730 2731 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2732 Register DenomLo = UnmergeDenom.getReg(0); 2733 Register DenomHi = UnmergeDenom.getReg(1); 2734 2735 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2736 auto C1 = B.buildSExt(S32, CmpHi); 2737 2738 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2739 auto C2 = B.buildSExt(S32, CmpLo); 2740 2741 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2742 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2743 2744 // TODO: Here and below portions of the code can be enclosed into if/endif. 2745 // Currently control flow is unconditional and we have 4 selects after 2746 // potential endif to substitute PHIs. 2747 2748 // if C3 != 0 ... 2749 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2750 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2751 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2752 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2753 2754 auto One64 = B.buildConstant(S64, 1); 2755 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2756 2757 auto C4 = 2758 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2759 auto C5 = 2760 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2761 auto C6 = B.buildSelect( 2762 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2763 2764 // if (C6 != 0) 2765 auto Add4 = B.buildAdd(S64, Add3, One64); 2766 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2767 2768 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2769 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2770 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2771 2772 // endif C6 2773 // endif C3 2774 2775 if (IsDiv) { 2776 auto Sel1 = B.buildSelect( 2777 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2778 B.buildSelect(DstReg, 2779 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2780 } else { 2781 auto Sel2 = B.buildSelect( 2782 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2783 B.buildSelect(DstReg, 2784 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2785 } 2786 } 2787 2788 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2789 MachineRegisterInfo &MRI, 2790 MachineIRBuilder &B) const { 2791 const LLT S64 = LLT::scalar(64); 2792 const LLT S32 = LLT::scalar(32); 2793 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2794 Register DstReg = MI.getOperand(0).getReg(); 2795 Register Num 
= MI.getOperand(1).getReg(); 2796 Register Den = MI.getOperand(2).getReg(); 2797 LLT Ty = MRI.getType(DstReg); 2798 2799 if (Ty == S32) 2800 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2801 else if (Ty == S64) 2802 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2803 else 2804 return false; 2805 2806 MI.eraseFromParent(); 2807 return true; 2808 2809 } 2810 2811 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2812 MachineRegisterInfo &MRI, 2813 MachineIRBuilder &B) const { 2814 const LLT S64 = LLT::scalar(64); 2815 const LLT S32 = LLT::scalar(32); 2816 2817 Register DstReg = MI.getOperand(0).getReg(); 2818 const LLT Ty = MRI.getType(DstReg); 2819 if (Ty != S32 && Ty != S64) 2820 return false; 2821 2822 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2823 2824 Register LHS = MI.getOperand(1).getReg(); 2825 Register RHS = MI.getOperand(2).getReg(); 2826 2827 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2828 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2829 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2830 2831 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2832 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2833 2834 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2835 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2836 2837 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2838 if (Ty == S32) 2839 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2840 else 2841 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2842 2843 Register Sign; 2844 if (IsDiv) 2845 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2846 else 2847 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2848 2849 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2850 B.buildSub(DstReg, UDivRem, Sign); 2851 2852 MI.eraseFromParent(); 2853 return true; 2854 } 2855 2856 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2857 MachineRegisterInfo &MRI, 2858 MachineIRBuilder &B) const { 2859 Register Res = MI.getOperand(0).getReg(); 2860 Register LHS = MI.getOperand(1).getReg(); 2861 Register RHS = MI.getOperand(2).getReg(); 2862 2863 uint16_t Flags = MI.getFlags(); 2864 2865 LLT ResTy = MRI.getType(Res); 2866 LLT S32 = LLT::scalar(32); 2867 LLT S64 = LLT::scalar(64); 2868 2869 const MachineFunction &MF = B.getMF(); 2870 bool Unsafe = 2871 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2872 2873 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2874 return false; 2875 2876 if (!Unsafe && ResTy == S32 && 2877 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2878 return false; 2879 2880 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2881 // 1 / x -> RCP(x) 2882 if (CLHS->isExactlyValue(1.0)) { 2883 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2884 .addUse(RHS) 2885 .setMIFlags(Flags); 2886 2887 MI.eraseFromParent(); 2888 return true; 2889 } 2890 2891 // -1 / x -> RCP( FNEG(x) ) 2892 if (CLHS->isExactlyValue(-1.0)) { 2893 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2894 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2895 .addUse(FNeg.getReg(0)) 2896 .setMIFlags(Flags); 2897 2898 MI.eraseFromParent(); 2899 return true; 2900 } 2901 } 2902 2903 // x / y -> x * (1.0 / y) 2904 if (Unsafe) { 2905 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2906 .addUse(RHS) 2907 .setMIFlags(Flags); 2908 B.buildFMul(Res, LHS, RCP, Flags); 2909 2910 MI.eraseFromParent(); 2911 return true; 2912 } 2913 2914 return false; 2915 } 2916 2917 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2918 MachineRegisterInfo &MRI, 2919 MachineIRBuilder &B) const { 2920 Register Res = MI.getOperand(0).getReg(); 2921 Register LHS = MI.getOperand(1).getReg(); 2922 Register RHS = MI.getOperand(2).getReg(); 2923 2924 uint16_t Flags = MI.getFlags(); 2925 2926 LLT S16 = LLT::scalar(16); 2927 LLT S32 = LLT::scalar(32); 2928 2929 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2930 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2931 2932 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2933 .addUse(RHSExt.getReg(0)) 2934 .setMIFlags(Flags); 2935 2936 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2937 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2938 2939 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2940 .addUse(RDst.getReg(0)) 2941 .addUse(RHS) 2942 .addUse(LHS) 2943 .setMIFlags(Flags); 2944 2945 MI.eraseFromParent(); 2946 return true; 2947 } 2948 2949 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2950 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2951 static void toggleSPDenormMode(bool Enable, 2952 MachineIRBuilder &B, 2953 const GCNSubtarget &ST, 2954 AMDGPU::SIModeRegisterDefaults Mode) { 2955 // Set SP denorm mode to this value. 2956 unsigned SPDenormMode = 2957 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2958 2959 if (ST.hasDenormModeInst()) { 2960 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2961 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2962 2963 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2964 B.buildInstr(AMDGPU::S_DENORM_MODE) 2965 .addImm(NewDenormModeValue); 2966 2967 } else { 2968 // Select FP32 bit field in mode register. 2969 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2970 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2971 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2972 2973 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2974 .addImm(SPDenormMode) 2975 .addImm(SPDenormModeBitField); 2976 } 2977 } 2978 2979 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2980 MachineRegisterInfo &MRI, 2981 MachineIRBuilder &B) const { 2982 Register Res = MI.getOperand(0).getReg(); 2983 Register LHS = MI.getOperand(1).getReg(); 2984 Register RHS = MI.getOperand(2).getReg(); 2985 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2986 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2987 2988 uint16_t Flags = MI.getFlags(); 2989 2990 LLT S32 = LLT::scalar(32); 2991 LLT S1 = LLT::scalar(1); 2992 2993 auto One = B.buildFConstant(S32, 1.0f); 2994 2995 auto DenominatorScaled = 2996 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2997 .addUse(LHS) 2998 .addUse(RHS) 2999 .addImm(0) 3000 .setMIFlags(Flags); 3001 auto NumeratorScaled = 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3003 .addUse(LHS) 3004 .addUse(RHS) 3005 .addImm(1) 3006 .setMIFlags(Flags); 3007 3008 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3009 .addUse(DenominatorScaled.getReg(0)) 3010 .setMIFlags(Flags); 3011 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3012 3013 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3014 // aren't modeled as reading it. 
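  // The FMA-based refinement below relies on FP32 denormals being available;
  // if the function's float mode normally flushes them, denormal handling is
  // switched on just around this sequence.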
3015 if (!Mode.allFP32Denormals()) 3016 toggleSPDenormMode(true, B, ST, Mode); 3017 3018 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3019 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3020 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3021 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3022 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3023 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3024 3025 if (!Mode.allFP32Denormals()) 3026 toggleSPDenormMode(false, B, ST, Mode); 3027 3028 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3029 .addUse(Fma4.getReg(0)) 3030 .addUse(Fma1.getReg(0)) 3031 .addUse(Fma3.getReg(0)) 3032 .addUse(NumeratorScaled.getReg(1)) 3033 .setMIFlags(Flags); 3034 3035 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3036 .addUse(Fmas.getReg(0)) 3037 .addUse(RHS) 3038 .addUse(LHS) 3039 .setMIFlags(Flags); 3040 3041 MI.eraseFromParent(); 3042 return true; 3043 } 3044 3045 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3046 MachineRegisterInfo &MRI, 3047 MachineIRBuilder &B) const { 3048 Register Res = MI.getOperand(0).getReg(); 3049 Register LHS = MI.getOperand(1).getReg(); 3050 Register RHS = MI.getOperand(2).getReg(); 3051 3052 uint16_t Flags = MI.getFlags(); 3053 3054 LLT S64 = LLT::scalar(64); 3055 LLT S1 = LLT::scalar(1); 3056 3057 auto One = B.buildFConstant(S64, 1.0); 3058 3059 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3060 .addUse(LHS) 3061 .addUse(RHS) 3062 .addImm(0) 3063 .setMIFlags(Flags); 3064 3065 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3066 3067 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3068 .addUse(DivScale0.getReg(0)) 3069 .setMIFlags(Flags); 3070 3071 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3072 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3073 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3074 3075 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3076 .addUse(LHS) 3077 .addUse(RHS) 3078 .addImm(1) 3079 .setMIFlags(Flags); 3080 3081 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3082 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3083 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3084 3085 Register Scale; 3086 if (!ST.hasUsableDivScaleConditionOutput()) { 3087 // Workaround a hardware bug on SI where the condition output from div_scale 3088 // is not usable. 
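    // Recreate the flag manually: compare the high 32 bits of each div_scale
    // result against its corresponding source operand and xor the two tests.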
3089 3090 LLT S32 = LLT::scalar(32); 3091 3092 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3093 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3094 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3095 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3096 3097 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3098 Scale1Unmerge.getReg(1)); 3099 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3100 Scale0Unmerge.getReg(1)); 3101 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3102 } else { 3103 Scale = DivScale1.getReg(1); 3104 } 3105 3106 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3107 .addUse(Fma4.getReg(0)) 3108 .addUse(Fma3.getReg(0)) 3109 .addUse(Mul.getReg(0)) 3110 .addUse(Scale) 3111 .setMIFlags(Flags); 3112 3113 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3114 .addUse(Fmas.getReg(0)) 3115 .addUse(RHS) 3116 .addUse(LHS) 3117 .setMIFlags(Flags); 3118 3119 MI.eraseFromParent(); 3120 return true; 3121 } 3122 3123 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3124 MachineRegisterInfo &MRI, 3125 MachineIRBuilder &B) const { 3126 Register Res = MI.getOperand(0).getReg(); 3127 Register LHS = MI.getOperand(2).getReg(); 3128 Register RHS = MI.getOperand(3).getReg(); 3129 uint16_t Flags = MI.getFlags(); 3130 3131 LLT S32 = LLT::scalar(32); 3132 LLT S1 = LLT::scalar(1); 3133 3134 auto Abs = B.buildFAbs(S32, RHS, Flags); 3135 const APFloat C0Val(1.0f); 3136 3137 auto C0 = B.buildConstant(S32, 0x6f800000); 3138 auto C1 = B.buildConstant(S32, 0x2f800000); 3139 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3140 3141 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3142 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3143 3144 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3145 3146 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3147 .addUse(Mul0.getReg(0)) 3148 .setMIFlags(Flags); 3149 3150 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3151 3152 B.buildFMul(Res, Sel, Mul1, Flags); 3153 3154 MI.eraseFromParent(); 3155 return true; 3156 } 3157 3158 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3159 MachineRegisterInfo &MRI, 3160 MachineIRBuilder &B) const { 3161 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3162 uint64_t Offset = 3163 ST.getTargetLowering()->getImplicitParameterOffset( 3164 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3165 LLT DstTy = MRI.getType(DstReg); 3166 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3167 3168 const ArgDescriptor *Arg; 3169 const TargetRegisterClass *RC; 3170 LLT ArgTy; 3171 std::tie(Arg, RC, ArgTy) = 3172 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3173 if (!Arg) 3174 return false; 3175 3176 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3177 if (!loadInputValue(KernargPtrReg, B, Arg)) 3178 return false; 3179 3180 // FIXME: This should be nuw 3181 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3182 return true; 3183 } 3184 3185 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3186 MachineRegisterInfo &MRI, 3187 MachineIRBuilder &B) const { 3188 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3189 if (!MFI->isEntryFunction()) { 3190 return legalizePreloadedArgIntrin(MI, MRI, B, 3191 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3192 } 3193 3194 Register DstReg = MI.getOperand(0).getReg(); 3195 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3196 return false; 3197 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3203 MachineRegisterInfo &MRI, 3204 MachineIRBuilder &B, 3205 unsigned AddrSpace) const { 3206 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3207 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3208 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3209 MI.eraseFromParent(); 3210 return true; 3211 } 3212 3213 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3214 // offset (the offset that is included in bounds checking and swizzling, to be 3215 // split between the instruction's voffset and immoffset fields) and soffset 3216 // (the offset that is excluded from bounds checking and swizzling, to go in 3217 // the instruction's soffset field). This function takes the first kind of 3218 // offset and figures out how to split it between voffset and immoffset. 3219 std::tuple<Register, unsigned, unsigned> 3220 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3221 Register OrigOffset) const { 3222 const unsigned MaxImm = 4095; 3223 Register BaseReg; 3224 unsigned TotalConstOffset; 3225 MachineInstr *OffsetDef; 3226 const LLT S32 = LLT::scalar(32); 3227 3228 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3229 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3230 3231 unsigned ImmOffset = TotalConstOffset; 3232 3233 // If the immediate value is too big for the immoffset field, put the value 3234 // and -4096 into the immoffset field so that the value that is copied/added 3235 // for the voffset field is a multiple of 4096, and it stands more chance 3236 // of being CSEd with the copy/add for another similar load/store. 3237 // However, do not do that rounding down to a multiple of 4096 if that is a 3238 // negative number, as it appears to be illegal to have a negative offset 3239 // in the vgpr, even if adding the immediate offset makes it positive. 3240 unsigned Overflow = ImmOffset & ~MaxImm; 3241 ImmOffset -= Overflow; 3242 if ((int32_t)Overflow < 0) { 3243 Overflow += ImmOffset; 3244 ImmOffset = 0; 3245 } 3246 3247 if (Overflow != 0) { 3248 if (!BaseReg) { 3249 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3250 } else { 3251 auto OverflowVal = B.buildConstant(S32, Overflow); 3252 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3253 } 3254 } 3255 3256 if (!BaseReg) 3257 BaseReg = B.buildConstant(S32, 0).getReg(0); 3258 3259 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3260 } 3261 3262 /// Handle register layout difference for f16 images for some subtargets. 
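/// On subtargets with unpacked D16 VMEM, each 16-bit element occupies the low
/// half of its own 32-bit register, so the packed source vector is unmerged
/// and every element is any-extended to 32 bits before being rebuilt.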
3263 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3264 MachineRegisterInfo &MRI, 3265 Register Reg) const { 3266 if (!ST.hasUnpackedD16VMem()) 3267 return Reg; 3268 3269 const LLT S16 = LLT::scalar(16); 3270 const LLT S32 = LLT::scalar(32); 3271 LLT StoreVT = MRI.getType(Reg); 3272 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3273 3274 auto Unmerge = B.buildUnmerge(S16, Reg); 3275 3276 SmallVector<Register, 4> WideRegs; 3277 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3278 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3279 3280 int NumElts = StoreVT.getNumElements(); 3281 3282 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3283 } 3284 3285 Register AMDGPULegalizerInfo::fixStoreSourceType( 3286 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3287 MachineRegisterInfo *MRI = B.getMRI(); 3288 LLT Ty = MRI->getType(VData); 3289 3290 const LLT S16 = LLT::scalar(16); 3291 3292 // Fixup illegal register types for i8 stores. 3293 if (Ty == LLT::scalar(8) || Ty == S16) { 3294 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3295 return AnyExt; 3296 } 3297 3298 if (Ty.isVector()) { 3299 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3300 if (IsFormat) 3301 return handleD16VData(B, *MRI, VData); 3302 } 3303 } 3304 3305 return VData; 3306 } 3307 3308 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3309 MachineRegisterInfo &MRI, 3310 MachineIRBuilder &B, 3311 bool IsTyped, 3312 bool IsFormat) const { 3313 Register VData = MI.getOperand(1).getReg(); 3314 LLT Ty = MRI.getType(VData); 3315 LLT EltTy = Ty.getScalarType(); 3316 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3317 const LLT S32 = LLT::scalar(32); 3318 3319 VData = fixStoreSourceType(B, VData, IsFormat); 3320 Register RSrc = MI.getOperand(2).getReg(); 3321 3322 MachineMemOperand *MMO = *MI.memoperands_begin(); 3323 const int MemSize = MMO->getSize(); 3324 3325 unsigned ImmOffset; 3326 unsigned TotalOffset; 3327 3328 // The typed intrinsics add an immediate after the registers. 3329 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3330 3331 // The struct intrinsic variants add one additional operand over raw. 3332 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3333 Register VIndex; 3334 int OpOffset = 0; 3335 if (HasVIndex) { 3336 VIndex = MI.getOperand(3).getReg(); 3337 OpOffset = 1; 3338 } 3339 3340 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3341 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3342 3343 unsigned Format = 0; 3344 if (IsTyped) { 3345 Format = MI.getOperand(5 + OpOffset).getImm(); 3346 ++OpOffset; 3347 } 3348 3349 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3350 3351 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3352 if (TotalOffset != 0) 3353 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3354 3355 unsigned Opc; 3356 if (IsTyped) { 3357 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3358 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3359 } else if (IsFormat) { 3360 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3361 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3362 } else { 3363 switch (MemSize) { 3364 case 1: 3365 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3366 break; 3367 case 2: 3368 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3369 break; 3370 default: 3371 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3372 break; 3373 } 3374 } 3375 3376 if (!VIndex) 3377 VIndex = B.buildConstant(S32, 0).getReg(0); 3378 3379 auto MIB = B.buildInstr(Opc) 3380 .addUse(VData) // vdata 3381 .addUse(RSrc) // rsrc 3382 .addUse(VIndex) // vindex 3383 .addUse(VOffset) // voffset 3384 .addUse(SOffset) // soffset 3385 .addImm(ImmOffset); // offset(imm) 3386 3387 if (IsTyped) 3388 MIB.addImm(Format); 3389 3390 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3391 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3392 .addMemOperand(MMO); 3393 3394 MI.eraseFromParent(); 3395 return true; 3396 } 3397 3398 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3399 MachineRegisterInfo &MRI, 3400 MachineIRBuilder &B, 3401 bool IsFormat, 3402 bool IsTyped) const { 3403 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3404 MachineMemOperand *MMO = *MI.memoperands_begin(); 3405 const int MemSize = MMO->getSize(); 3406 const LLT S32 = LLT::scalar(32); 3407 3408 Register Dst = MI.getOperand(0).getReg(); 3409 Register RSrc = MI.getOperand(2).getReg(); 3410 3411 // The typed intrinsics add an immediate after the registers. 3412 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3413 3414 // The struct intrinsic variants add one additional operand over raw. 3415 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3416 Register VIndex; 3417 int OpOffset = 0; 3418 if (HasVIndex) { 3419 VIndex = MI.getOperand(3).getReg(); 3420 OpOffset = 1; 3421 } 3422 3423 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3424 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3425 3426 unsigned Format = 0; 3427 if (IsTyped) { 3428 Format = MI.getOperand(5 + OpOffset).getImm(); 3429 ++OpOffset; 3430 } 3431 3432 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3433 unsigned ImmOffset; 3434 unsigned TotalOffset; 3435 3436 LLT Ty = MRI.getType(Dst); 3437 LLT EltTy = Ty.getScalarType(); 3438 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3439 const bool Unpacked = ST.hasUnpackedD16VMem(); 3440 3441 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3442 if (TotalOffset != 0) 3443 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3444 3445 unsigned Opc; 3446 3447 if (IsTyped) { 3448 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3449 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3450 } else if (IsFormat) { 3451 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3452           AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3453   } else {
3454     switch (MemSize) {
3455     case 1:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3457       break;
3458     case 2:
3459       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3460       break;
3461     default:
3462       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3463       break;
3464     }
3465   }
3466 
3467   Register LoadDstReg;
3468 
3469   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3470   LLT UnpackedTy = Ty.changeElementSize(32);
3471 
3472   if (IsExtLoad)
3473     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3474   else if (Unpacked && IsD16 && Ty.isVector())
3475     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3476   else
3477     LoadDstReg = Dst;
3478 
3479   if (!VIndex)
3480     VIndex = B.buildConstant(S32, 0).getReg(0);
3481 
3482   auto MIB = B.buildInstr(Opc)
3483     .addDef(LoadDstReg)         // vdata
3484     .addUse(RSrc)               // rsrc
3485     .addUse(VIndex)             // vindex
3486     .addUse(VOffset)            // voffset
3487     .addUse(SOffset)            // soffset
3488     .addImm(ImmOffset);         // offset(imm)
3489 
3490   if (IsTyped)
3491     MIB.addImm(Format);
3492 
3493   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3494      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3495      .addMemOperand(MMO);
3496 
3497   if (LoadDstReg != Dst) {
3498     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3499 
3500     // The result for an extending load was widened; truncate it back.
3501     if (IsExtLoad)
3502       B.buildTrunc(Dst, LoadDstReg);
3503     else {
3504       // Repack to original 16-bit vector result
3505       // FIXME: G_TRUNC should work, but legalization currently fails
3506       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3507       SmallVector<Register, 4> Repack;
3508       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3509         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3510       B.buildMerge(Dst, Repack);
3511     }
3512   }
3513 
3514   MI.eraseFromParent();
3515   return true;
3516 }
3517 
3518 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3519                                                MachineIRBuilder &B,
3520                                                bool IsInc) const {
3521   unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3522 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3523 B.buildInstr(Opc) 3524 .addDef(MI.getOperand(0).getReg()) 3525 .addUse(MI.getOperand(2).getReg()) 3526 .addUse(MI.getOperand(3).getReg()) 3527 .cloneMemRefs(MI); 3528 MI.eraseFromParent(); 3529 return true; 3530 } 3531 3532 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3533 switch (IntrID) { 3534 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3535 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3536 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3537 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3538 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3539 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3540 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3541 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3542 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3543 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3544 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3545 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3546 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3547 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3548 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3549 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3550 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3551 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3552 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3553 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3554 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3555 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3556 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3557 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3558 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3559 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3560 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3561 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3562 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3563 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3564 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3565 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3566 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3567 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3568 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3569 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3570 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3571 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3572 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3573 default: 3574 llvm_unreachable("unhandled atomic opcode"); 3575 } 3576 } 3577 3578 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3579 MachineIRBuilder &B, 3580 Intrinsic::ID IID) const { 3581 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3582 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3583 3584 Register Dst = MI.getOperand(0).getReg(); 3585 Register VData = MI.getOperand(2).getReg(); 3586 3587 Register CmpVal; 3588 int OpOffset = 0; 3589 3590 if (IsCmpSwap) { 3591 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3592 ++OpOffset; 3593 } 3594 3595 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3596 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3597 3598 // The struct intrinsic variants add one additional operand over raw. 
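// For the struct forms the extra operand is vindex, which appears immediately
// before voffset; OpOffset shifts the remaining operand indices accordingly.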
3599 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3600 Register VIndex; 3601 if (HasVIndex) { 3602 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3603 ++OpOffset; 3604 } 3605 3606 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3607 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3608 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3609 3610 MachineMemOperand *MMO = *MI.memoperands_begin(); 3611 3612 unsigned ImmOffset; 3613 unsigned TotalOffset; 3614 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3615 if (TotalOffset != 0) 3616 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3617 3618 if (!VIndex) 3619 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3620 3621 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3622 .addDef(Dst) 3623 .addUse(VData); // vdata 3624 3625 if (IsCmpSwap) 3626 MIB.addReg(CmpVal); 3627 3628 MIB.addUse(RSrc) // rsrc 3629 .addUse(VIndex) // vindex 3630 .addUse(VOffset) // voffset 3631 .addUse(SOffset) // soffset 3632 .addImm(ImmOffset) // offset(imm) 3633 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3634 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3635 .addMemOperand(MMO); 3636 3637 MI.eraseFromParent(); 3638 return true; 3639 } 3640 3641 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3642 /// vector with s16 typed elements. 3643 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3644 SmallVectorImpl<Register> &PackedAddrs, 3645 int AddrIdx, int DimIdx, int EndIdx, 3646 int NumGradients) { 3647 const LLT S16 = LLT::scalar(16); 3648 const LLT V2S16 = LLT::vector(2, 16); 3649 3650 for (int I = AddrIdx; I < EndIdx; ++I) { 3651 MachineOperand &SrcOp = MI.getOperand(I); 3652 if (!SrcOp.isReg()) 3653 continue; // _L to _LZ may have eliminated this. 3654 3655 Register AddrReg = SrcOp.getReg(); 3656 3657 if (I < DimIdx) { 3658 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3659 PackedAddrs.push_back(AddrReg); 3660 } else { 3661 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3662 // derivatives dx/dh and dx/dv are packed with undef. 3663 if (((I + 1) >= EndIdx) || 3664 ((NumGradients / 2) % 2 == 1 && 3665 (I == DimIdx + (NumGradients / 2) - 1 || 3666 I == DimIdx + NumGradients - 1)) || 3667 // Check for _L to _LZ optimization 3668 !MI.getOperand(I + 1).isReg()) { 3669 PackedAddrs.push_back( 3670 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3671 .getReg(0)); 3672 } else { 3673 PackedAddrs.push_back( 3674 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3675 .getReg(0)); 3676 ++I; 3677 } 3678 } 3679 } 3680 } 3681 3682 /// Convert from separate vaddr components to a single vector address register, 3683 /// and replace the remaining operands with $noreg. 3684 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3685 int DimIdx, int NumVAddrs) { 3686 const LLT S32 = LLT::scalar(32); 3687 3688 SmallVector<Register, 8> AddrRegs; 3689 for (int I = 0; I != NumVAddrs; ++I) { 3690 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3691 if (SrcOp.isReg()) { 3692 AddrRegs.push_back(SrcOp.getReg()); 3693 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3694 } 3695 } 3696 3697 int NumAddrRegs = AddrRegs.size(); 3698 if (NumAddrRegs != 1) { 3699 // Round up to 8 elements for v5-v7 3700 // FIXME: Missing intermediate sized register classes and instructions. 
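// For example, 5, 6, or 7 address dwords are padded with undef values up to
// the next power of two (8) before the build_vector below is formed.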
3701     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3702       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3703       auto Undef = B.buildUndef(S32);
3704       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3705       NumAddrRegs = RoundedNumRegs;
3706     }
3707 
3708     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3709     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3710   }
3711 
3712   for (int I = 1; I != NumVAddrs; ++I) {
3713     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3714     if (SrcOp.isReg())
3715       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3716   }
3717 }
3718 
3719 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3720 ///
3721 /// Depending on the subtarget, load/store with 16-bit element data need to be
3722 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3723 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3724 /// registers.
3725 ///
3726 /// We don't want to directly select image instructions just yet, but also want
3727 /// to expose all register repacking to the legalizer/combiners. We also don't
3728 /// want a selected instruction entering RegBankSelect. In order to avoid
3729 /// defining a multitude of intermediate image instructions, directly hack on
3730 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3731 /// now unnecessary arguments with $noreg.
3732 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3733     MachineInstr &MI, MachineIRBuilder &B,
3734     GISelChangeObserver &Observer,
3735     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3736 
3737   const int NumDefs = MI.getNumExplicitDefs();
3738   bool IsTFE = NumDefs == 2;
3739   // We are only processing the operands of d16 image operations on subtargets
3740   // that use the unpacked register layout, or need to repack the TFE result.
3741 
3742   // TODO: Do we need to guard against already legalized intrinsics?
3743   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3744       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3745 
3746   MachineRegisterInfo *MRI = B.getMRI();
3747   const LLT S32 = LLT::scalar(32);
3748   const LLT S16 = LLT::scalar(16);
3749   const LLT V2S16 = LLT::vector(2, 16);
3750 
3751   // Index of first address argument
3752   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3753 
3754   int NumVAddrs, NumGradients;
3755   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3756   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3757     getDMaskIdx(BaseOpcode, NumDefs);
3758   unsigned DMask = 0;
3759 
3760   // Check for 16 bit addresses and pack if true.
3761   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3762   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3763   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3764   const bool IsG16 = GradTy == S16;
3765   const bool IsA16 = AddrTy == S16;
3766 
3767   int DMaskLanes = 0;
3768   if (!BaseOpcode->Atomic) {
3769     DMask = MI.getOperand(DMaskIdx).getImm();
3770     if (BaseOpcode->Gather4) {
3771       DMaskLanes = 4;
3772     } else if (DMask != 0) {
3773       DMaskLanes = countPopulation(DMask);
3774     } else if (!IsTFE && !BaseOpcode->Store) {
3775       // If dmask is 0, this is a no-op load. This can be eliminated.
3776       B.buildUndef(MI.getOperand(0));
3777       MI.eraseFromParent();
3778       return true;
3779     }
3780   }
3781 
3782   Observer.changingInstr(MI);
3783   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3784 
3785   unsigned NewOpcode = NumDefs == 0 ?
3786     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3787 
3788   // Track that we legalized this
3789   MI.setDesc(B.getTII().get(NewOpcode));
3790 
3791   // Expecting to get an error flag since TFC is on and dmask is 0. Force
3792   // dmask to be at least 1, otherwise the instruction will fail.
3793   if (IsTFE && DMask == 0) {
3794     DMask = 0x1;
3795     DMaskLanes = 1;
3796     MI.getOperand(DMaskIdx).setImm(DMask);
3797   }
3798 
3799   if (BaseOpcode->Atomic) {
3800     Register VData0 = MI.getOperand(2).getReg();
3801     LLT Ty = MRI->getType(VData0);
3802 
3803     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3804     if (Ty.isVector())
3805       return false;
3806 
3807     if (BaseOpcode->AtomicX2) {
3808       Register VData1 = MI.getOperand(3).getReg();
3809       // The two values are packed in one register.
3810       LLT PackedTy = LLT::vector(2, Ty);
3811       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3812       MI.getOperand(2).setReg(Concat.getReg(0));
3813       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3814     }
3815   }
3816 
3817   int CorrectedNumVAddrs = NumVAddrs;
3818 
3819   // Optimize _L to _LZ when _L is zero
3820   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3821       AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3822     const ConstantFP *ConstantLod;
3823     const int LodIdx = AddrIdx + NumVAddrs - 1;
3824 
3825     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3826       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3827         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3828         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3829           LZMappingInfo->LZ, ImageDimIntr->Dim);
3830 
3831         // The starting indexes should remain in the same place.
3832         --NumVAddrs;
3833         --CorrectedNumVAddrs;
3834 
3835         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3836           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3837         MI.RemoveOperand(LodIdx);
3838       }
3839     }
3840   }
3841 
3842   // Optimize _mip away when 'lod' is zero
3843   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3844     int64_t ConstantLod;
3845     const int LodIdx = AddrIdx + NumVAddrs - 1;
3846 
3847     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3848       if (ConstantLod == 0) {
3849         // TODO: Change intrinsic opcode and remove operand instead of replacing
3850         // it with 0, as the _L to _LZ handling is done above.
3851         MI.getOperand(LodIdx).ChangeToImmediate(0);
3852         --CorrectedNumVAddrs;
3853       }
3854     }
3855   }
3856 
3857   // Rewrite the addressing register layout before doing anything else.
3858   if (IsA16 || IsG16) {
3859     if (IsA16) {
3860       // Target must support the feature and gradients need to be 16 bit too
3861       if (!ST.hasA16() || !IsG16)
3862         return false;
3863     } else if (!ST.hasG16())
3864       return false;
3865 
3866     if (NumVAddrs > 1) {
3867       SmallVector<Register, 4> PackedRegs;
3868       // Don't compress addresses for G16
3869       const int PackEndIdx =
3870         IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3871 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3872 PackEndIdx, NumGradients); 3873 3874 if (!IsA16) { 3875 // Add uncompressed address 3876 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3877 int AddrReg = MI.getOperand(I).getReg(); 3878 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3879 PackedRegs.push_back(AddrReg); 3880 } 3881 } 3882 3883 // See also below in the non-a16 branch 3884 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3885 3886 if (!UseNSA && PackedRegs.size() > 1) { 3887 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3888 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3889 PackedRegs[0] = Concat.getReg(0); 3890 PackedRegs.resize(1); 3891 } 3892 3893 const int NumPacked = PackedRegs.size(); 3894 for (int I = 0; I != NumVAddrs; ++I) { 3895 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3896 if (!SrcOp.isReg()) { 3897 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3898 continue; 3899 } 3900 3901 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3902 3903 if (I < NumPacked) 3904 SrcOp.setReg(PackedRegs[I]); 3905 else 3906 SrcOp.setReg(AMDGPU::NoRegister); 3907 } 3908 } 3909 } else { 3910 // If the register allocator cannot place the address registers contiguously 3911 // without introducing moves, then using the non-sequential address encoding 3912 // is always preferable, since it saves VALU instructions and is usually a 3913 // wash in terms of code size or even better. 3914 // 3915 // However, we currently have no way of hinting to the register allocator 3916 // that MIMG addresses should be placed contiguously when it is possible to 3917 // do so, so force non-NSA for the common 2-address case as a heuristic. 3918 // 3919 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3920 // allocation when possible. 3921 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3922 3923 if (!UseNSA && NumVAddrs > 1) 3924 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3925 } 3926 3927 int Flags = 0; 3928 if (IsA16) 3929 Flags |= 1; 3930 if (IsG16) 3931 Flags |= 2; 3932 MI.addOperand(MachineOperand::CreateImm(Flags)); 3933 3934 if (BaseOpcode->Store) { // No TFE for stores? 3935 // TODO: Handle dmask trim 3936 Register VData = MI.getOperand(1).getReg(); 3937 LLT Ty = MRI->getType(VData); 3938 if (!Ty.isVector() || Ty.getElementType() != S16) 3939 return true; 3940 3941 Register RepackedReg = handleD16VData(B, *MRI, VData); 3942 if (RepackedReg != VData) { 3943 MI.getOperand(1).setReg(RepackedReg); 3944 } 3945 3946 return true; 3947 } 3948 3949 Register DstReg = MI.getOperand(0).getReg(); 3950 LLT Ty = MRI->getType(DstReg); 3951 const LLT EltTy = Ty.getScalarType(); 3952 const bool IsD16 = Ty.getScalarType() == S16; 3953 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3954 3955 // Confirm that the return type is large enough for the dmask specified 3956 if (NumElts < DMaskLanes) 3957 return false; 3958 3959 if (NumElts > 4 || DMaskLanes > 4) 3960 return false; 3961 3962 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3963 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3964 3965 // The raw dword aligned data component of the load. 
The only legal cases
3966   // where this matters should be when using the packed D16 format, for
3967   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3968   LLT RoundedTy;
3969 
3970   // S32 vector to cover all data, plus TFE result element.
3971   LLT TFETy;
3972 
3973   // Register type to use for each loaded component. Will be S32 or V2S16.
3974   LLT RegTy;
3975 
3976   if (IsD16 && ST.hasUnpackedD16VMem()) {
3977     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3978     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3979     RegTy = S32;
3980   } else {
3981     unsigned EltSize = EltTy.getSizeInBits();
3982     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3983     unsigned RoundedSize = 32 * RoundedElts;
3984     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3985     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3986     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3987   }
3988 
3989   // The return type does not need adjustment.
3990   // TODO: Should we change s16 case to s32 or <2 x s16>?
3991   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3992     return true;
3993 
3994   Register Dst1Reg;
3995 
3996   // Insert after the instruction.
3997   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3998 
3999   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4000   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4001   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4002   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4003 
4004   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4005 
4006   MI.getOperand(0).setReg(NewResultReg);
4007 
4008   // In the IR, TFE is supposed to be used with a 2 element struct return
4009   // type. The instruction really returns these two values in one contiguous
4010   // register, with one additional dword beyond the loaded data. Rewrite the
4011   // return type to use a single register result.
4012 
4013   if (IsTFE) {
4014     Dst1Reg = MI.getOperand(1).getReg();
4015     if (MRI->getType(Dst1Reg) != S32)
4016       return false;
4017 
4018     // TODO: Make sure the TFE operand bit is set.
4019     MI.RemoveOperand(1);
4020 
4021     // Handle the easy case that requires no repack instructions.
4022     if (Ty == S32) {
4023       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4024       return true;
4025     }
4026   }
4027 
4028   // Now figure out how to copy the new result register back into the old
4029   // result.
4030   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4031 
4032   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4033 
4034   if (ResultNumRegs == 1) {
4035     assert(!IsTFE);
4036     ResultRegs[0] = NewResultReg;
4037   } else {
4038     // We have to repack into a new vector of some kind.
4039     for (int I = 0; I != NumDataRegs; ++I)
4040       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4041     B.buildUnmerge(ResultRegs, NewResultReg);
4042 
4043     // Drop the final TFE element to get the data part. The TFE result is
4044     // directly written to the right place already.
4045     if (IsTFE)
4046       ResultRegs.resize(NumDataRegs);
4047   }
4048 
4049   // For an s16 scalar result, we form an s32 result with a truncate regardless
4050   // of packed vs. unpacked.
4051   if (IsD16 && !Ty.isVector()) {
4052     B.buildTrunc(DstReg, ResultRegs[0]);
4053     return true;
4054   }
4055 
4056   // Avoid a build/concat_vector of 1 entry.
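// A single packed register already holds the entire <2 x s16> result, so a
// bitcast to the destination type is all that is needed.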
4057   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4058     B.buildBitcast(DstReg, ResultRegs[0]);
4059     return true;
4060   }
4061 
4062   assert(Ty.isVector());
4063 
4064   if (IsD16) {
4065     // For packed D16 results with TFE enabled, all the data components are
4066     // S32. Cast back to the expected type.
4067     //
4068     // TODO: We don't really need to load s32 elements. We would only need one
4069     // cast for the TFE result if a multiple of v2s16 was used.
4070     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4071       for (Register &Reg : ResultRegs)
4072         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4073     } else if (ST.hasUnpackedD16VMem()) {
4074       for (Register &Reg : ResultRegs)
4075         Reg = B.buildTrunc(S16, Reg).getReg(0);
4076     }
4077   }
4078 
4079   auto padWithUndef = [&](LLT Ty, int NumElts) {
4080     if (NumElts == 0)
4081       return;
4082     Register Undef = B.buildUndef(Ty).getReg(0);
4083     for (int I = 0; I != NumElts; ++I)
4084       ResultRegs.push_back(Undef);
4085   };
4086 
4087   // Pad out any elements eliminated due to the dmask.
4088   LLT ResTy = MRI->getType(ResultRegs[0]);
4089   if (!ResTy.isVector()) {
4090     padWithUndef(ResTy, NumElts - ResultRegs.size());
4091     B.buildBuildVector(DstReg, ResultRegs);
4092     return true;
4093   }
4094 
4095   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4096   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4097 
4098   // Deal with the one annoying legal case.
4099   const LLT V3S16 = LLT::vector(3, 16);
4100   if (Ty == V3S16) {
4101     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4102     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4103     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4104     return true;
4105   }
4106 
4107   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4108   B.buildConcatVectors(DstReg, ResultRegs);
4109   return true;
4110 }
4111 
4112 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4113     MachineInstr &MI, MachineIRBuilder &B,
4114     GISelChangeObserver &Observer) const {
4115   Register Dst = MI.getOperand(0).getReg();
4116   LLT Ty = B.getMRI()->getType(Dst);
4117   unsigned Size = Ty.getSizeInBits();
4118   MachineFunction &MF = B.getMF();
4119 
4120   Observer.changingInstr(MI);
4121 
4122   // FIXME: We don't really need this intermediate instruction. The intrinsic
4123   // should be fixed to have a memory operand. Since it's readnone, we're not
4124   // allowed to add one.
4125   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4126   MI.RemoveOperand(1); // Remove intrinsic ID
4127 
4128   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4129   // TODO: Should this use datalayout alignment?
4130   const unsigned MemSize = (Size + 7) / 8;
4131   const Align MemAlign(4);
4132   MachineMemOperand *MMO = MF.getMachineMemOperand(
4133       MachinePointerInfo(),
4134       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4135           MachineMemOperand::MOInvariant,
4136       MemSize, MemAlign);
4137   MI.addMemOperand(MF, MMO);
4138 
4139   // There are no 96-bit result scalar loads, but widening to 128-bit should
4140   // always be legal. We may need to restore this to a 96-bit result if it turns
4141   // out this needs to be converted to a vector load during RegBankSelect.
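// For example, an s96 or <3 x s32> result is widened here to 128 bits.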
4142   if (!isPowerOf2_32(Size)) {
4143     LegalizerHelper Helper(MF, *this, Observer, B);
4144 
4145     if (Ty.isVector())
4146       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4147     else
4148       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4149   }
4150 
4151   Observer.changedInstr(MI);
4152   return true;
4153 }
4154 
4155 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4156                                                 MachineRegisterInfo &MRI,
4157                                                 MachineIRBuilder &B) const {
4158   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4159   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4160       !ST.isTrapHandlerEnabled()) {
4161     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4162   } else {
4163     // Pass queue pointer to trap handler as input, and insert trap instruction
4164     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4165     const ArgDescriptor *Arg =
4166         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4167     if (!Arg)
4168       return false;
4169     MachineRegisterInfo &MRI = *B.getMRI();
4170     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4171     Register LiveIn = getLiveInRegister(
4172         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4173         /*InsertLiveInCopy=*/false);
4174     if (!loadInputValue(LiveIn, B, Arg))
4175       return false;
4176     B.buildCopy(SGPR01, LiveIn);
4177     B.buildInstr(AMDGPU::S_TRAP)
4178         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4179         .addReg(SGPR01, RegState::Implicit);
4180   }
4181 
4182   MI.eraseFromParent();
4183   return true;
4184 }
4185 
4186 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4187     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4188   // If this is a non-HSA path or the trap handler is disabled, report a warning
4189   // accordingly.
4190   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4191       !ST.isTrapHandlerEnabled()) {
4192     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4193                                      "debugtrap handler not supported",
4194                                      MI.getDebugLoc(), DS_Warning);
4195     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4196     Ctx.diagnose(NoTrap);
4197   } else {
4198     // Insert debug-trap instruction
4199     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4200   }
4201 
4202   MI.eraseFromParent();
4203   return true;
4204 }
4205 
4206 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4207                                             MachineInstr &MI) const {
4208   MachineIRBuilder &B = Helper.MIRBuilder;
4209   MachineRegisterInfo &MRI = *B.getMRI();
4210 
4211   // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
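// The G_BRCOND consuming the intrinsic's result is replaced with SI_IF,
// SI_ELSE, or SI_LOOP targeting the unconditional successor, and any existing
// G_BR is retargeted to the original conditional successor.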
4212 auto IntrID = MI.getIntrinsicID(); 4213 switch (IntrID) { 4214 case Intrinsic::amdgcn_if: 4215 case Intrinsic::amdgcn_else: { 4216 MachineInstr *Br = nullptr; 4217 MachineBasicBlock *UncondBrTarget = nullptr; 4218 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4219 const SIRegisterInfo *TRI 4220 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4221 4222 Register Def = MI.getOperand(1).getReg(); 4223 Register Use = MI.getOperand(3).getReg(); 4224 4225 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4226 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4227 if (IntrID == Intrinsic::amdgcn_if) { 4228 B.buildInstr(AMDGPU::SI_IF) 4229 .addDef(Def) 4230 .addUse(Use) 4231 .addMBB(UncondBrTarget); 4232 } else { 4233 B.buildInstr(AMDGPU::SI_ELSE) 4234 .addDef(Def) 4235 .addUse(Use) 4236 .addMBB(UncondBrTarget) 4237 .addImm(0); 4238 } 4239 4240 if (Br) { 4241 Br->getOperand(0).setMBB(CondBrTarget); 4242 } else { 4243 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4244 // since we're swapping branch targets it needs to be reinserted. 4245 // FIXME: IRTranslator should probably not do this 4246 B.buildBr(*CondBrTarget); 4247 } 4248 4249 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4250 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4251 MI.eraseFromParent(); 4252 BrCond->eraseFromParent(); 4253 return true; 4254 } 4255 4256 return false; 4257 } 4258 case Intrinsic::amdgcn_loop: { 4259 MachineInstr *Br = nullptr; 4260 MachineBasicBlock *UncondBrTarget = nullptr; 4261 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4262 const SIRegisterInfo *TRI 4263 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4264 4265 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4266 Register Reg = MI.getOperand(2).getReg(); 4267 4268 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4269 B.buildInstr(AMDGPU::SI_LOOP) 4270 .addUse(Reg) 4271 .addMBB(UncondBrTarget); 4272 4273 if (Br) 4274 Br->getOperand(0).setMBB(CondBrTarget); 4275 else 4276 B.buildBr(*CondBrTarget); 4277 4278 MI.eraseFromParent(); 4279 BrCond->eraseFromParent(); 4280 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4281 return true; 4282 } 4283 4284 return false; 4285 } 4286 case Intrinsic::amdgcn_kernarg_segment_ptr: 4287 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4288 // This only makes sense to call in a kernel, so just lower to null. 
4289 B.buildConstant(MI.getOperand(0).getReg(), 0); 4290 MI.eraseFromParent(); 4291 return true; 4292 } 4293 4294 return legalizePreloadedArgIntrin( 4295 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4296 case Intrinsic::amdgcn_implicitarg_ptr: 4297 return legalizeImplicitArgPtr(MI, MRI, B); 4298 case Intrinsic::amdgcn_workitem_id_x: 4299 return legalizePreloadedArgIntrin(MI, MRI, B, 4300 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4301 case Intrinsic::amdgcn_workitem_id_y: 4302 return legalizePreloadedArgIntrin(MI, MRI, B, 4303 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4304 case Intrinsic::amdgcn_workitem_id_z: 4305 return legalizePreloadedArgIntrin(MI, MRI, B, 4306 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4307 case Intrinsic::amdgcn_workgroup_id_x: 4308 return legalizePreloadedArgIntrin(MI, MRI, B, 4309 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4310 case Intrinsic::amdgcn_workgroup_id_y: 4311 return legalizePreloadedArgIntrin(MI, MRI, B, 4312 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4313 case Intrinsic::amdgcn_workgroup_id_z: 4314 return legalizePreloadedArgIntrin(MI, MRI, B, 4315 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4316 case Intrinsic::amdgcn_dispatch_ptr: 4317 return legalizePreloadedArgIntrin(MI, MRI, B, 4318 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4319 case Intrinsic::amdgcn_queue_ptr: 4320 return legalizePreloadedArgIntrin(MI, MRI, B, 4321 AMDGPUFunctionArgInfo::QUEUE_PTR); 4322 case Intrinsic::amdgcn_implicit_buffer_ptr: 4323 return legalizePreloadedArgIntrin( 4324 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4325 case Intrinsic::amdgcn_dispatch_id: 4326 return legalizePreloadedArgIntrin(MI, MRI, B, 4327 AMDGPUFunctionArgInfo::DISPATCH_ID); 4328 case Intrinsic::amdgcn_fdiv_fast: 4329 return legalizeFDIVFastIntrin(MI, MRI, B); 4330 case Intrinsic::amdgcn_is_shared: 4331 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4332 case Intrinsic::amdgcn_is_private: 4333 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4334 case Intrinsic::amdgcn_wavefrontsize: { 4335 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4336 MI.eraseFromParent(); 4337 return true; 4338 } 4339 case Intrinsic::amdgcn_s_buffer_load: 4340 return legalizeSBufferLoad(MI, B, Helper.Observer); 4341 case Intrinsic::amdgcn_raw_buffer_store: 4342 case Intrinsic::amdgcn_struct_buffer_store: 4343 return legalizeBufferStore(MI, MRI, B, false, false); 4344 case Intrinsic::amdgcn_raw_buffer_store_format: 4345 case Intrinsic::amdgcn_struct_buffer_store_format: 4346 return legalizeBufferStore(MI, MRI, B, false, true); 4347 case Intrinsic::amdgcn_raw_tbuffer_store: 4348 case Intrinsic::amdgcn_struct_tbuffer_store: 4349 return legalizeBufferStore(MI, MRI, B, true, true); 4350 case Intrinsic::amdgcn_raw_buffer_load: 4351 case Intrinsic::amdgcn_struct_buffer_load: 4352 return legalizeBufferLoad(MI, MRI, B, false, false); 4353 case Intrinsic::amdgcn_raw_buffer_load_format: 4354 case Intrinsic::amdgcn_struct_buffer_load_format: 4355 return legalizeBufferLoad(MI, MRI, B, true, false); 4356 case Intrinsic::amdgcn_raw_tbuffer_load: 4357 case Intrinsic::amdgcn_struct_tbuffer_load: 4358 return legalizeBufferLoad(MI, MRI, B, true, true); 4359 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4365 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4369 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4370 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4371 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4372 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4373 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4374 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4375 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4376 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4377 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4378 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4379 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4380 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4381 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4382 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4383 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4384 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4385 return legalizeBufferAtomic(MI, B, IntrID); 4386 case Intrinsic::amdgcn_atomic_inc: 4387 return legalizeAtomicIncDec(MI, B, true); 4388 case Intrinsic::amdgcn_atomic_dec: 4389 return legalizeAtomicIncDec(MI, B, false); 4390 case Intrinsic::trap: 4391 return legalizeTrapIntrinsic(MI, MRI, B); 4392 case Intrinsic::debugtrap: 4393 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4394 default: { 4395 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4396 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4397 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4398 return true; 4399 } 4400 } 4401 4402 return true; 4403 } 4404