//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
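// For illustration: a <3 x s8> input has Size = 24 and EltSize = 8, so
// NextMul32 = 1 and NewNumElts = (32 + 8 - 1) / 8 = 4 below, i.e. the type is
// widened to <4 x s8>, which exactly fills one 32-bit register.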
92 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 93 return [=](const LegalityQuery &Query) { 94 const LLT Ty = Query.Types[TypeIdx]; 95 96 const LLT EltTy = Ty.getElementType(); 97 const int Size = Ty.getSizeInBits(); 98 const int EltSize = EltTy.getSizeInBits(); 99 const int NextMul32 = (Size + 31) / 32; 100 101 assert(EltSize < 32); 102 103 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 104 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 105 }; 106 } 107 108 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 109 return [=](const LegalityQuery &Query) { 110 const LLT Ty = Query.Types[TypeIdx]; 111 unsigned Size = Ty.getSizeInBits(); 112 113 LLT CoercedTy; 114 if (Size < 32) { 115 // <2 x s8> -> s16 116 assert(Size == 16); 117 CoercedTy = LLT::scalar(16); 118 } else 119 CoercedTy = LLT::scalarOrVector(Size / 32, 32); 120 121 return std::make_pair(TypeIdx, CoercedTy); 122 }; 123 } 124 125 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 126 return [=](const LegalityQuery &Query) { 127 const LLT QueryTy = Query.Types[TypeIdx]; 128 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 129 }; 130 } 131 132 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 133 return [=](const LegalityQuery &Query) { 134 const LLT QueryTy = Query.Types[TypeIdx]; 135 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 136 }; 137 } 138 139 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 140 return [=](const LegalityQuery &Query) { 141 const LLT QueryTy = Query.Types[TypeIdx]; 142 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 143 }; 144 } 145 146 static bool isRegisterSize(unsigned Size) { 147 return Size % 32 == 0 && Size <= 1024; 148 } 149 150 static bool isRegisterVectorElementType(LLT EltTy) { 151 const int EltSize = EltTy.getSizeInBits(); 152 return EltSize == 16 || EltSize % 32 == 0; 153 } 154 155 static bool isRegisterVectorType(LLT Ty) { 156 const int EltSize = Ty.getElementType().getSizeInBits(); 157 return EltSize == 32 || EltSize == 64 || 158 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 159 EltSize == 128 || EltSize == 256; 160 } 161 162 static bool isRegisterType(LLT Ty) { 163 if (!isRegisterSize(Ty.getSizeInBits())) 164 return false; 165 166 if (Ty.isVector()) 167 return isRegisterVectorType(Ty); 168 169 return true; 170 } 171 172 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 173 // v2s16. 174 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 return isRegisterType(Query.Types[TypeIdx]); 177 }; 178 } 179 180 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 181 return [=](const LegalityQuery &Query) { 182 const LLT QueryTy = Query.Types[TypeIdx]; 183 if (!QueryTy.isVector()) 184 return false; 185 const LLT EltTy = QueryTy.getElementType(); 186 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 187 }; 188 } 189 190 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT Ty = Query.Types[TypeIdx]; 193 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 194 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 195 }; 196 } 197 198 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 199 // handle some operations by just promoting the register during 200 // selection. 
There are also d16 loads on GFX9+ which preserve the high bits. 201 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 202 bool IsLoad) { 203 switch (AS) { 204 case AMDGPUAS::PRIVATE_ADDRESS: 205 // FIXME: Private element size. 206 return 32; 207 case AMDGPUAS::LOCAL_ADDRESS: 208 return ST.useDS128() ? 128 : 64; 209 case AMDGPUAS::GLOBAL_ADDRESS: 210 case AMDGPUAS::CONSTANT_ADDRESS: 211 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 212 // Treat constant and global as identical. SMRD loads are sometimes usable for 213 // global loads (ideally constant address space should be eliminated) 214 // depending on the context. Legality cannot be context dependent, but 215 // RegBankSelect can split the load as necessary depending on the pointer 216 // register bank/uniformity and if the memory is invariant or not written in a 217 // kernel. 218 return IsLoad ? 512 : 128; 219 default: 220 // Flat addresses may contextually need to be split to 32-bit parts if they 221 // may alias scratch depending on the subtarget. 222 return 128; 223 } 224 } 225 226 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 227 const LegalityQuery &Query, 228 unsigned Opcode) { 229 const LLT Ty = Query.Types[0]; 230 231 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 232 const bool IsLoad = Opcode != AMDGPU::G_STORE; 233 234 unsigned RegSize = Ty.getSizeInBits(); 235 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 236 unsigned Align = Query.MMODescrs[0].AlignInBits; 237 unsigned AS = Query.Types[1].getAddressSpace(); 238 239 // All of these need to be custom lowered to cast the pointer operand. 240 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 241 return false; 242 243 // TODO: We should be able to widen loads if the alignment is high enough, but 244 // we also need to modify the memory access size. 245 #if 0 246 // Accept widening loads based on alignment. 247 if (IsLoad && MemSize < Size) 248 MemSize = std::max(MemSize, Align); 249 #endif 250 251 // Only 1-byte and 2-byte to 32-bit extloads are valid. 252 if (MemSize != RegSize && RegSize != 32) 253 return false; 254 255 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 256 return false; 257 258 switch (MemSize) { 259 case 8: 260 case 16: 261 case 32: 262 case 64: 263 case 128: 264 break; 265 case 96: 266 if (!ST.hasDwordx3LoadStores()) 267 return false; 268 break; 269 case 256: 270 case 512: 271 // These may contextually need to be broken down. 
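    // (e.g. a uniform, invariant 512-bit constant-space load may still be
    // selectable as one wide SMEM access, while the divergent case is split
    // into smaller pieces later; see the constant/global comment above.)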
272 break; 273 default: 274 return false; 275 } 276 277 assert(RegSize >= MemSize); 278 279 if (Align < MemSize) { 280 const SITargetLowering *TLI = ST.getTargetLowering(); 281 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 282 return false; 283 } 284 285 return true; 286 } 287 288 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 289 unsigned Opcode) { 290 const LLT Ty = Query.Types[0]; 291 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode); 292 } 293 294 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 295 const GCNTargetMachine &TM) 296 : ST(ST_) { 297 using namespace TargetOpcode; 298 299 auto GetAddrSpacePtr = [&TM](unsigned AS) { 300 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 301 }; 302 303 const LLT S1 = LLT::scalar(1); 304 const LLT S16 = LLT::scalar(16); 305 const LLT S32 = LLT::scalar(32); 306 const LLT S64 = LLT::scalar(64); 307 const LLT S128 = LLT::scalar(128); 308 const LLT S256 = LLT::scalar(256); 309 const LLT S512 = LLT::scalar(512); 310 const LLT S1024 = LLT::scalar(1024); 311 312 const LLT V2S16 = LLT::vector(2, 16); 313 const LLT V4S16 = LLT::vector(4, 16); 314 315 const LLT V2S32 = LLT::vector(2, 32); 316 const LLT V3S32 = LLT::vector(3, 32); 317 const LLT V4S32 = LLT::vector(4, 32); 318 const LLT V5S32 = LLT::vector(5, 32); 319 const LLT V6S32 = LLT::vector(6, 32); 320 const LLT V7S32 = LLT::vector(7, 32); 321 const LLT V8S32 = LLT::vector(8, 32); 322 const LLT V9S32 = LLT::vector(9, 32); 323 const LLT V10S32 = LLT::vector(10, 32); 324 const LLT V11S32 = LLT::vector(11, 32); 325 const LLT V12S32 = LLT::vector(12, 32); 326 const LLT V13S32 = LLT::vector(13, 32); 327 const LLT V14S32 = LLT::vector(14, 32); 328 const LLT V15S32 = LLT::vector(15, 32); 329 const LLT V16S32 = LLT::vector(16, 32); 330 const LLT V32S32 = LLT::vector(32, 32); 331 332 const LLT V2S64 = LLT::vector(2, 64); 333 const LLT V3S64 = LLT::vector(3, 64); 334 const LLT V4S64 = LLT::vector(4, 64); 335 const LLT V5S64 = LLT::vector(5, 64); 336 const LLT V6S64 = LLT::vector(6, 64); 337 const LLT V7S64 = LLT::vector(7, 64); 338 const LLT V8S64 = LLT::vector(8, 64); 339 const LLT V16S64 = LLT::vector(16, 64); 340 341 std::initializer_list<LLT> AllS32Vectors = 342 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 343 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 344 std::initializer_list<LLT> AllS64Vectors = 345 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 346 347 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 348 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 349 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 350 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 351 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 352 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 353 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 354 355 const LLT CodePtr = FlatPtr; 356 357 const std::initializer_list<LLT> AddrSpaces64 = { 358 GlobalPtr, ConstantPtr, FlatPtr 359 }; 360 361 const std::initializer_list<LLT> AddrSpaces32 = { 362 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 363 }; 364 365 const std::initializer_list<LLT> FPTypesBase = { 366 S32, S64 367 }; 368 369 const std::initializer_list<LLT> FPTypes16 = { 370 S32, S64, S16 371 }; 372 373 const std::initializer_list<LLT> FPTypesPK16 = { 374 S32, S64, S16, V2S16 375 }; 376 377 const LLT MinScalarFPTy = 
ST.has16BitInsts() ? S16 : S32; 378 379 setAction({G_BRCOND, S1}, Legal); // VCC branches 380 setAction({G_BRCOND, S32}, Legal); // SCC branches 381 382 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 383 // elements for v3s16 384 getActionDefinitionsBuilder(G_PHI) 385 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 386 .legalFor(AllS32Vectors) 387 .legalFor(AllS64Vectors) 388 .legalFor(AddrSpaces64) 389 .legalFor(AddrSpaces32) 390 .clampScalar(0, S32, S256) 391 .widenScalarToNextPow2(0, 32) 392 .clampMaxNumElements(0, S32, 16) 393 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 394 .legalIf(isPointer(0)); 395 396 if (ST.hasVOP3PInsts()) { 397 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 398 .legalFor({S32, S16, V2S16}) 399 .clampScalar(0, S16, S32) 400 .clampMaxNumElements(0, S16, 2) 401 .scalarize(0) 402 .widenScalarToNextPow2(0, 32); 403 } else if (ST.has16BitInsts()) { 404 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 405 .legalFor({S32, S16}) 406 .clampScalar(0, S16, S32) 407 .scalarize(0) 408 .widenScalarToNextPow2(0, 32); 409 } else { 410 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 411 .legalFor({S32}) 412 .clampScalar(0, S32, S32) 413 .scalarize(0); 414 } 415 416 // FIXME: Not really legal. Placeholder for custom lowering. 417 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 418 .customFor({S32, S64}) 419 .clampScalar(0, S32, S64) 420 .widenScalarToNextPow2(0, 32) 421 .scalarize(0); 422 423 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 424 .legalFor({S32}) 425 .clampScalar(0, S32, S32) 426 .scalarize(0); 427 428 // Report legal for any types we can handle anywhere. For the cases only legal 429 // on the SALU, RegBankSelect will be able to re-legalize. 430 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 431 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 432 .clampScalar(0, S32, S64) 433 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 434 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 435 .widenScalarToNextPow2(0) 436 .scalarize(0); 437 438 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 439 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 440 .legalFor({{S32, S1}, {S32, S32}}) 441 .minScalar(0, S32) 442 // TODO: .scalarize(0) 443 .lower(); 444 445 getActionDefinitionsBuilder(G_BITCAST) 446 // Don't worry about the size constraint. 447 .legalIf(all(isRegisterType(0), isRegisterType(1))) 448 .lower(); 449 450 451 getActionDefinitionsBuilder(G_CONSTANT) 452 .legalFor({S1, S32, S64, S16, GlobalPtr, 453 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 454 .clampScalar(0, S32, S64) 455 .widenScalarToNextPow2(0) 456 .legalIf(isPointer(0)); 457 458 getActionDefinitionsBuilder(G_FCONSTANT) 459 .legalFor({S32, S64, S16}) 460 .clampScalar(0, S16, S64); 461 462 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 463 .legalIf(isRegisterType(0)) 464 // s1 and s16 are special cases because they have legal operations on 465 // them, but don't really occupy registers in the normal way. 466 .legalFor({S1, S16}) 467 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 468 .clampScalarOrElt(0, S32, S1024) 469 .widenScalarToNextPow2(0, 32) 470 .clampMaxNumElements(0, S32, 16); 471 472 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 473 474 // If the amount is divergent, we have to do a wave reduction to get the 475 // maximum value, so this is expanded during RegBankSelect. 
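  // (The stack pointer is per-wave, so a lane-varying allocation size has to
  // be reduced to a single wave-wide maximum before the SP can be bumped.)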
476 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 477 .legalFor({{PrivatePtr, S32}}); 478 479 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 480 .unsupportedFor({PrivatePtr}) 481 .custom(); 482 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 483 484 auto &FPOpActions = getActionDefinitionsBuilder( 485 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 486 .legalFor({S32, S64}); 487 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 488 .customFor({S32, S64}); 489 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 490 .customFor({S32, S64}); 491 492 if (ST.has16BitInsts()) { 493 if (ST.hasVOP3PInsts()) 494 FPOpActions.legalFor({S16, V2S16}); 495 else 496 FPOpActions.legalFor({S16}); 497 498 TrigActions.customFor({S16}); 499 FDIVActions.customFor({S16}); 500 } 501 502 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 503 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 504 505 if (ST.hasVOP3PInsts()) { 506 MinNumMaxNum.customFor(FPTypesPK16) 507 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 508 .clampMaxNumElements(0, S16, 2) 509 .clampScalar(0, S16, S64) 510 .scalarize(0); 511 } else if (ST.has16BitInsts()) { 512 MinNumMaxNum.customFor(FPTypes16) 513 .clampScalar(0, S16, S64) 514 .scalarize(0); 515 } else { 516 MinNumMaxNum.customFor(FPTypesBase) 517 .clampScalar(0, S32, S64) 518 .scalarize(0); 519 } 520 521 if (ST.hasVOP3PInsts()) 522 FPOpActions.clampMaxNumElements(0, S16, 2); 523 524 FPOpActions 525 .scalarize(0) 526 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 527 528 TrigActions 529 .scalarize(0) 530 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 531 532 FDIVActions 533 .scalarize(0) 534 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 535 536 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 537 .legalFor(FPTypesPK16) 538 .clampMaxNumElements(0, S16, 2) 539 .scalarize(0) 540 .clampScalar(0, S16, S64); 541 542 if (ST.has16BitInsts()) { 543 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 544 .legalFor({S32, S64, S16}) 545 .scalarize(0) 546 .clampScalar(0, S16, S64); 547 } else { 548 getActionDefinitionsBuilder(G_FSQRT) 549 .legalFor({S32, S64}) 550 .scalarize(0) 551 .clampScalar(0, S32, S64); 552 553 if (ST.hasFractBug()) { 554 getActionDefinitionsBuilder(G_FFLOOR) 555 .customFor({S64}) 556 .legalFor({S32, S64}) 557 .scalarize(0) 558 .clampScalar(0, S32, S64); 559 } else { 560 getActionDefinitionsBuilder(G_FFLOOR) 561 .legalFor({S32, S64}) 562 .scalarize(0) 563 .clampScalar(0, S32, S64); 564 } 565 } 566 567 getActionDefinitionsBuilder(G_FPTRUNC) 568 .legalFor({{S32, S64}, {S16, S32}}) 569 .scalarize(0) 570 .lower(); 571 572 getActionDefinitionsBuilder(G_FPEXT) 573 .legalFor({{S64, S32}, {S32, S16}}) 574 .lowerFor({{S64, S16}}) // FIXME: Implement 575 .scalarize(0); 576 577 getActionDefinitionsBuilder(G_FSUB) 578 // Use actual fsub instruction 579 .legalFor({S32}) 580 // Must use fadd + fneg 581 .lowerFor({S64, S16, V2S16}) 582 .scalarize(0) 583 .clampScalar(0, S32, S64); 584 585 // Whether this is legal depends on the floating point mode for the function. 586 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 587 if (ST.hasMadF16()) 588 FMad.customFor({S32, S16}); 589 else 590 FMad.customFor({S32}); 591 FMad.scalarize(0) 592 .lower(); 593 594 // TODO: Do we need to clamp maximum bitwidth? 595 getActionDefinitionsBuilder(G_TRUNC) 596 .legalIf(isScalar(0)) 597 .legalFor({{V2S16, V2S32}}) 598 .clampMaxNumElements(0, S16, 2) 599 // Avoid scalarizing in cases that should be truly illegal. 
In unresolvable 600 // situations (like an invalid implicit use), we don't want to infinite loop 601 // in the legalizer. 602 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 603 .alwaysLegal(); 604 605 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 606 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 607 {S32, S1}, {S64, S1}, {S16, S1}}) 608 .scalarize(0) 609 .clampScalar(0, S32, S64) 610 .widenScalarToNextPow2(1, 32); 611 612 // TODO: Split s1->s64 during regbankselect for VALU. 613 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 614 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 615 .lowerFor({{S32, S64}}) 616 .lowerIf(typeIs(1, S1)) 617 .customFor({{S64, S64}}); 618 if (ST.has16BitInsts()) 619 IToFP.legalFor({{S16, S16}}); 620 IToFP.clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(1); 623 624 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 625 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 626 .customFor({{S64, S64}}); 627 if (ST.has16BitInsts()) 628 FPToI.legalFor({{S16, S16}}); 629 else 630 FPToI.minScalar(1, S32); 631 632 FPToI.minScalar(0, S32) 633 .scalarize(0) 634 .lower(); 635 636 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 637 .scalarize(0) 638 .lower(); 639 640 if (ST.has16BitInsts()) { 641 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 642 .legalFor({S16, S32, S64}) 643 .clampScalar(0, S16, S64) 644 .scalarize(0); 645 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 646 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 647 .legalFor({S32, S64}) 648 .clampScalar(0, S32, S64) 649 .scalarize(0); 650 } else { 651 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 652 .legalFor({S32}) 653 .customFor({S64}) 654 .clampScalar(0, S32, S64) 655 .scalarize(0); 656 } 657 658 // FIXME: Clamp offset operand. 659 getActionDefinitionsBuilder(G_PTR_ADD) 660 .legalIf(isPointer(0)) 661 .scalarize(0); 662 663 getActionDefinitionsBuilder(G_PTRMASK) 664 .legalIf(typeInSet(1, {S64, S32})) 665 .minScalar(1, S32) 666 .maxScalarIf(sizeIs(0, 32), 1, S32) 667 .maxScalarIf(sizeIs(0, 64), 1, S64) 668 .scalarize(0); 669 670 auto &CmpBuilder = 671 getActionDefinitionsBuilder(G_ICMP) 672 // The compare output type differs based on the register bank of the output, 673 // so make both s1 and s32 legal. 674 // 675 // Scalar compares producing output in scc will be promoted to s32, as that 676 // is the allocatable register type that will be needed for the copy from 677 // scc. This will be promoted during RegBankSelect, and we assume something 678 // before that won't try to use s32 result types. 679 // 680 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 681 // bank. 682 .legalForCartesianProduct( 683 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 684 .legalForCartesianProduct( 685 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 686 if (ST.has16BitInsts()) { 687 CmpBuilder.legalFor({{S1, S16}}); 688 } 689 690 CmpBuilder 691 .widenScalarToNextPow2(1) 692 .clampScalar(1, S32, S64) 693 .scalarize(0) 694 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 695 696 getActionDefinitionsBuilder(G_FCMP) 697 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 698 .widenScalarToNextPow2(1) 699 .clampScalar(1, S32, S64) 700 .scalarize(0); 701 702 // FIXME: fpow has a selection pattern that should move to custom lowering. 
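  // (G_FPOW is marked custom in ExpOps below and handled by legalizeFPow,
  // which expands it in terms of exp2/log2.)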
703 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 704 if (ST.has16BitInsts()) 705 Exp2Ops.legalFor({S32, S16}); 706 else 707 Exp2Ops.legalFor({S32}); 708 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 709 Exp2Ops.scalarize(0); 710 711 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 712 if (ST.has16BitInsts()) 713 ExpOps.customFor({{S32}, {S16}}); 714 else 715 ExpOps.customFor({S32}); 716 ExpOps.clampScalar(0, MinScalarFPTy, S32) 717 .scalarize(0); 718 719 // The 64-bit versions produce 32-bit results, but only on the SALU. 720 getActionDefinitionsBuilder(G_CTPOP) 721 .legalFor({{S32, S32}, {S32, S64}}) 722 .clampScalar(0, S32, S32) 723 .clampScalar(1, S32, S64) 724 .scalarize(0) 725 .widenScalarToNextPow2(0, 32) 726 .widenScalarToNextPow2(1, 32); 727 728 // The hardware instructions return a different result on 0 than the generic 729 // instructions expect. The hardware produces -1, but these produce the 730 // bitwidth. 731 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 732 .scalarize(0) 733 .clampScalar(0, S32, S32) 734 .clampScalar(1, S32, S64) 735 .widenScalarToNextPow2(0, 32) 736 .widenScalarToNextPow2(1, 32) 737 .lower(); 738 739 // The 64-bit versions produce 32-bit results, but only on the SALU. 740 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 741 .legalFor({{S32, S32}, {S32, S64}}) 742 .clampScalar(0, S32, S32) 743 .clampScalar(1, S32, S64) 744 .scalarize(0) 745 .widenScalarToNextPow2(0, 32) 746 .widenScalarToNextPow2(1, 32); 747 748 getActionDefinitionsBuilder(G_BITREVERSE) 749 .legalFor({S32}) 750 .clampScalar(0, S32, S32) 751 .scalarize(0); 752 753 if (ST.has16BitInsts()) { 754 getActionDefinitionsBuilder(G_BSWAP) 755 .legalFor({S16, S32, V2S16}) 756 .clampMaxNumElements(0, S16, 2) 757 // FIXME: Fixing non-power-of-2 before clamp is workaround for 758 // narrowScalar limitation. 759 .widenScalarToNextPow2(0) 760 .clampScalar(0, S16, S32) 761 .scalarize(0); 762 763 if (ST.hasVOP3PInsts()) { 764 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 765 .legalFor({S32, S16, V2S16}) 766 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 767 .clampMaxNumElements(0, S16, 2) 768 .minScalar(0, S16) 769 .widenScalarToNextPow2(0) 770 .scalarize(0) 771 .lower(); 772 } else { 773 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 774 .legalFor({S32, S16}) 775 .widenScalarToNextPow2(0) 776 .minScalar(0, S16) 777 .scalarize(0) 778 .lower(); 779 } 780 } else { 781 // TODO: Should have same legality without v_perm_b32 782 getActionDefinitionsBuilder(G_BSWAP) 783 .legalFor({S32}) 784 .lowerIf(scalarNarrowerThan(0, 32)) 785 // FIXME: Fixing non-power-of-2 before clamp is workaround for 786 // narrowScalar limitation. 
787 .widenScalarToNextPow2(0) 788 .maxScalar(0, S32) 789 .scalarize(0) 790 .lower(); 791 792 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 793 .legalFor({S32}) 794 .minScalar(0, S32) 795 .widenScalarToNextPow2(0) 796 .scalarize(0) 797 .lower(); 798 } 799 800 getActionDefinitionsBuilder(G_INTTOPTR) 801 // List the common cases 802 .legalForCartesianProduct(AddrSpaces64, {S64}) 803 .legalForCartesianProduct(AddrSpaces32, {S32}) 804 .scalarize(0) 805 // Accept any address space as long as the size matches 806 .legalIf(sameSize(0, 1)) 807 .widenScalarIf(smallerThan(1, 0), 808 [](const LegalityQuery &Query) { 809 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 810 }) 811 .narrowScalarIf(largerThan(1, 0), 812 [](const LegalityQuery &Query) { 813 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 814 }); 815 816 getActionDefinitionsBuilder(G_PTRTOINT) 817 // List the common cases 818 .legalForCartesianProduct(AddrSpaces64, {S64}) 819 .legalForCartesianProduct(AddrSpaces32, {S32}) 820 .scalarize(0) 821 // Accept any address space as long as the size matches 822 .legalIf(sameSize(0, 1)) 823 .widenScalarIf(smallerThan(0, 1), 824 [](const LegalityQuery &Query) { 825 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 826 }) 827 .narrowScalarIf( 828 largerThan(0, 1), 829 [](const LegalityQuery &Query) { 830 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 831 }); 832 833 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 834 .scalarize(0) 835 .custom(); 836 837 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 838 bool IsLoad) -> bool { 839 const LLT DstTy = Query.Types[0]; 840 841 // Split vector extloads. 842 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 843 unsigned Align = Query.MMODescrs[0].AlignInBits; 844 845 if (MemSize < DstTy.getSizeInBits()) 846 MemSize = std::max(MemSize, Align); 847 848 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 849 return true; 850 851 const LLT PtrTy = Query.Types[1]; 852 unsigned AS = PtrTy.getAddressSpace(); 853 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 854 return true; 855 856 // Catch weird sized loads that don't evenly divide into the access sizes 857 // TODO: May be able to widen depending on alignment etc. 858 unsigned NumRegs = (MemSize + 31) / 32; 859 if (NumRegs == 3) { 860 if (!ST.hasDwordx3LoadStores()) 861 return true; 862 } else { 863 // If the alignment allows, these should have been widened. 864 if (!isPowerOf2_32(NumRegs)) 865 return true; 866 } 867 868 if (Align < MemSize) { 869 const SITargetLowering *TLI = ST.getTargetLowering(); 870 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 871 } 872 873 return false; 874 }; 875 876 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 877 unsigned Opc) -> bool { 878 unsigned Size = Query.Types[0].getSizeInBits(); 879 if (isPowerOf2_32(Size)) 880 return false; 881 882 if (Size == 96 && ST.hasDwordx3LoadStores()) 883 return false; 884 885 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 886 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 887 return false; 888 889 unsigned Align = Query.MMODescrs[0].AlignInBits; 890 unsigned RoundedSize = NextPowerOf2(Size); 891 return (Align >= RoundedSize); 892 }; 893 894 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 895 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 896 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; 897 898 // TODO: Refine based on subtargets which support unaligned access or 128-bit 899 // LDS 900 // TODO: Unsupported flat for SI. 901 902 for (unsigned Op : {G_LOAD, G_STORE}) { 903 const bool IsStore = Op == G_STORE; 904 905 auto &Actions = getActionDefinitionsBuilder(Op); 906 // Whitelist some common cases. 907 // TODO: Does this help compile time at all? 908 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 909 {V2S32, GlobalPtr, 64, GlobalAlign32}, 910 {V4S32, GlobalPtr, 128, GlobalAlign32}, 911 {S64, GlobalPtr, 64, GlobalAlign32}, 912 {V2S64, GlobalPtr, 128, GlobalAlign32}, 913 {V2S16, GlobalPtr, 32, GlobalAlign32}, 914 {S32, GlobalPtr, 8, GlobalAlign8}, 915 {S32, GlobalPtr, 16, GlobalAlign16}, 916 917 {S32, LocalPtr, 32, 32}, 918 {S64, LocalPtr, 64, 32}, 919 {V2S32, LocalPtr, 64, 32}, 920 {S32, LocalPtr, 8, 8}, 921 {S32, LocalPtr, 16, 16}, 922 {V2S16, LocalPtr, 32, 32}, 923 924 {S32, PrivatePtr, 32, 32}, 925 {S32, PrivatePtr, 8, 8}, 926 {S32, PrivatePtr, 16, 16}, 927 {V2S16, PrivatePtr, 32, 32}, 928 929 {S32, ConstantPtr, 32, GlobalAlign32}, 930 {V2S32, ConstantPtr, 64, GlobalAlign32}, 931 {V4S32, ConstantPtr, 128, GlobalAlign32}, 932 {S64, ConstantPtr, 64, GlobalAlign32}, 933 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 934 Actions.legalIf( 935 [=](const LegalityQuery &Query) -> bool { 936 return isLoadStoreLegal(ST, Query, Op); 937 }); 938 939 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 940 // 64-bits. 941 // 942 // TODO: Should generalize bitcast action into coerce, which will also cover 943 // inserting addrspacecasts. 944 Actions.customIf(typeIs(1, Constant32Ptr)); 945 946 // Turn any illegal element vectors into something easier to deal 947 // with. These will ultimately produce 32-bit scalar shifts to extract the 948 // parts anyway. 949 // 950 // For odd 16-bit element vectors, prefer to split those into pieces with 951 // 16-bit vector parts. 952 Actions.bitcastIf( 953 [=](const LegalityQuery &Query) -> bool { 954 LLT Ty = Query.Types[0]; 955 return Ty.isVector() && 956 isRegisterSize(Ty.getSizeInBits()) && 957 !isRegisterVectorElementType(Ty.getElementType()); 958 }, bitcastToRegisterType(0)); 959 960 Actions 961 .customIf(typeIs(1, Constant32Ptr)) 962 // Widen suitably aligned loads by loading extra elements. 963 .moreElementsIf([=](const LegalityQuery &Query) { 964 const LLT Ty = Query.Types[0]; 965 return Op == G_LOAD && Ty.isVector() && 966 shouldWidenLoadResult(Query, Op); 967 }, moreElementsToNextPow2(0)) 968 .widenScalarIf([=](const LegalityQuery &Query) { 969 const LLT Ty = Query.Types[0]; 970 return Op == G_LOAD && !Ty.isVector() && 971 shouldWidenLoadResult(Query, Op); 972 }, widenScalarOrEltToNextPow2(0)) 973 .narrowScalarIf( 974 [=](const LegalityQuery &Query) -> bool { 975 return !Query.Types[0].isVector() && 976 needToSplitMemOp(Query, Op == G_LOAD); 977 }, 978 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 979 const LLT DstTy = Query.Types[0]; 980 const LLT PtrTy = Query.Types[1]; 981 982 const unsigned DstSize = DstTy.getSizeInBits(); 983 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 984 985 // Split extloads. 986 if (DstSize > MemSize) 987 return std::make_pair(0, LLT::scalar(MemSize)); 988 989 if (!isPowerOf2_32(DstSize)) { 990 // We're probably decomposing an odd sized store. Try to split 991 // to the widest type. TODO: Account for alignment. As-is it 992 // should be OK, since the new parts will be further legalized. 
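          // e.g. a 96-bit store on a subtarget without dwordx3 load/stores is
          // split to a 64-bit piece here; the remaining 32 bits are produced
          // when the leftover part is relegalized.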
993 unsigned FloorSize = PowerOf2Floor(DstSize); 994 return std::make_pair(0, LLT::scalar(FloorSize)); 995 } 996 997 if (DstSize > 32 && (DstSize % 32 != 0)) { 998 // FIXME: Need a way to specify non-extload of larger size if 999 // suitably aligned. 1000 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1001 } 1002 1003 unsigned MaxSize = maxSizeForAddrSpace(ST, 1004 PtrTy.getAddressSpace(), 1005 Op == G_LOAD); 1006 if (MemSize > MaxSize) 1007 return std::make_pair(0, LLT::scalar(MaxSize)); 1008 1009 unsigned Align = Query.MMODescrs[0].AlignInBits; 1010 return std::make_pair(0, LLT::scalar(Align)); 1011 }) 1012 .fewerElementsIf( 1013 [=](const LegalityQuery &Query) -> bool { 1014 return Query.Types[0].isVector() && 1015 needToSplitMemOp(Query, Op == G_LOAD); 1016 }, 1017 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1018 const LLT DstTy = Query.Types[0]; 1019 const LLT PtrTy = Query.Types[1]; 1020 1021 LLT EltTy = DstTy.getElementType(); 1022 unsigned MaxSize = maxSizeForAddrSpace(ST, 1023 PtrTy.getAddressSpace(), 1024 Op == G_LOAD); 1025 1026 // FIXME: Handle widened to power of 2 results better. This ends 1027 // up scalarizing. 1028 // FIXME: 3 element stores scalarized on SI 1029 1030 // Split if it's too large for the address space. 1031 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1032 unsigned NumElts = DstTy.getNumElements(); 1033 unsigned EltSize = EltTy.getSizeInBits(); 1034 1035 if (MaxSize % EltSize == 0) { 1036 return std::make_pair( 1037 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1038 } 1039 1040 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1041 1042 // FIXME: Refine when odd breakdowns handled 1043 // The scalars will need to be re-legalized. 1044 if (NumPieces == 1 || NumPieces >= NumElts || 1045 NumElts % NumPieces != 0) 1046 return std::make_pair(0, EltTy); 1047 1048 return std::make_pair(0, 1049 LLT::vector(NumElts / NumPieces, EltTy)); 1050 } 1051 1052 // FIXME: We could probably handle weird extending loads better. 1053 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1054 if (DstTy.getSizeInBits() > MemSize) 1055 return std::make_pair(0, EltTy); 1056 1057 unsigned EltSize = EltTy.getSizeInBits(); 1058 unsigned DstSize = DstTy.getSizeInBits(); 1059 if (!isPowerOf2_32(DstSize)) { 1060 // We're probably decomposing an odd sized store. Try to split 1061 // to the widest type. TODO: Account for alignment. As-is it 1062 // should be OK, since the new parts will be further legalized. 1063 unsigned FloorSize = PowerOf2Floor(DstSize); 1064 return std::make_pair( 1065 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1066 } 1067 1068 // Need to split because of alignment. 1069 unsigned Align = Query.MMODescrs[0].AlignInBits; 1070 if (EltSize > Align && 1071 (EltSize / Align < DstTy.getNumElements())) { 1072 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1073 } 1074 1075 // May need relegalization for the scalars. 1076 return std::make_pair(0, EltTy); 1077 }) 1078 .minScalar(0, S32); 1079 1080 if (IsStore) 1081 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1082 1083 // TODO: Need a bitcast lower option? 
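    // (Final catch-all for the register type: scalars are rounded up to a
    // power of two, and sub-dword vectors such as <2 x s8> get extra elements
    // until they fill a full 32 bits.)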
1084 Actions 1085 .widenScalarToNextPow2(0) 1086 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1087 } 1088 1089 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1090 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1091 {S32, GlobalPtr, 16, 2 * 8}, 1092 {S32, LocalPtr, 8, 8}, 1093 {S32, LocalPtr, 16, 16}, 1094 {S32, PrivatePtr, 8, 8}, 1095 {S32, PrivatePtr, 16, 16}, 1096 {S32, ConstantPtr, 8, 8}, 1097 {S32, ConstantPtr, 16, 2 * 8}}); 1098 if (ST.hasFlatAddressSpace()) { 1099 ExtLoads.legalForTypesWithMemDesc( 1100 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1101 } 1102 1103 ExtLoads.clampScalar(0, S32, S32) 1104 .widenScalarToNextPow2(0) 1105 .unsupportedIfMemSizeNotPow2() 1106 .lower(); 1107 1108 auto &Atomics = getActionDefinitionsBuilder( 1109 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1110 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1111 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1112 G_ATOMICRMW_UMIN}) 1113 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1114 {S64, GlobalPtr}, {S64, LocalPtr}}); 1115 if (ST.hasFlatAddressSpace()) { 1116 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1117 } 1118 1119 if (ST.hasLDSFPAtomics()) { 1120 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1121 .legalFor({{S32, LocalPtr}}); 1122 } 1123 1124 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1125 // demarshalling 1126 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1127 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1128 {S32, FlatPtr}, {S64, FlatPtr}}) 1129 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1130 {S32, RegionPtr}, {S64, RegionPtr}}); 1131 // TODO: Pointer types, any 32-bit or 64-bit vector 1132 1133 // Condition should be s32 for scalar, s1 for vector. 1134 getActionDefinitionsBuilder(G_SELECT) 1135 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1136 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1137 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1138 .clampScalar(0, S16, S64) 1139 .scalarize(1) 1140 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1141 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1142 .clampMaxNumElements(0, S32, 2) 1143 .clampMaxNumElements(0, LocalPtr, 2) 1144 .clampMaxNumElements(0, PrivatePtr, 2) 1145 .scalarize(0) 1146 .widenScalarToNextPow2(0) 1147 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1148 1149 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1150 // be more flexible with the shift amount type. 1151 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1152 .legalFor({{S32, S32}, {S64, S32}}); 1153 if (ST.has16BitInsts()) { 1154 if (ST.hasVOP3PInsts()) { 1155 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1156 .clampMaxNumElements(0, S16, 2); 1157 } else 1158 Shifts.legalFor({{S16, S16}}); 1159 1160 // TODO: Support 16-bit shift amounts for all types 1161 Shifts.widenScalarIf( 1162 [=](const LegalityQuery &Query) { 1163 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1164 // 32-bit amount. 
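          // e.g. (s16 = G_SHL s16, s8) has its amount widened to s16 here,
          // while shifts of 32-bit or wider values get a 32-bit amount from
          // the clampScalar(1, S32, S32) below.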
1165 const LLT ValTy = Query.Types[0]; 1166 const LLT AmountTy = Query.Types[1]; 1167 return ValTy.getSizeInBits() <= 16 && 1168 AmountTy.getSizeInBits() < 16; 1169 }, changeTo(1, S16)); 1170 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1171 Shifts.clampScalar(1, S32, S32); 1172 Shifts.clampScalar(0, S16, S64); 1173 Shifts.widenScalarToNextPow2(0, 16); 1174 } else { 1175 // Make sure we legalize the shift amount type first, as the general 1176 // expansion for the shifted type will produce much worse code if it hasn't 1177 // been truncated already. 1178 Shifts.clampScalar(1, S32, S32); 1179 Shifts.clampScalar(0, S32, S64); 1180 Shifts.widenScalarToNextPow2(0, 32); 1181 } 1182 Shifts.scalarize(0); 1183 1184 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1185 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1186 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1187 unsigned IdxTypeIdx = 2; 1188 1189 getActionDefinitionsBuilder(Op) 1190 .customIf([=](const LegalityQuery &Query) { 1191 const LLT EltTy = Query.Types[EltTypeIdx]; 1192 const LLT VecTy = Query.Types[VecTypeIdx]; 1193 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1194 return (EltTy.getSizeInBits() == 16 || 1195 EltTy.getSizeInBits() % 32 == 0) && 1196 VecTy.getSizeInBits() % 32 == 0 && 1197 VecTy.getSizeInBits() <= 1024 && 1198 IdxTy.getSizeInBits() == 32; 1199 }) 1200 .clampScalar(EltTypeIdx, S32, S64) 1201 .clampScalar(VecTypeIdx, S32, S64) 1202 .clampScalar(IdxTypeIdx, S32, S32); 1203 } 1204 1205 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1206 .unsupportedIf([=](const LegalityQuery &Query) { 1207 const LLT &EltTy = Query.Types[1].getElementType(); 1208 return Query.Types[0] != EltTy; 1209 }); 1210 1211 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1212 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1213 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1214 1215 // FIXME: Doesn't handle extract of illegal sizes. 1216 getActionDefinitionsBuilder(Op) 1217 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1218 // FIXME: Multiples of 16 should not be legal. 
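      // (e.g. the rule below treats extracting an s16 or s48 from an s64 as
      // legal, while an s8 piece is not, since 8 % 16 != 0.)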
      .legalIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
        return (BigTy.getSizeInBits() % 32 == 0) &&
               (LitTy.getSizeInBits() % 16 == 0);
      })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
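  // e.g. without 16-bit shift instructions, (G_SEXT_INREG s16, 8) is widened
  // to s32 by the clampScalar below and then lowered to a shl/ashr pair.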
1391 SextInReg.lowerFor({{S32}, {S64}}); 1392 } 1393 1394 SextInReg 1395 .scalarize(0) 1396 .clampScalar(0, S32, S64) 1397 .lower(); 1398 1399 getActionDefinitionsBuilder(G_FSHR) 1400 .legalFor({{S32, S32}}) 1401 .scalarize(0) 1402 .lower(); 1403 1404 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1405 .legalFor({S64}); 1406 1407 getActionDefinitionsBuilder({ 1408 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1409 G_FCOPYSIGN, 1410 1411 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1412 G_READ_REGISTER, 1413 G_WRITE_REGISTER, 1414 1415 G_SADDO, G_SSUBO, 1416 1417 // TODO: Implement 1418 G_FMINIMUM, G_FMAXIMUM, 1419 G_FSHL 1420 }).lower(); 1421 1422 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1423 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1424 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1425 .unsupported(); 1426 1427 computeTables(); 1428 verify(*ST.getInstrInfo()); 1429 } 1430 1431 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1432 MachineRegisterInfo &MRI, 1433 MachineIRBuilder &B, 1434 GISelChangeObserver &Observer) const { 1435 switch (MI.getOpcode()) { 1436 case TargetOpcode::G_ADDRSPACE_CAST: 1437 return legalizeAddrSpaceCast(MI, MRI, B); 1438 case TargetOpcode::G_FRINT: 1439 return legalizeFrint(MI, MRI, B); 1440 case TargetOpcode::G_FCEIL: 1441 return legalizeFceil(MI, MRI, B); 1442 case TargetOpcode::G_INTRINSIC_TRUNC: 1443 return legalizeIntrinsicTrunc(MI, MRI, B); 1444 case TargetOpcode::G_SITOFP: 1445 return legalizeITOFP(MI, MRI, B, true); 1446 case TargetOpcode::G_UITOFP: 1447 return legalizeITOFP(MI, MRI, B, false); 1448 case TargetOpcode::G_FPTOSI: 1449 return legalizeFPTOI(MI, MRI, B, true); 1450 case TargetOpcode::G_FPTOUI: 1451 return legalizeFPTOI(MI, MRI, B, false); 1452 case TargetOpcode::G_FMINNUM: 1453 case TargetOpcode::G_FMAXNUM: 1454 case TargetOpcode::G_FMINNUM_IEEE: 1455 case TargetOpcode::G_FMAXNUM_IEEE: 1456 return legalizeMinNumMaxNum(MI, MRI, B); 1457 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1458 return legalizeExtractVectorElt(MI, MRI, B); 1459 case TargetOpcode::G_INSERT_VECTOR_ELT: 1460 return legalizeInsertVectorElt(MI, MRI, B); 1461 case TargetOpcode::G_SHUFFLE_VECTOR: 1462 return legalizeShuffleVector(MI, MRI, B); 1463 case TargetOpcode::G_FSIN: 1464 case TargetOpcode::G_FCOS: 1465 return legalizeSinCos(MI, MRI, B); 1466 case TargetOpcode::G_GLOBAL_VALUE: 1467 return legalizeGlobalValue(MI, MRI, B); 1468 case TargetOpcode::G_LOAD: 1469 return legalizeLoad(MI, MRI, B, Observer); 1470 case TargetOpcode::G_FMAD: 1471 return legalizeFMad(MI, MRI, B); 1472 case TargetOpcode::G_FDIV: 1473 return legalizeFDIV(MI, MRI, B); 1474 case TargetOpcode::G_UDIV: 1475 case TargetOpcode::G_UREM: 1476 return legalizeUDIV_UREM(MI, MRI, B); 1477 case TargetOpcode::G_SDIV: 1478 case TargetOpcode::G_SREM: 1479 return legalizeSDIV_SREM(MI, MRI, B); 1480 case TargetOpcode::G_ATOMIC_CMPXCHG: 1481 return legalizeAtomicCmpXChg(MI, MRI, B); 1482 case TargetOpcode::G_FLOG: 1483 return legalizeFlog(MI, B, numbers::ln2f); 1484 case TargetOpcode::G_FLOG10: 1485 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1486 case TargetOpcode::G_FEXP: 1487 return legalizeFExp(MI, B); 1488 case TargetOpcode::G_FPOW: 1489 return legalizeFPow(MI, B); 1490 case TargetOpcode::G_FFLOOR: 1491 return legalizeFFloor(MI, MRI, B); 1492 case TargetOpcode::G_BUILD_VECTOR: 1493 return legalizeBuildVector(MI, MRI, B); 1494 default: 1495 return false; 1496 } 1497 1498 llvm_unreachable("expected switch to return"); 1499 } 1500 1501 Register 
AMDGPULegalizerInfo::getSegmentAperture( 1502 unsigned AS, 1503 MachineRegisterInfo &MRI, 1504 MachineIRBuilder &B) const { 1505 MachineFunction &MF = B.getMF(); 1506 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1507 const LLT S32 = LLT::scalar(32); 1508 1509 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1510 1511 if (ST.hasApertureRegs()) { 1512 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1513 // getreg. 1514 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1515 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1516 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1517 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1518 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1519 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1520 unsigned Encoding = 1521 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1522 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1523 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1524 1525 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1526 1527 B.buildInstr(AMDGPU::S_GETREG_B32) 1528 .addDef(GetReg) 1529 .addImm(Encoding); 1530 MRI.setType(GetReg, S32); 1531 1532 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1533 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1534 } 1535 1536 Register QueuePtr = MRI.createGenericVirtualRegister( 1537 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1538 1539 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1540 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1541 return Register(); 1542 1543 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1544 // private_segment_aperture_base_hi. 1545 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1546 1547 // TODO: can we be smarter about machine pointer info? 1548 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1549 MachineMemOperand *MMO = MF.getMachineMemOperand( 1550 PtrInfo, 1551 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1552 MachineMemOperand::MOInvariant, 1553 4, commonAlignment(Align(64), StructOffset)); 1554 1555 Register LoadAddr; 1556 1557 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1558 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1559 } 1560 1561 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1562 MachineInstr &MI, MachineRegisterInfo &MRI, 1563 MachineIRBuilder &B) const { 1564 MachineFunction &MF = B.getMF(); 1565 1566 const LLT S32 = LLT::scalar(32); 1567 Register Dst = MI.getOperand(0).getReg(); 1568 Register Src = MI.getOperand(1).getReg(); 1569 1570 LLT DstTy = MRI.getType(Dst); 1571 LLT SrcTy = MRI.getType(Src); 1572 unsigned DestAS = DstTy.getAddressSpace(); 1573 unsigned SrcAS = SrcTy.getAddressSpace(); 1574 1575 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1576 // vector element. 1577 assert(!DstTy.isVector()); 1578 1579 const AMDGPUTargetMachine &TM 1580 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1581 1582 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1583 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1584 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1585 return true; 1586 } 1587 1588 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1589 // Truncate. 
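    // (Casting into the 32-bit constant address space just keeps the low 32
    // bits of the wider source pointer.)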
1590 B.buildExtract(Dst, Src, 0); 1591 MI.eraseFromParent(); 1592 return true; 1593 } 1594 1595 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1596 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1597 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1598 1599 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1600 // another. Merge operands are required to be the same type, but creating an 1601 // extra ptrtoint would be kind of pointless. 1602 auto HighAddr = B.buildConstant( 1603 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1604 B.buildMerge(Dst, {Src, HighAddr}); 1605 MI.eraseFromParent(); 1606 return true; 1607 } 1608 1609 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1610 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1611 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1612 unsigned NullVal = TM.getNullPointerValue(DestAS); 1613 1614 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1615 auto FlatNull = B.buildConstant(SrcTy, 0); 1616 1617 // Extract low 32-bits of the pointer. 1618 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1619 1620 auto CmpRes = 1621 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1622 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1623 1624 MI.eraseFromParent(); 1625 return true; 1626 } 1627 1628 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1629 return false; 1630 1631 if (!ST.hasFlatAddressSpace()) 1632 return false; 1633 1634 auto SegmentNull = 1635 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1636 auto FlatNull = 1637 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1638 1639 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1640 if (!ApertureReg.isValid()) 1641 return false; 1642 1643 auto CmpRes = 1644 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1645 1646 // Coerce the type of the low half of the result so we can use merge_values. 1647 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1648 1649 // TODO: Should we allow mismatched types but matching sizes in merges to 1650 // avoid the ptrtoint? 1651 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1652 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1653 1654 MI.eraseFromParent(); 1655 return true; 1656 } 1657 1658 bool AMDGPULegalizerInfo::legalizeFrint( 1659 MachineInstr &MI, MachineRegisterInfo &MRI, 1660 MachineIRBuilder &B) const { 1661 Register Src = MI.getOperand(1).getReg(); 1662 LLT Ty = MRI.getType(Src); 1663 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1664 1665 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1666 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1667 1668 auto C1 = B.buildFConstant(Ty, C1Val); 1669 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1670 1671 // TODO: Should this propagate fast-math-flags? 
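  // Adding and then subtracting +/-2^52 (C1 with Src's sign) pushes the
  // fractional bits out of the mantissa, rounding Src to an integer; the
  // compare against C2 below keeps inputs that are already too large to have
  // a fractional part.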
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
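  // The low half is zero, so the merge below places the isolated sign bit at
  // bit 63 of the 64-bit value.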
1756 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1757 1758 auto Shr = B.buildAShr(S64, FractMask, Exp); 1759 auto Not = B.buildNot(S64, Shr); 1760 auto Tmp0 = B.buildAnd(S64, Src, Not); 1761 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1762 1763 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1764 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1765 1766 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1767 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1768 return true; 1769 } 1770 1771 bool AMDGPULegalizerInfo::legalizeITOFP( 1772 MachineInstr &MI, MachineRegisterInfo &MRI, 1773 MachineIRBuilder &B, bool Signed) const { 1774 1775 Register Dst = MI.getOperand(0).getReg(); 1776 Register Src = MI.getOperand(1).getReg(); 1777 1778 const LLT S64 = LLT::scalar(64); 1779 const LLT S32 = LLT::scalar(32); 1780 1781 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1782 1783 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1784 1785 auto CvtHi = Signed ? 1786 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1787 B.buildUITOFP(S64, Unmerge.getReg(1)); 1788 1789 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1790 1791 auto ThirtyTwo = B.buildConstant(S32, 32); 1792 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1793 .addUse(CvtHi.getReg(0)) 1794 .addUse(ThirtyTwo.getReg(0)); 1795 1796 // TODO: Should this propagate fast-math-flags? 1797 B.buildFAdd(Dst, LdExp, CvtLo); 1798 MI.eraseFromParent(); 1799 return true; 1800 } 1801 1802 // TODO: Copied from DAG implementation. Verify logic and document how this 1803 // actually works. 1804 bool AMDGPULegalizerInfo::legalizeFPTOI( 1805 MachineInstr &MI, MachineRegisterInfo &MRI, 1806 MachineIRBuilder &B, bool Signed) const { 1807 1808 Register Dst = MI.getOperand(0).getReg(); 1809 Register Src = MI.getOperand(1).getReg(); 1810 1811 const LLT S64 = LLT::scalar(64); 1812 const LLT S32 = LLT::scalar(32); 1813 1814 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1815 1816 unsigned Flags = MI.getFlags(); 1817 1818 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1819 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1820 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1821 1822 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1823 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1824 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1825 1826 auto Hi = Signed ? 
1827 B.buildFPTOSI(S32, FloorMul) : 1828 B.buildFPTOUI(S32, FloorMul); 1829 auto Lo = B.buildFPTOUI(S32, Fma); 1830 1831 B.buildMerge(Dst, { Lo, Hi }); 1832 MI.eraseFromParent(); 1833 1834 return true; 1835 } 1836 1837 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1838 MachineInstr &MI, MachineRegisterInfo &MRI, 1839 MachineIRBuilder &B) const { 1840 MachineFunction &MF = B.getMF(); 1841 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1842 1843 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1844 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1845 1846 // With ieee_mode disabled, the instructions have the correct behavior 1847 // already for G_FMINNUM/G_FMAXNUM 1848 if (!MFI->getMode().IEEE) 1849 return !IsIEEEOp; 1850 1851 if (IsIEEEOp) 1852 return true; 1853 1854 MachineIRBuilder HelperBuilder(MI); 1855 GISelObserverWrapper DummyObserver; 1856 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1857 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1858 } 1859 1860 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1861 MachineInstr &MI, MachineRegisterInfo &MRI, 1862 MachineIRBuilder &B) const { 1863 // TODO: Should move some of this into LegalizerHelper. 1864 1865 // TODO: Promote dynamic indexing of s16 to s32 1866 1867 // FIXME: Artifact combiner probably should have replaced the truncated 1868 // constant before this, so we shouldn't need 1869 // getConstantVRegValWithLookThrough. 1870 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1871 MI.getOperand(2).getReg(), MRI); 1872 if (!IdxVal) // Dynamic case will be selected to register indexing. 1873 return true; 1874 1875 Register Dst = MI.getOperand(0).getReg(); 1876 Register Vec = MI.getOperand(1).getReg(); 1877 1878 LLT VecTy = MRI.getType(Vec); 1879 LLT EltTy = VecTy.getElementType(); 1880 assert(EltTy == MRI.getType(Dst)); 1881 1882 if (IdxVal->Value < VecTy.getNumElements()) 1883 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1884 else 1885 B.buildUndef(Dst); 1886 1887 MI.eraseFromParent(); 1888 return true; 1889 } 1890 1891 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1892 MachineInstr &MI, MachineRegisterInfo &MRI, 1893 MachineIRBuilder &B) const { 1894 // TODO: Should move some of this into LegalizerHelper. 1895 1896 // TODO: Promote dynamic indexing of s16 to s32 1897 1898 // FIXME: Artifact combiner probably should have replaced the truncated 1899 // constant before this, so we shouldn't need 1900 // getConstantVRegValWithLookThrough. 1901 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1902 MI.getOperand(3).getReg(), MRI); 1903 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1904 return true; 1905 1906 Register Dst = MI.getOperand(0).getReg(); 1907 Register Vec = MI.getOperand(1).getReg(); 1908 Register Ins = MI.getOperand(2).getReg(); 1909 1910 LLT VecTy = MRI.getType(Vec); 1911 LLT EltTy = VecTy.getElementType(); 1912 assert(EltTy == MRI.getType(Ins)); 1913 1914 if (IdxVal->Value < VecTy.getNumElements()) 1915 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1916 else 1917 B.buildUndef(Dst); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1924 MachineInstr &MI, MachineRegisterInfo &MRI, 1925 MachineIRBuilder &B) const { 1926 const LLT V2S16 = LLT::vector(2, 16); 1927 1928 Register Dst = MI.getOperand(0).getReg(); 1929 Register Src0 = MI.getOperand(1).getReg(); 1930 LLT DstTy = MRI.getType(Dst); 1931 LLT SrcTy = MRI.getType(Src0); 1932 1933 if (SrcTy == V2S16 && DstTy == V2S16 && 1934 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1935 return true; 1936 1937 MachineIRBuilder HelperBuilder(MI); 1938 GISelObserverWrapper DummyObserver; 1939 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1940 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1941 } 1942 1943 bool AMDGPULegalizerInfo::legalizeSinCos( 1944 MachineInstr &MI, MachineRegisterInfo &MRI, 1945 MachineIRBuilder &B) const { 1946 1947 Register DstReg = MI.getOperand(0).getReg(); 1948 Register SrcReg = MI.getOperand(1).getReg(); 1949 LLT Ty = MRI.getType(DstReg); 1950 unsigned Flags = MI.getFlags(); 1951 1952 Register TrigVal; 1953 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1954 if (ST.hasTrigReducedRange()) { 1955 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1956 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1957 .addUse(MulVal.getReg(0)) 1958 .setMIFlags(Flags).getReg(0); 1959 } else 1960 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1961 1962 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1963 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1964 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1965 .addUse(TrigVal) 1966 .setMIFlags(Flags); 1967 MI.eraseFromParent(); 1968 return true; 1969 } 1970 1971 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1972 Register DstReg, LLT PtrTy, 1973 MachineIRBuilder &B, const GlobalValue *GV, 1974 unsigned Offset, unsigned GAFlags) const { 1975 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1976 // to the following code sequence: 1977 // 1978 // For constant address space: 1979 // s_getpc_b64 s[0:1] 1980 // s_add_u32 s0, s0, $symbol 1981 // s_addc_u32 s1, s1, 0 1982 // 1983 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1984 // a fixup or relocation is emitted to replace $symbol with a literal 1985 // constant, which is a pc-relative offset from the encoding of the $symbol 1986 // operand to the global variable. 1987 // 1988 // For global address space: 1989 // s_getpc_b64 s[0:1] 1990 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1991 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1992 // 1993 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1994 // fixups or relocations are emitted to replace $symbol@*@lo and 1995 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1996 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1997 // operand to the global variable. 
1998 // 1999 // What we want here is an offset from the value returned by s_getpc 2000 // (which is the address of the s_add_u32 instruction) to the global 2001 // variable, but since the encoding of $symbol starts 4 bytes after the start 2002 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2003 // small. This requires us to add 4 to the global variable offset in order to 2004 // compute the correct address. 2005 2006 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2007 2008 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2009 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2010 2011 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2012 .addDef(PCReg); 2013 2014 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2015 if (GAFlags == SIInstrInfo::MO_NONE) 2016 MIB.addImm(0); 2017 else 2018 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2019 2020 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2021 2022 if (PtrTy.getSizeInBits() == 32) 2023 B.buildExtract(DstReg, PCReg, 0); 2024 return true; 2025 } 2026 2027 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2028 MachineInstr &MI, MachineRegisterInfo &MRI, 2029 MachineIRBuilder &B) const { 2030 Register DstReg = MI.getOperand(0).getReg(); 2031 LLT Ty = MRI.getType(DstReg); 2032 unsigned AS = Ty.getAddressSpace(); 2033 2034 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2035 MachineFunction &MF = B.getMF(); 2036 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2037 2038 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2039 if (!MFI->isEntryFunction()) { 2040 const Function &Fn = MF.getFunction(); 2041 DiagnosticInfoUnsupported BadLDSDecl( 2042 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2043 DS_Warning); 2044 Fn.getContext().diagnose(BadLDSDecl); 2045 2046 // We currently don't have a way to correctly allocate LDS objects that 2047 // aren't directly associated with a kernel. We do force inlining of 2048 // functions that use local objects. However, if these dead functions are 2049 // not eliminated, we don't want a compile time error. Just emit a warning 2050 // and a trap, since there should be no callable path here. 2051 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2052 B.buildUndef(DstReg); 2053 MI.eraseFromParent(); 2054 return true; 2055 } 2056 2057 // TODO: We could emit code to handle the initialization somewhere. 
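    // An uninitialized LDS global is lowered to a constant offset into the
    // kernel's LDS allocation, or left in place with an absolute 32-bit
    // relocation when a constant address should not be used.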
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
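  // G_FMAD is only kept legal when denormals are flushed for the result type;
  // otherwise it is lowered to a multiply and add by the LegalizerHelper.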
2135 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2136 return true; 2137 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2138 return true; 2139 2140 MachineIRBuilder HelperBuilder(MI); 2141 GISelObserverWrapper DummyObserver; 2142 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2143 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2144 } 2145 2146 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2147 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2148 Register DstReg = MI.getOperand(0).getReg(); 2149 Register PtrReg = MI.getOperand(1).getReg(); 2150 Register CmpVal = MI.getOperand(2).getReg(); 2151 Register NewVal = MI.getOperand(3).getReg(); 2152 2153 assert(SITargetLowering::isFlatGlobalAddrSpace( 2154 MRI.getType(PtrReg).getAddressSpace()) && 2155 "this should not have been custom lowered"); 2156 2157 LLT ValTy = MRI.getType(CmpVal); 2158 LLT VecTy = LLT::vector(2, ValTy); 2159 2160 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2161 2162 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2163 .addDef(DstReg) 2164 .addUse(PtrReg) 2165 .addUse(PackedVal) 2166 .setMemRefs(MI.memoperands()); 2167 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 bool AMDGPULegalizerInfo::legalizeFlog( 2173 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2174 Register Dst = MI.getOperand(0).getReg(); 2175 Register Src = MI.getOperand(1).getReg(); 2176 LLT Ty = B.getMRI()->getType(Dst); 2177 unsigned Flags = MI.getFlags(); 2178 2179 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2180 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2181 2182 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2183 MI.eraseFromParent(); 2184 return true; 2185 } 2186 2187 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2188 MachineIRBuilder &B) const { 2189 Register Dst = MI.getOperand(0).getReg(); 2190 Register Src = MI.getOperand(1).getReg(); 2191 unsigned Flags = MI.getFlags(); 2192 LLT Ty = B.getMRI()->getType(Dst); 2193 2194 auto K = B.buildFConstant(Ty, numbers::log2e); 2195 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2196 B.buildFExp2(Dst, Mul, Flags); 2197 MI.eraseFromParent(); 2198 return true; 2199 } 2200 2201 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2202 MachineIRBuilder &B) const { 2203 Register Dst = MI.getOperand(0).getReg(); 2204 Register Src0 = MI.getOperand(1).getReg(); 2205 Register Src1 = MI.getOperand(2).getReg(); 2206 unsigned Flags = MI.getFlags(); 2207 LLT Ty = B.getMRI()->getType(Dst); 2208 const LLT S16 = LLT::scalar(16); 2209 const LLT S32 = LLT::scalar(32); 2210 2211 if (Ty == S32) { 2212 auto Log = B.buildFLog2(S32, Src0, Flags); 2213 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2214 .addUse(Log.getReg(0)) 2215 .addUse(Src1) 2216 .setMIFlags(Flags); 2217 B.buildFExp2(Dst, Mul, Flags); 2218 } else if (Ty == S16) { 2219 // There's no f16 fmul_legacy, so we need to convert for it. 
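    // pow(x, y) is expanded as exp2(log2(x) * y); the legacy multiply is done
    // at f32 and the product truncated back to f16 before the exp2.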
2220 auto Log = B.buildFLog2(S16, Src0, Flags); 2221 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2222 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2223 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2224 .addUse(Ext0.getReg(0)) 2225 .addUse(Ext1.getReg(0)) 2226 .setMIFlags(Flags); 2227 2228 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2229 } else 2230 return false; 2231 2232 MI.eraseFromParent(); 2233 return true; 2234 } 2235 2236 // Find a source register, ignoring any possible source modifiers. 2237 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2238 Register ModSrc = OrigSrc; 2239 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2240 ModSrc = SrcFNeg->getOperand(1).getReg(); 2241 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2242 ModSrc = SrcFAbs->getOperand(1).getReg(); 2243 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2244 ModSrc = SrcFAbs->getOperand(1).getReg(); 2245 return ModSrc; 2246 } 2247 2248 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2249 MachineRegisterInfo &MRI, 2250 MachineIRBuilder &B) const { 2251 2252 const LLT S1 = LLT::scalar(1); 2253 const LLT S64 = LLT::scalar(64); 2254 Register Dst = MI.getOperand(0).getReg(); 2255 Register OrigSrc = MI.getOperand(1).getReg(); 2256 unsigned Flags = MI.getFlags(); 2257 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2258 "this should not have been custom lowered"); 2259 2260 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2261 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2262 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2263 // V_FRACT bug is: 2264 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2265 // 2266 // Convert floor(x) to (x - fract(x)) 2267 2268 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2269 .addUse(OrigSrc) 2270 .setMIFlags(Flags); 2271 2272 // Give source modifier matching some assistance before obscuring a foldable 2273 // pattern. 2274 2275 // TODO: We can avoid the neg on the fract? The input sign to fract 2276 // shouldn't matter? 2277 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2278 2279 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2280 2281 Register Min = MRI.createGenericVirtualRegister(S64); 2282 2283 // We don't need to concern ourselves with the snan handling difference, so 2284 // use the one which will directly select. 2285 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2286 if (MFI->getMode().IEEE) 2287 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2288 else 2289 B.buildFMinNum(Min, Fract, Const, Flags); 2290 2291 Register CorrectedFract = Min; 2292 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2293 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2294 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2295 } 2296 2297 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2298 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2299 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 // Turn an illegal packed v2s16 build vector into bit operations. 2305 // TODO: This should probably be a bitcast action in LegalizerHelper. 
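// The two s16 sources are merged into a single s32 and bitcast to v2s16,
// which avoids materializing an illegal 16-bit build_vector.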
2306 bool AMDGPULegalizerInfo::legalizeBuildVector( 2307 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2308 Register Dst = MI.getOperand(0).getReg(); 2309 const LLT S32 = LLT::scalar(32); 2310 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2311 2312 Register Src0 = MI.getOperand(1).getReg(); 2313 Register Src1 = MI.getOperand(2).getReg(); 2314 assert(MRI.getType(Src0) == LLT::scalar(16)); 2315 2316 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2317 B.buildBitcast(Dst, Merge); 2318 2319 MI.eraseFromParent(); 2320 return true; 2321 } 2322 2323 // Return the use branch instruction, otherwise null if the usage is invalid. 2324 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2325 MachineRegisterInfo &MRI, 2326 MachineInstr *&Br, 2327 MachineBasicBlock *&UncondBrTarget) { 2328 Register CondDef = MI.getOperand(0).getReg(); 2329 if (!MRI.hasOneNonDBGUse(CondDef)) 2330 return nullptr; 2331 2332 MachineBasicBlock *Parent = MI.getParent(); 2333 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2334 if (UseMI.getParent() != Parent || 2335 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2336 return nullptr; 2337 2338 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2339 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2340 if (Next == Parent->end()) { 2341 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2342 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2343 return nullptr; 2344 UncondBrTarget = &*NextMBB; 2345 } else { 2346 if (Next->getOpcode() != AMDGPU::G_BR) 2347 return nullptr; 2348 Br = &*Next; 2349 UncondBrTarget = Br->getOperand(0).getMBB(); 2350 } 2351 2352 return &UseMI; 2353 } 2354 2355 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2356 MachineRegisterInfo &MRI, 2357 Register LiveIn, 2358 Register PhyReg) const { 2359 assert(PhyReg.isPhysical() && "Physical register expected"); 2360 2361 // Insert the live-in copy, if required, by defining destination virtual 2362 // register. 2363 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
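  // Only build the copy once; if the live-in virtual register already has a
  // def, the entry-block copy was inserted by an earlier call.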
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to the physical
  // register (to be inserted later), inserting the live-in copy from the
  // physical register to the virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
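    // Masked arguments (e.g. the packed workitem IDs) are unpacked by shifting
    // right by the mask's trailing zero count and then masking with the
    // shifted mask.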
2431 const LLT S32 = LLT::scalar(32); 2432 const unsigned Mask = Arg->getMask(); 2433 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2434 2435 Register AndMaskSrc = LiveIn; 2436 2437 if (Shift != 0) { 2438 auto ShiftAmt = B.buildConstant(S32, Shift); 2439 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2440 } 2441 2442 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2443 } else { 2444 B.buildCopy(DstReg, LiveIn); 2445 } 2446 2447 return true; 2448 } 2449 2450 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2451 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2452 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2453 2454 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2455 if (!Arg) 2456 return false; 2457 2458 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2459 return false; 2460 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2466 MachineRegisterInfo &MRI, 2467 MachineIRBuilder &B) const { 2468 Register Dst = MI.getOperand(0).getReg(); 2469 LLT DstTy = MRI.getType(Dst); 2470 LLT S16 = LLT::scalar(16); 2471 LLT S32 = LLT::scalar(32); 2472 LLT S64 = LLT::scalar(64); 2473 2474 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2475 return true; 2476 2477 if (DstTy == S16) 2478 return legalizeFDIV16(MI, MRI, B); 2479 if (DstTy == S32) 2480 return legalizeFDIV32(MI, MRI, B); 2481 if (DstTy == S64) 2482 return legalizeFDIV64(MI, MRI, B); 2483 2484 return false; 2485 } 2486 2487 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2488 const LLT S32 = LLT::scalar(32); 2489 2490 auto Cvt0 = B.buildUITOFP(S32, Src); 2491 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2492 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2493 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2494 return B.buildFPTOUI(S32, Mul).getReg(0); 2495 } 2496 2497 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2498 Register DstReg, 2499 Register Num, 2500 Register Den, 2501 bool IsRem) const { 2502 const LLT S1 = LLT::scalar(1); 2503 const LLT S32 = LLT::scalar(32); 2504 2505 // RCP = URECIP(Den) = 2^32 / Den + e 2506 // e is rounding error. 2507 auto RCP = buildDivRCP(B, Den); 2508 2509 // RCP_LO = mul(RCP, Den) 2510 auto RCP_LO = B.buildMul(S32, RCP, Den); 2511 2512 // RCP_HI = mulhu (RCP, Den) */ 2513 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2514 2515 // NEG_RCP_LO = -RCP_LO 2516 auto Zero = B.buildConstant(S32, 0); 2517 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2518 2519 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2520 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2521 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2522 2523 // Calculate the rounding error from the URECIP instruction 2524 // E = mulhu(ABS_RCP_LO, RCP) 2525 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2526 2527 // RCP_A_E = RCP + E 2528 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2529 2530 // RCP_S_E = RCP - E 2531 auto RCP_S_E = B.buildSub(S32, RCP, E); 2532 2533 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
    B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1, B.buildFConstant(S32,
BitsToFloat(0x2f800000))); 2630 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2631 2632 // -(2**32) 2633 auto Mad2 = B.buildFMAD(S32, Trunc, 2634 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2635 2636 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2637 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2638 2639 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2640 } 2641 2642 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2643 MachineRegisterInfo &MRI, 2644 MachineIRBuilder &B) const { 2645 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2646 const LLT S32 = LLT::scalar(32); 2647 const LLT S64 = LLT::scalar(64); 2648 const LLT S1 = LLT::scalar(1); 2649 Register Numer = MI.getOperand(1).getReg(); 2650 Register Denom = MI.getOperand(2).getReg(); 2651 Register RcpLo, RcpHi; 2652 2653 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2654 2655 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2656 2657 auto Zero64 = B.buildConstant(S64, 0); 2658 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2659 2660 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2661 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2662 2663 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2664 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2665 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2666 2667 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2668 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2669 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2670 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2671 2672 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2673 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2674 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2675 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2676 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2677 2678 auto Zero32 = B.buildConstant(S32, 0); 2679 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2680 auto Add2_HiC = 2681 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2682 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2683 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2684 2685 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2686 Register NumerLo = UnmergeNumer.getReg(0); 2687 Register NumerHi = UnmergeNumer.getReg(1); 2688 2689 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2690 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2691 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2692 Register Mul3_Lo = UnmergeMul3.getReg(0); 2693 Register Mul3_Hi = UnmergeMul3.getReg(1); 2694 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2695 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2696 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2697 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2698 2699 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2700 Register DenomLo = UnmergeDenom.getReg(0); 2701 Register DenomHi = UnmergeDenom.getReg(1); 2702 2703 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2704 auto C1 = B.buildSExt(S32, CmpHi); 2705 2706 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2707 auto C2 = B.buildSExt(S32, CmpLo); 2708 2709 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2710 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2711 2712 // TODO: Here and below portions of the code can be enclosed into if/endif. 2713 // Currently control flow is unconditional and we have 4 selects after 2714 // potential endif to substitute PHIs. 
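  // The quotient estimate may still be short by one or two, so two rounds of
  // remainder/denominator comparison follow, each conditionally bumping the
  // quotient (Add3/Add4) or reducing the remainder (Sub2/Sub3).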
  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
    S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
      B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
      B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
    return legalizeSDIV_SREM32(MI, MRI, B);
  return false;
}

bool
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2816 MachineRegisterInfo &MRI, 2817 MachineIRBuilder &B) const { 2818 Register Res = MI.getOperand(0).getReg(); 2819 Register LHS = MI.getOperand(1).getReg(); 2820 Register RHS = MI.getOperand(2).getReg(); 2821 2822 uint16_t Flags = MI.getFlags(); 2823 2824 LLT ResTy = MRI.getType(Res); 2825 LLT S32 = LLT::scalar(32); 2826 LLT S64 = LLT::scalar(64); 2827 2828 const MachineFunction &MF = B.getMF(); 2829 bool Unsafe = 2830 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2831 2832 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2833 return false; 2834 2835 if (!Unsafe && ResTy == S32 && 2836 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2837 return false; 2838 2839 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2840 // 1 / x -> RCP(x) 2841 if (CLHS->isExactlyValue(1.0)) { 2842 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2843 .addUse(RHS) 2844 .setMIFlags(Flags); 2845 2846 MI.eraseFromParent(); 2847 return true; 2848 } 2849 2850 // -1 / x -> RCP( FNEG(x) ) 2851 if (CLHS->isExactlyValue(-1.0)) { 2852 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2853 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2854 .addUse(FNeg.getReg(0)) 2855 .setMIFlags(Flags); 2856 2857 MI.eraseFromParent(); 2858 return true; 2859 } 2860 } 2861 2862 // x / y -> x * (1.0 / y) 2863 if (Unsafe) { 2864 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2865 .addUse(RHS) 2866 .setMIFlags(Flags); 2867 B.buildFMul(Res, LHS, RCP, Flags); 2868 2869 MI.eraseFromParent(); 2870 return true; 2871 } 2872 2873 return false; 2874 } 2875 2876 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2877 MachineRegisterInfo &MRI, 2878 MachineIRBuilder &B) const { 2879 Register Res = MI.getOperand(0).getReg(); 2880 Register LHS = MI.getOperand(1).getReg(); 2881 Register RHS = MI.getOperand(2).getReg(); 2882 2883 uint16_t Flags = MI.getFlags(); 2884 2885 LLT S16 = LLT::scalar(16); 2886 LLT S32 = LLT::scalar(32); 2887 2888 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2889 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2890 2891 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2892 .addUse(RHSExt.getReg(0)) 2893 .setMIFlags(Flags); 2894 2895 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2896 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2897 2898 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2899 .addUse(RDst.getReg(0)) 2900 .addUse(RHS) 2901 .addUse(LHS) 2902 .setMIFlags(Flags); 2903 2904 MI.eraseFromParent(); 2905 return true; 2906 } 2907 2908 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2909 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2910 static void toggleSPDenormMode(bool Enable, 2911 MachineIRBuilder &B, 2912 const GCNSubtarget &ST, 2913 AMDGPU::SIModeRegisterDefaults Mode) { 2914 // Set SP denorm mode to this value. 2915 unsigned SPDenormMode = 2916 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2917 2918 if (ST.hasDenormModeInst()) { 2919 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2920 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2921 2922 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2923 B.buildInstr(AMDGPU::S_DENORM_MODE) 2924 .addImm(NewDenormModeValue); 2925 2926 } else { 2927 // Select FP32 bit field in mode register. 
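    // Only the two FP32 denorm bits of the MODE register (offset 4, width 2)
    // are rewritten here, leaving the FP64/FP16 denorm field untouched.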
2928 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2929 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2930 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2931 2932 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2933 .addImm(SPDenormMode) 2934 .addImm(SPDenormModeBitField); 2935 } 2936 } 2937 2938 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2939 MachineRegisterInfo &MRI, 2940 MachineIRBuilder &B) const { 2941 Register Res = MI.getOperand(0).getReg(); 2942 Register LHS = MI.getOperand(1).getReg(); 2943 Register RHS = MI.getOperand(2).getReg(); 2944 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2945 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2946 2947 uint16_t Flags = MI.getFlags(); 2948 2949 LLT S32 = LLT::scalar(32); 2950 LLT S1 = LLT::scalar(1); 2951 2952 auto One = B.buildFConstant(S32, 1.0f); 2953 2954 auto DenominatorScaled = 2955 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2956 .addUse(LHS) 2957 .addUse(RHS) 2958 .addImm(0) 2959 .setMIFlags(Flags); 2960 auto NumeratorScaled = 2961 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2962 .addUse(LHS) 2963 .addUse(RHS) 2964 .addImm(1) 2965 .setMIFlags(Flags); 2966 2967 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2968 .addUse(DenominatorScaled.getReg(0)) 2969 .setMIFlags(Flags); 2970 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2971 2972 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2973 // aren't modeled as reading it. 2974 if (!Mode.allFP32Denormals()) 2975 toggleSPDenormMode(true, B, ST, Mode); 2976 2977 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2978 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2979 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2980 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2981 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2982 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2983 2984 if (!Mode.allFP32Denormals()) 2985 toggleSPDenormMode(false, B, ST, Mode); 2986 2987 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2988 .addUse(Fma4.getReg(0)) 2989 .addUse(Fma1.getReg(0)) 2990 .addUse(Fma3.getReg(0)) 2991 .addUse(NumeratorScaled.getReg(1)) 2992 .setMIFlags(Flags); 2993 2994 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2995 .addUse(Fmas.getReg(0)) 2996 .addUse(RHS) 2997 .addUse(LHS) 2998 .setMIFlags(Flags); 2999 3000 MI.eraseFromParent(); 3001 return true; 3002 } 3003 3004 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3005 MachineRegisterInfo &MRI, 3006 MachineIRBuilder &B) const { 3007 Register Res = MI.getOperand(0).getReg(); 3008 Register LHS = MI.getOperand(1).getReg(); 3009 Register RHS = MI.getOperand(2).getReg(); 3010 3011 uint16_t Flags = MI.getFlags(); 3012 3013 LLT S64 = LLT::scalar(64); 3014 LLT S1 = LLT::scalar(1); 3015 3016 auto One = B.buildFConstant(S64, 1.0); 3017 3018 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3019 .addUse(LHS) 3020 .addUse(RHS) 3021 .addImm(0) 3022 .setMIFlags(Flags); 3023 3024 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3025 3026 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3027 .addUse(DivScale0.getReg(0)) 3028 .setMIFlags(Flags); 3029 3030 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3031 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3032 auto Fma2 = B.buildFMA(S64, 
NegDivScale0, Fma1, One, Flags); 3033 3034 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3035 .addUse(LHS) 3036 .addUse(RHS) 3037 .addImm(1) 3038 .setMIFlags(Flags); 3039 3040 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3041 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3042 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3043 3044 Register Scale; 3045 if (!ST.hasUsableDivScaleConditionOutput()) { 3046 // Workaround a hardware bug on SI where the condition output from div_scale 3047 // is not usable. 3048 3049 LLT S32 = LLT::scalar(32); 3050 3051 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3052 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3053 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3054 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3055 3056 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3057 Scale1Unmerge.getReg(1)); 3058 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3059 Scale0Unmerge.getReg(1)); 3060 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3061 } else { 3062 Scale = DivScale1.getReg(1); 3063 } 3064 3065 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3066 .addUse(Fma4.getReg(0)) 3067 .addUse(Fma3.getReg(0)) 3068 .addUse(Mul.getReg(0)) 3069 .addUse(Scale) 3070 .setMIFlags(Flags); 3071 3072 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3073 .addUse(Fmas.getReg(0)) 3074 .addUse(RHS) 3075 .addUse(LHS) 3076 .setMIFlags(Flags); 3077 3078 MI.eraseFromParent(); 3079 return true; 3080 } 3081 3082 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3083 MachineRegisterInfo &MRI, 3084 MachineIRBuilder &B) const { 3085 Register Res = MI.getOperand(0).getReg(); 3086 Register LHS = MI.getOperand(2).getReg(); 3087 Register RHS = MI.getOperand(3).getReg(); 3088 uint16_t Flags = MI.getFlags(); 3089 3090 LLT S32 = LLT::scalar(32); 3091 LLT S1 = LLT::scalar(1); 3092 3093 auto Abs = B.buildFAbs(S32, RHS, Flags); 3094 const APFloat C0Val(1.0f); 3095 3096 auto C0 = B.buildConstant(S32, 0x6f800000); 3097 auto C1 = B.buildConstant(S32, 0x2f800000); 3098 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3099 3100 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3101 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3102 3103 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3104 3105 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3106 .addUse(Mul0.getReg(0)) 3107 .setMIFlags(Flags); 3108 3109 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3110 3111 B.buildFMul(Res, Sel, Mul1, Flags); 3112 3113 MI.eraseFromParent(); 3114 return true; 3115 } 3116 3117 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3118 MachineRegisterInfo &MRI, 3119 MachineIRBuilder &B) const { 3120 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3121 if (!MFI->isEntryFunction()) { 3122 return legalizePreloadedArgIntrin(MI, MRI, B, 3123 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3124 } 3125 3126 uint64_t Offset = 3127 ST.getTargetLowering()->getImplicitParameterOffset( 3128 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3129 Register DstReg = MI.getOperand(0).getReg(); 3130 LLT DstTy = MRI.getType(DstReg); 3131 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3132 3133 const ArgDescriptor *Arg; 3134 const TargetRegisterClass *RC; 3135 std::tie(Arg, RC) 3136 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3137 
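  // Implicit arguments are addressed relative to the kernarg segment pointer,
  // so that preloaded argument must be available.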
if (!Arg) 3138 return false; 3139 3140 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3141 if (!loadInputValue(KernargPtrReg, B, Arg)) 3142 return false; 3143 3144 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3145 MI.eraseFromParent(); 3146 return true; 3147 } 3148 3149 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3150 MachineRegisterInfo &MRI, 3151 MachineIRBuilder &B, 3152 unsigned AddrSpace) const { 3153 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3154 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3155 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3156 MI.eraseFromParent(); 3157 return true; 3158 } 3159 3160 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3161 // offset (the offset that is included in bounds checking and swizzling, to be 3162 // split between the instruction's voffset and immoffset fields) and soffset 3163 // (the offset that is excluded from bounds checking and swizzling, to go in 3164 // the instruction's soffset field). This function takes the first kind of 3165 // offset and figures out how to split it between voffset and immoffset. 3166 std::tuple<Register, unsigned, unsigned> 3167 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3168 Register OrigOffset) const { 3169 const unsigned MaxImm = 4095; 3170 Register BaseReg; 3171 unsigned TotalConstOffset; 3172 MachineInstr *OffsetDef; 3173 const LLT S32 = LLT::scalar(32); 3174 3175 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3176 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3177 3178 unsigned ImmOffset = TotalConstOffset; 3179 3180 // If the immediate value is too big for the immoffset field, put the value 3181 // and -4096 into the immoffset field so that the value that is copied/added 3182 // for the voffset field is a multiple of 4096, and it stands more chance 3183 // of being CSEd with the copy/add for another similar load/store. 3184 // However, do not do that rounding down to a multiple of 4096 if that is a 3185 // negative number, as it appears to be illegal to have a negative offset 3186 // in the vgpr, even if adding the immediate offset makes it positive. 3187 unsigned Overflow = ImmOffset & ~MaxImm; 3188 ImmOffset -= Overflow; 3189 if ((int32_t)Overflow < 0) { 3190 Overflow += ImmOffset; 3191 ImmOffset = 0; 3192 } 3193 3194 if (Overflow != 0) { 3195 if (!BaseReg) { 3196 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3197 } else { 3198 auto OverflowVal = B.buildConstant(S32, Overflow); 3199 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3200 } 3201 } 3202 3203 if (!BaseReg) 3204 BaseReg = B.buildConstant(S32, 0).getReg(0); 3205 3206 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3207 } 3208 3209 /// Handle register layout difference for f16 images for some subtargets. 
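/// With unpacked D16 VMEM each 16-bit element occupies the low half of its own
/// 32-bit register, so a packed f16 vector source must be widened element by
/// element.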
3210 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3211 MachineRegisterInfo &MRI, 3212 Register Reg) const { 3213 if (!ST.hasUnpackedD16VMem()) 3214 return Reg; 3215 3216 const LLT S16 = LLT::scalar(16); 3217 const LLT S32 = LLT::scalar(32); 3218 LLT StoreVT = MRI.getType(Reg); 3219 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3220 3221 auto Unmerge = B.buildUnmerge(S16, Reg); 3222 3223 SmallVector<Register, 4> WideRegs; 3224 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3225 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3226 3227 int NumElts = StoreVT.getNumElements(); 3228 3229 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3230 } 3231 3232 Register AMDGPULegalizerInfo::fixStoreSourceType( 3233 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3234 MachineRegisterInfo *MRI = B.getMRI(); 3235 LLT Ty = MRI->getType(VData); 3236 3237 const LLT S16 = LLT::scalar(16); 3238 3239 // Fixup illegal register types for i8 stores. 3240 if (Ty == LLT::scalar(8) || Ty == S16) { 3241 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3242 return AnyExt; 3243 } 3244 3245 if (Ty.isVector()) { 3246 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3247 if (IsFormat) 3248 return handleD16VData(B, *MRI, VData); 3249 } 3250 } 3251 3252 return VData; 3253 } 3254 3255 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3256 MachineRegisterInfo &MRI, 3257 MachineIRBuilder &B, 3258 bool IsTyped, 3259 bool IsFormat) const { 3260 Register VData = MI.getOperand(1).getReg(); 3261 LLT Ty = MRI.getType(VData); 3262 LLT EltTy = Ty.getScalarType(); 3263 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3264 const LLT S32 = LLT::scalar(32); 3265 3266 VData = fixStoreSourceType(B, VData, IsFormat); 3267 Register RSrc = MI.getOperand(2).getReg(); 3268 3269 MachineMemOperand *MMO = *MI.memoperands_begin(); 3270 const int MemSize = MMO->getSize(); 3271 3272 unsigned ImmOffset; 3273 unsigned TotalOffset; 3274 3275 // The typed intrinsics add an immediate after the registers. 3276 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3277 3278 // The struct intrinsic variants add one additional operand over raw. 3279 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3280 Register VIndex; 3281 int OpOffset = 0; 3282 if (HasVIndex) { 3283 VIndex = MI.getOperand(3).getReg(); 3284 OpOffset = 1; 3285 } 3286 3287 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3288 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3289 3290 unsigned Format = 0; 3291 if (IsTyped) { 3292 Format = MI.getOperand(5 + OpOffset).getImm(); 3293 ++OpOffset; 3294 } 3295 3296 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3297 3298 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3299 if (TotalOffset != 0) 3300 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3301 3302 unsigned Opc; 3303 if (IsTyped) { 3304 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3305 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3306 } else if (IsFormat) { 3307 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3308 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3309 } else { 3310 switch (MemSize) { 3311 case 1: 3312 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3313 break; 3314 case 2: 3315 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3316 break; 3317 default: 3318 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3319 break; 3320 } 3321 } 3322 3323 if (!VIndex) 3324 VIndex = B.buildConstant(S32, 0).getReg(0); 3325 3326 auto MIB = B.buildInstr(Opc) 3327 .addUse(VData) // vdata 3328 .addUse(RSrc) // rsrc 3329 .addUse(VIndex) // vindex 3330 .addUse(VOffset) // voffset 3331 .addUse(SOffset) // soffset 3332 .addImm(ImmOffset); // offset(imm) 3333 3334 if (IsTyped) 3335 MIB.addImm(Format); 3336 3337 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3338 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3339 .addMemOperand(MMO); 3340 3341 MI.eraseFromParent(); 3342 return true; 3343 } 3344 3345 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3346 MachineRegisterInfo &MRI, 3347 MachineIRBuilder &B, 3348 bool IsFormat, 3349 bool IsTyped) const { 3350 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3351 MachineMemOperand *MMO = *MI.memoperands_begin(); 3352 const int MemSize = MMO->getSize(); 3353 const LLT S32 = LLT::scalar(32); 3354 3355 Register Dst = MI.getOperand(0).getReg(); 3356 Register RSrc = MI.getOperand(2).getReg(); 3357 3358 // The typed intrinsics add an immediate after the registers. 3359 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3360 3361 // The struct intrinsic variants add one additional operand over raw. 3362 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3363 Register VIndex; 3364 int OpOffset = 0; 3365 if (HasVIndex) { 3366 VIndex = MI.getOperand(3).getReg(); 3367 OpOffset = 1; 3368 } 3369 3370 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3371 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3372 3373 unsigned Format = 0; 3374 if (IsTyped) { 3375 Format = MI.getOperand(5 + OpOffset).getImm(); 3376 ++OpOffset; 3377 } 3378 3379 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3380 unsigned ImmOffset; 3381 unsigned TotalOffset; 3382 3383 LLT Ty = MRI.getType(Dst); 3384 LLT EltTy = Ty.getScalarType(); 3385 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3386 const bool Unpacked = ST.hasUnpackedD16VMem(); 3387 3388 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3389 if (TotalOffset != 0) 3390 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3391 3392 unsigned Opc; 3393 3394 if (IsTyped) { 3395 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3396 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3397 } else if (IsFormat) { 3398 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
          AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened to 32 bits for the extending load; truncate it
    // back down to the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3469 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3470 B.buildInstr(Opc) 3471 .addDef(MI.getOperand(0).getReg()) 3472 .addUse(MI.getOperand(2).getReg()) 3473 .addUse(MI.getOperand(3).getReg()) 3474 .cloneMemRefs(MI); 3475 MI.eraseFromParent(); 3476 return true; 3477 } 3478 3479 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3480 switch (IntrID) { 3481 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3482 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3483 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3484 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3485 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3486 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3487 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3488 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3489 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3490 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3491 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3492 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3493 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3494 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3496 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3497 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3498 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3499 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3500 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3501 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3502 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3503 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3504 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3505 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3506 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3507 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3508 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3509 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3511 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3512 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3513 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3514 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3515 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3516 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3517 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3518 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3519 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3520 default: 3521 llvm_unreachable("unhandled atomic opcode"); 3522 } 3523 } 3524 3525 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3526 MachineIRBuilder &B, 3527 Intrinsic::ID IID) const { 3528 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3529 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3530 3531 Register Dst = MI.getOperand(0).getReg(); 3532 Register VData = MI.getOperand(2).getReg(); 3533 3534 Register CmpVal; 3535 int OpOffset = 0; 3536 3537 if (IsCmpSwap) { 3538 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3539 ++OpOffset; 3540 } 3541 3542 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3543 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3544 3545 // The struct intrinsic variants add one additional operand over raw. 
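  // For example (operand counts read off the indices used below): a raw buffer
  // atomic has (dst, intrinsic ID, vdata, rsrc, voffset, soffset, aux), i.e. 7
  // operands; the struct form inserts vindex for 8, and cmpswap adds the
  // compare value for 8 and 9 respectively.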
3546 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3547 Register VIndex; 3548 if (HasVIndex) { 3549 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3550 ++OpOffset; 3551 } 3552 3553 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3554 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3555 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3556 3557 MachineMemOperand *MMO = *MI.memoperands_begin(); 3558 3559 unsigned ImmOffset; 3560 unsigned TotalOffset; 3561 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3562 if (TotalOffset != 0) 3563 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3564 3565 if (!VIndex) 3566 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3567 3568 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3569 .addDef(Dst) 3570 .addUse(VData); // vdata 3571 3572 if (IsCmpSwap) 3573 MIB.addReg(CmpVal); 3574 3575 MIB.addUse(RSrc) // rsrc 3576 .addUse(VIndex) // vindex 3577 .addUse(VOffset) // voffset 3578 .addUse(SOffset) // soffset 3579 .addImm(ImmOffset) // offset(imm) 3580 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3581 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3582 .addMemOperand(MMO); 3583 3584 MI.eraseFromParent(); 3585 return true; 3586 } 3587 3588 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3589 /// vector with s16 typed elements. 3590 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3591 SmallVectorImpl<Register> &PackedAddrs, 3592 int AddrIdx, int DimIdx, int EndIdx, 3593 int NumGradients) { 3594 const LLT S16 = LLT::scalar(16); 3595 const LLT V2S16 = LLT::vector(2, 16); 3596 3597 for (int I = AddrIdx; I < EndIdx; ++I) { 3598 MachineOperand &SrcOp = MI.getOperand(I); 3599 if (!SrcOp.isReg()) 3600 continue; // _L to _LZ may have eliminated this. 3601 3602 Register AddrReg = SrcOp.getReg(); 3603 3604 if (I < DimIdx) { 3605 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3606 PackedAddrs.push_back(AddrReg); 3607 } else { 3608 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3609 // derivatives dx/dh and dx/dv are packed with undef. 3610 if (((I + 1) >= EndIdx) || 3611 ((NumGradients / 2) % 2 == 1 && 3612 (I == DimIdx + (NumGradients / 2) - 1 || 3613 I == DimIdx + NumGradients - 1)) || 3614 // Check for _L to _LZ optimization 3615 !MI.getOperand(I + 1).isReg()) { 3616 PackedAddrs.push_back( 3617 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3618 .getReg(0)); 3619 } else { 3620 PackedAddrs.push_back( 3621 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3622 .getReg(0)); 3623 ++I; 3624 } 3625 } 3626 } 3627 } 3628 3629 /// Convert from separate vaddr components to a single vector address register, 3630 /// and replace the remaining operands with $noreg. 3631 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3632 int DimIdx, int NumVAddrs) { 3633 const LLT S32 = LLT::scalar(32); 3634 3635 SmallVector<Register, 8> AddrRegs; 3636 for (int I = 0; I != NumVAddrs; ++I) { 3637 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3638 if (SrcOp.isReg()) { 3639 AddrRegs.push_back(SrcOp.getReg()); 3640 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3641 } 3642 } 3643 3644 int NumAddrRegs = AddrRegs.size(); 3645 if (NumAddrRegs != 1) { 3646 // Round up to 8 elements for v5-v7 3647 // FIXME: Missing intermediate sized register classes and instructions. 
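    // Illustrative example: 5, 6, or 7 address registers are padded with
    // G_IMPLICIT_DEF (undef) values up to the next power of two, 8, before
    // being packed into a single build_vector.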
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1; otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // The target must support the feature, and the gradients need to be
      // 16-bit too.
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
          IsA16 ?
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3818 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3819 PackEndIdx, NumGradients); 3820 3821 if (!IsA16) { 3822 // Add uncompressed address 3823 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3824 int AddrReg = MI.getOperand(I).getReg(); 3825 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3826 PackedRegs.push_back(AddrReg); 3827 } 3828 } 3829 3830 // See also below in the non-a16 branch 3831 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3832 3833 if (!UseNSA && PackedRegs.size() > 1) { 3834 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3835 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3836 PackedRegs[0] = Concat.getReg(0); 3837 PackedRegs.resize(1); 3838 } 3839 3840 const int NumPacked = PackedRegs.size(); 3841 for (int I = 0; I != NumVAddrs; ++I) { 3842 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3843 if (!SrcOp.isReg()) { 3844 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3845 continue; 3846 } 3847 3848 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3849 3850 if (I < NumPacked) 3851 SrcOp.setReg(PackedRegs[I]); 3852 else 3853 SrcOp.setReg(AMDGPU::NoRegister); 3854 } 3855 } 3856 } else { 3857 // If the register allocator cannot place the address registers contiguously 3858 // without introducing moves, then using the non-sequential address encoding 3859 // is always preferable, since it saves VALU instructions and is usually a 3860 // wash in terms of code size or even better. 3861 // 3862 // However, we currently have no way of hinting to the register allocator 3863 // that MIMG addresses should be placed contiguously when it is possible to 3864 // do so, so force non-NSA for the common 2-address case as a heuristic. 3865 // 3866 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3867 // allocation when possible. 3868 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3869 3870 if (!UseNSA && NumVAddrs > 1) 3871 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3872 } 3873 3874 int Flags = 0; 3875 if (IsA16) 3876 Flags |= 1; 3877 if (IsG16) 3878 Flags |= 2; 3879 MI.addOperand(MachineOperand::CreateImm(Flags)); 3880 3881 if (BaseOpcode->Store) { // No TFE for stores? 3882 // TODO: Handle dmask trim 3883 Register VData = MI.getOperand(1).getReg(); 3884 LLT Ty = MRI->getType(VData); 3885 if (!Ty.isVector() || Ty.getElementType() != S16) 3886 return true; 3887 3888 Register RepackedReg = handleD16VData(B, *MRI, VData); 3889 if (RepackedReg != VData) { 3890 MI.getOperand(1).setReg(RepackedReg); 3891 } 3892 3893 return true; 3894 } 3895 3896 Register DstReg = MI.getOperand(0).getReg(); 3897 LLT Ty = MRI->getType(DstReg); 3898 const LLT EltTy = Ty.getScalarType(); 3899 const bool IsD16 = Ty.getScalarType() == S16; 3900 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3901 3902 // Confirm that the return type is large enough for the dmask specified 3903 if (NumElts < DMaskLanes) 3904 return false; 3905 3906 if (NumElts > 4 || DMaskLanes > 4) 3907 return false; 3908 3909 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3910 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3911 3912 // The raw dword aligned data component of the load. 
// The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
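  // (Illustrative note: a single packed v2s16 data register is simply bitcast
  // to the destination below rather than built/concatenated from one source.)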
4004 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4005 B.buildBitcast(DstReg, ResultRegs[0]); 4006 return true; 4007 } 4008 4009 assert(Ty.isVector()); 4010 4011 if (IsD16) { 4012 // For packed D16 results with TFE enabled, all the data components are 4013 // S32. Cast back to the expected type. 4014 // 4015 // TODO: We don't really need to use load s32 elements. We would only need one 4016 // cast for the TFE result if a multiple of v2s16 was used. 4017 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4018 for (Register &Reg : ResultRegs) 4019 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4020 } else if (ST.hasUnpackedD16VMem()) { 4021 for (Register &Reg : ResultRegs) 4022 Reg = B.buildTrunc(S16, Reg).getReg(0); 4023 } 4024 } 4025 4026 auto padWithUndef = [&](LLT Ty, int NumElts) { 4027 if (NumElts == 0) 4028 return; 4029 Register Undef = B.buildUndef(Ty).getReg(0); 4030 for (int I = 0; I != NumElts; ++I) 4031 ResultRegs.push_back(Undef); 4032 }; 4033 4034 // Pad out any elements eliminated due to the dmask. 4035 LLT ResTy = MRI->getType(ResultRegs[0]); 4036 if (!ResTy.isVector()) { 4037 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4038 B.buildBuildVector(DstReg, ResultRegs); 4039 return true; 4040 } 4041 4042 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4043 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4044 4045 // Deal with the one annoying legal case. 4046 const LLT V3S16 = LLT::vector(3, 16); 4047 if (Ty == V3S16) { 4048 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4049 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4050 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4051 return true; 4052 } 4053 4054 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4055 B.buildConcatVectors(DstReg, ResultRegs); 4056 return true; 4057 } 4058 4059 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4060 MachineInstr &MI, MachineIRBuilder &B, 4061 GISelChangeObserver &Observer) const { 4062 Register Dst = MI.getOperand(0).getReg(); 4063 LLT Ty = B.getMRI()->getType(Dst); 4064 unsigned Size = Ty.getSizeInBits(); 4065 MachineFunction &MF = B.getMF(); 4066 4067 Observer.changingInstr(MI); 4068 4069 // FIXME: We don't really need this intermediate instruction. The intrinsic 4070 // should be fixed to have a memory operand. Since it's readnone, we're not 4071 // allowed to add one. 4072 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4073 MI.RemoveOperand(1); // Remove intrinsic ID 4074 4075 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4076 // TODO: Should this use datalayout alignment? 4077 const unsigned MemSize = (Size + 7) / 8; 4078 const Align MemAlign(4); 4079 MachineMemOperand *MMO = MF.getMachineMemOperand( 4080 MachinePointerInfo(), 4081 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4082 MachineMemOperand::MOInvariant, 4083 MemSize, MemAlign); 4084 MI.addMemOperand(MF, MMO); 4085 4086 // There are no 96-bit result scalar loads, but widening to 128-bit should 4087 // always be legal. We may need to restore this to a 96-bit result if it turns 4088 // out this needs to be converted to a vector load during RegBankSelect. 
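  // For example (illustrative): an s96 result is widened to s128 here, and a
  // <3 x s32> result is padded out to <4 x s32>.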
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
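  // (The boolean result of these intrinsics is expected to feed a G_BRCOND;
  // verifyCFIntrinsic locates that branch so the cases below can rewrite it
  // into the SI_IF / SI_ELSE / SI_LOOP pseudos.)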
4159 auto IntrID = MI.getIntrinsicID(); 4160 switch (IntrID) { 4161 case Intrinsic::amdgcn_if: 4162 case Intrinsic::amdgcn_else: { 4163 MachineInstr *Br = nullptr; 4164 MachineBasicBlock *UncondBrTarget = nullptr; 4165 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4166 const SIRegisterInfo *TRI 4167 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4168 4169 Register Def = MI.getOperand(1).getReg(); 4170 Register Use = MI.getOperand(3).getReg(); 4171 4172 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4173 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4174 if (IntrID == Intrinsic::amdgcn_if) { 4175 B.buildInstr(AMDGPU::SI_IF) 4176 .addDef(Def) 4177 .addUse(Use) 4178 .addMBB(UncondBrTarget); 4179 } else { 4180 B.buildInstr(AMDGPU::SI_ELSE) 4181 .addDef(Def) 4182 .addUse(Use) 4183 .addMBB(UncondBrTarget) 4184 .addImm(0); 4185 } 4186 4187 if (Br) { 4188 Br->getOperand(0).setMBB(CondBrTarget); 4189 } else { 4190 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4191 // since we're swapping branch targets it needs to be reinserted. 4192 // FIXME: IRTranslator should probably not do this 4193 B.buildBr(*CondBrTarget); 4194 } 4195 4196 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4197 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4198 MI.eraseFromParent(); 4199 BrCond->eraseFromParent(); 4200 return true; 4201 } 4202 4203 return false; 4204 } 4205 case Intrinsic::amdgcn_loop: { 4206 MachineInstr *Br = nullptr; 4207 MachineBasicBlock *UncondBrTarget = nullptr; 4208 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4209 const SIRegisterInfo *TRI 4210 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4211 4212 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4213 Register Reg = MI.getOperand(2).getReg(); 4214 4215 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4216 B.buildInstr(AMDGPU::SI_LOOP) 4217 .addUse(Reg) 4218 .addMBB(UncondBrTarget); 4219 4220 if (Br) 4221 Br->getOperand(0).setMBB(CondBrTarget); 4222 else 4223 B.buildBr(*CondBrTarget); 4224 4225 MI.eraseFromParent(); 4226 BrCond->eraseFromParent(); 4227 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4228 return true; 4229 } 4230 4231 return false; 4232 } 4233 case Intrinsic::amdgcn_kernarg_segment_ptr: 4234 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4235 // This only makes sense to call in a kernel, so just lower to null. 
4236 B.buildConstant(MI.getOperand(0).getReg(), 0); 4237 MI.eraseFromParent(); 4238 return true; 4239 } 4240 4241 return legalizePreloadedArgIntrin( 4242 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4243 case Intrinsic::amdgcn_implicitarg_ptr: 4244 return legalizeImplicitArgPtr(MI, MRI, B); 4245 case Intrinsic::amdgcn_workitem_id_x: 4246 return legalizePreloadedArgIntrin(MI, MRI, B, 4247 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4248 case Intrinsic::amdgcn_workitem_id_y: 4249 return legalizePreloadedArgIntrin(MI, MRI, B, 4250 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4251 case Intrinsic::amdgcn_workitem_id_z: 4252 return legalizePreloadedArgIntrin(MI, MRI, B, 4253 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4254 case Intrinsic::amdgcn_workgroup_id_x: 4255 return legalizePreloadedArgIntrin(MI, MRI, B, 4256 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4257 case Intrinsic::amdgcn_workgroup_id_y: 4258 return legalizePreloadedArgIntrin(MI, MRI, B, 4259 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4260 case Intrinsic::amdgcn_workgroup_id_z: 4261 return legalizePreloadedArgIntrin(MI, MRI, B, 4262 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4263 case Intrinsic::amdgcn_dispatch_ptr: 4264 return legalizePreloadedArgIntrin(MI, MRI, B, 4265 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4266 case Intrinsic::amdgcn_queue_ptr: 4267 return legalizePreloadedArgIntrin(MI, MRI, B, 4268 AMDGPUFunctionArgInfo::QUEUE_PTR); 4269 case Intrinsic::amdgcn_implicit_buffer_ptr: 4270 return legalizePreloadedArgIntrin( 4271 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4272 case Intrinsic::amdgcn_dispatch_id: 4273 return legalizePreloadedArgIntrin(MI, MRI, B, 4274 AMDGPUFunctionArgInfo::DISPATCH_ID); 4275 case Intrinsic::amdgcn_fdiv_fast: 4276 return legalizeFDIVFastIntrin(MI, MRI, B); 4277 case Intrinsic::amdgcn_is_shared: 4278 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4279 case Intrinsic::amdgcn_is_private: 4280 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4281 case Intrinsic::amdgcn_wavefrontsize: { 4282 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4283 MI.eraseFromParent(); 4284 return true; 4285 } 4286 case Intrinsic::amdgcn_s_buffer_load: 4287 return legalizeSBufferLoad(MI, B, Observer); 4288 case Intrinsic::amdgcn_raw_buffer_store: 4289 case Intrinsic::amdgcn_struct_buffer_store: 4290 return legalizeBufferStore(MI, MRI, B, false, false); 4291 case Intrinsic::amdgcn_raw_buffer_store_format: 4292 case Intrinsic::amdgcn_struct_buffer_store_format: 4293 return legalizeBufferStore(MI, MRI, B, false, true); 4294 case Intrinsic::amdgcn_raw_tbuffer_store: 4295 case Intrinsic::amdgcn_struct_tbuffer_store: 4296 return legalizeBufferStore(MI, MRI, B, true, true); 4297 case Intrinsic::amdgcn_raw_buffer_load: 4298 case Intrinsic::amdgcn_struct_buffer_load: 4299 return legalizeBufferLoad(MI, MRI, B, false, false); 4300 case Intrinsic::amdgcn_raw_buffer_load_format: 4301 case Intrinsic::amdgcn_struct_buffer_load_format: 4302 return legalizeBufferLoad(MI, MRI, B, true, false); 4303 case Intrinsic::amdgcn_raw_tbuffer_load: 4304 case Intrinsic::amdgcn_struct_tbuffer_load: 4305 return legalizeBufferLoad(MI, MRI, B, true, true); 4306 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4307 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4308 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4309 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4310 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4311 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4312 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4313 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4314 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4315 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4316 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4317 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4318 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4319 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4320 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4321 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4322 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4323 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4324 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4325 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4326 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4327 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4328 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4329 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4330 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4331 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4332 return legalizeBufferAtomic(MI, B, IntrID); 4333 case Intrinsic::amdgcn_atomic_inc: 4334 return legalizeAtomicIncDec(MI, B, true); 4335 case Intrinsic::amdgcn_atomic_dec: 4336 return legalizeAtomicIncDec(MI, B, false); 4337 case Intrinsic::trap: 4338 return legalizeTrapIntrinsic(MI, MRI, B); 4339 case Intrinsic::debugtrap: 4340 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4341 default: { 4342 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4343 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4344 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4345 return true; 4346 } 4347 } 4348 4349 return true; 4350 } 4351