//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
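
// Illustrative example for fewerEltsToSize64Vector: for <5 x s16> (80 bits),
// Pieces = 2 and NewNumElts = 3, so the type becomes <3 x s16> (48 bits),
// which fits within a single 64-bit piece.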

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size < 32) {
      // <2 x s8> -> s16
      assert(Size == 16);
      CoercedTy = LLT::scalar(16);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= 1024;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}
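
// Illustrative example: a truncating store of an s64 value into a 16-bit
// memory location matches the predicate above.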

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
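  // Note (explanatory): an alignment of 0 in the memory descriptors below is
  // treated as "no extra alignment requirement", so targets with unaligned
  // buffer access drop the 32/16/8-bit alignment restrictions on global and
  // constant accesses.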

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        LLT Ty = Query.Types[0];
        return Ty.isVector() &&
               isRegisterSize(Ty.getSizeInBits()) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
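              // (For example, an odd 96-bit access would be split toward a
              // 64-bit piece here on subtargets without dwordx3 load/stores.)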
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
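
// Returns a 32-bit value holding the segment aperture (the high half of the
// 64-bit flat address range) for the given LOCAL or PRIVATE address space.
// (Inferred from how the result is merged with the low pointer bits in
// legalizeAddrSpaceCast below.)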
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
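  // Adding and then subtracting a value with magnitude 2^52 forces the
  // fractional bits of Src to be rounded away (for |Src| < 2^52), which is the
  // standard round-to-nearest-integer trick; the compare against
  // 0x1.fffffffffffffp+51 below keeps already-integral large values unchanged.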
1680   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1681   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1682
1683   auto C2 = B.buildFConstant(Ty, C2Val);
1684   auto Fabs = B.buildFAbs(Ty, Src);
1685
1686   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1687   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1688   return true;
1689 }
1690
1691 bool AMDGPULegalizerInfo::legalizeFceil(
1692   MachineInstr &MI, MachineRegisterInfo &MRI,
1693   MachineIRBuilder &B) const {
1694
1695   const LLT S1 = LLT::scalar(1);
1696   const LLT S64 = LLT::scalar(64);
1697
1698   Register Src = MI.getOperand(1).getReg();
1699   assert(MRI.getType(Src) == S64);
1700
1701   // result = trunc(src)
1702   // if (src > 0.0 && src != result)
1703   //   result += 1.0
1704
1705   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1706
1707   const auto Zero = B.buildFConstant(S64, 0.0);
1708   const auto One = B.buildFConstant(S64, 1.0);
1709   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1710   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1711   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1712   auto Add = B.buildSelect(S64, And, One, Zero);
1713
1714   // TODO: Should this propagate fast-math-flags?
1715   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1716   return true;
1717 }
1718
1719 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1720                                               MachineIRBuilder &B) {
1721   const unsigned FractBits = 52;
1722   const unsigned ExpBits = 11;
1723   LLT S32 = LLT::scalar(32);
1724
1725   auto Const0 = B.buildConstant(S32, FractBits - 32);
1726   auto Const1 = B.buildConstant(S32, ExpBits);
1727
1728   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1729     .addUse(Hi).addUse(Const0.getReg(0))
1730     .addUse(Const1.getReg(0));
1731
1732   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1733 }
1734
1735 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1736   MachineInstr &MI, MachineRegisterInfo &MRI,
1737   MachineIRBuilder &B) const {
1738   const LLT S1 = LLT::scalar(1);
1739   const LLT S32 = LLT::scalar(32);
1740   const LLT S64 = LLT::scalar(64);
1741
1742   Register Src = MI.getOperand(1).getReg();
1743   assert(MRI.getType(Src) == S64);
1744
1745   // TODO: Should this use extract since the low half is unused?
1746   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1747   Register Hi = Unmerge.getReg(1);
1748
1749   // Extract the upper half, since this is where we will find the sign and
1750   // exponent.
1751   auto Exp = extractF64Exponent(Hi, B);
1752
1753   const unsigned FractBits = 52;
1754
1755   // Extract the sign bit.
1756   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1757   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1758
1759   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1760
1761   const auto Zero32 = B.buildConstant(S32, 0);
1762
1763   // Extend back to 64-bits.
1764 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1765 1766 auto Shr = B.buildAShr(S64, FractMask, Exp); 1767 auto Not = B.buildNot(S64, Shr); 1768 auto Tmp0 = B.buildAnd(S64, Src, Not); 1769 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1770 1771 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1772 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1773 1774 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1775 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1776 return true; 1777 } 1778 1779 bool AMDGPULegalizerInfo::legalizeITOFP( 1780 MachineInstr &MI, MachineRegisterInfo &MRI, 1781 MachineIRBuilder &B, bool Signed) const { 1782 1783 Register Dst = MI.getOperand(0).getReg(); 1784 Register Src = MI.getOperand(1).getReg(); 1785 1786 const LLT S64 = LLT::scalar(64); 1787 const LLT S32 = LLT::scalar(32); 1788 1789 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1790 1791 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1792 1793 auto CvtHi = Signed ? 1794 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1795 B.buildUITOFP(S64, Unmerge.getReg(1)); 1796 1797 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1798 1799 auto ThirtyTwo = B.buildConstant(S32, 32); 1800 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1801 .addUse(CvtHi.getReg(0)) 1802 .addUse(ThirtyTwo.getReg(0)); 1803 1804 // TODO: Should this propagate fast-math-flags? 1805 B.buildFAdd(Dst, LdExp, CvtLo); 1806 MI.eraseFromParent(); 1807 return true; 1808 } 1809 1810 // TODO: Copied from DAG implementation. Verify logic and document how this 1811 // actually works. 1812 bool AMDGPULegalizerInfo::legalizeFPTOI( 1813 MachineInstr &MI, MachineRegisterInfo &MRI, 1814 MachineIRBuilder &B, bool Signed) const { 1815 1816 Register Dst = MI.getOperand(0).getReg(); 1817 Register Src = MI.getOperand(1).getReg(); 1818 1819 const LLT S64 = LLT::scalar(64); 1820 const LLT S32 = LLT::scalar(32); 1821 1822 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1823 1824 unsigned Flags = MI.getFlags(); 1825 1826 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1827 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1828 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1829 1830 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1831 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1832 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1833 1834 auto Hi = Signed ? 
1835 B.buildFPTOSI(S32, FloorMul) : 1836 B.buildFPTOUI(S32, FloorMul); 1837 auto Lo = B.buildFPTOUI(S32, Fma); 1838 1839 B.buildMerge(Dst, { Lo, Hi }); 1840 MI.eraseFromParent(); 1841 1842 return true; 1843 } 1844 1845 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1846 MachineInstr &MI, MachineRegisterInfo &MRI, 1847 MachineIRBuilder &B) const { 1848 MachineFunction &MF = B.getMF(); 1849 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1850 1851 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1852 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1853 1854 // With ieee_mode disabled, the instructions have the correct behavior 1855 // already for G_FMINNUM/G_FMAXNUM 1856 if (!MFI->getMode().IEEE) 1857 return !IsIEEEOp; 1858 1859 if (IsIEEEOp) 1860 return true; 1861 1862 MachineIRBuilder HelperBuilder(MI); 1863 GISelObserverWrapper DummyObserver; 1864 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1865 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1866 } 1867 1868 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1869 MachineInstr &MI, MachineRegisterInfo &MRI, 1870 MachineIRBuilder &B) const { 1871 // TODO: Should move some of this into LegalizerHelper. 1872 1873 // TODO: Promote dynamic indexing of s16 to s32 1874 1875 // FIXME: Artifact combiner probably should have replaced the truncated 1876 // constant before this, so we shouldn't need 1877 // getConstantVRegValWithLookThrough. 1878 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1879 MI.getOperand(2).getReg(), MRI); 1880 if (!IdxVal) // Dynamic case will be selected to register indexing. 1881 return true; 1882 1883 Register Dst = MI.getOperand(0).getReg(); 1884 Register Vec = MI.getOperand(1).getReg(); 1885 1886 LLT VecTy = MRI.getType(Vec); 1887 LLT EltTy = VecTy.getElementType(); 1888 assert(EltTy == MRI.getType(Dst)); 1889 1890 if (IdxVal->Value < VecTy.getNumElements()) 1891 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1892 else 1893 B.buildUndef(Dst); 1894 1895 MI.eraseFromParent(); 1896 return true; 1897 } 1898 1899 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1900 MachineInstr &MI, MachineRegisterInfo &MRI, 1901 MachineIRBuilder &B) const { 1902 // TODO: Should move some of this into LegalizerHelper. 1903 1904 // TODO: Promote dynamic indexing of s16 to s32 1905 1906 // FIXME: Artifact combiner probably should have replaced the truncated 1907 // constant before this, so we shouldn't need 1908 // getConstantVRegValWithLookThrough. 1909 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1910 MI.getOperand(3).getReg(), MRI); 1911 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1912 return true; 1913 1914 Register Dst = MI.getOperand(0).getReg(); 1915 Register Vec = MI.getOperand(1).getReg(); 1916 Register Ins = MI.getOperand(2).getReg(); 1917 1918 LLT VecTy = MRI.getType(Vec); 1919 LLT EltTy = VecTy.getElementType(); 1920 assert(EltTy == MRI.getType(Ins)); 1921 1922 if (IdxVal->Value < VecTy.getNumElements()) 1923 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1924 else 1925 B.buildUndef(Dst); 1926 1927 MI.eraseFromParent(); 1928 return true; 1929 } 1930 1931 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1932 MachineInstr &MI, MachineRegisterInfo &MRI, 1933 MachineIRBuilder &B) const { 1934 const LLT V2S16 = LLT::vector(2, 16); 1935 1936 Register Dst = MI.getOperand(0).getReg(); 1937 Register Src0 = MI.getOperand(1).getReg(); 1938 LLT DstTy = MRI.getType(Dst); 1939 LLT SrcTy = MRI.getType(Src0); 1940 1941 if (SrcTy == V2S16 && DstTy == V2S16 && 1942 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1943 return true; 1944 1945 MachineIRBuilder HelperBuilder(MI); 1946 GISelObserverWrapper DummyObserver; 1947 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1948 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeSinCos( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, 1953 MachineIRBuilder &B) const { 1954 1955 Register DstReg = MI.getOperand(0).getReg(); 1956 Register SrcReg = MI.getOperand(1).getReg(); 1957 LLT Ty = MRI.getType(DstReg); 1958 unsigned Flags = MI.getFlags(); 1959 1960 Register TrigVal; 1961 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1962 if (ST.hasTrigReducedRange()) { 1963 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1964 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1965 .addUse(MulVal.getReg(0)) 1966 .setMIFlags(Flags).getReg(0); 1967 } else 1968 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1969 1970 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1971 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1972 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1973 .addUse(TrigVal) 1974 .setMIFlags(Flags); 1975 MI.eraseFromParent(); 1976 return true; 1977 } 1978 1979 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1980 Register DstReg, LLT PtrTy, 1981 MachineIRBuilder &B, const GlobalValue *GV, 1982 unsigned Offset, unsigned GAFlags) const { 1983 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1984 // to the following code sequence: 1985 // 1986 // For constant address space: 1987 // s_getpc_b64 s[0:1] 1988 // s_add_u32 s0, s0, $symbol 1989 // s_addc_u32 s1, s1, 0 1990 // 1991 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1992 // a fixup or relocation is emitted to replace $symbol with a literal 1993 // constant, which is a pc-relative offset from the encoding of the $symbol 1994 // operand to the global variable. 1995 // 1996 // For global address space: 1997 // s_getpc_b64 s[0:1] 1998 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1999 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2000 // 2001 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2002 // fixups or relocations are emitted to replace $symbol@*@lo and 2003 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2004 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2005 // operand to the global variable. 
2006 // 2007 // What we want here is an offset from the value returned by s_getpc 2008 // (which is the address of the s_add_u32 instruction) to the global 2009 // variable, but since the encoding of $symbol starts 4 bytes after the start 2010 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2011 // small. This requires us to add 4 to the global variable offset in order to 2012 // compute the correct address. 2013 2014 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2015 2016 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2017 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2018 2019 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2020 .addDef(PCReg); 2021 2022 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2023 if (GAFlags == SIInstrInfo::MO_NONE) 2024 MIB.addImm(0); 2025 else 2026 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2027 2028 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2029 2030 if (PtrTy.getSizeInBits() == 32) 2031 B.buildExtract(DstReg, PCReg, 0); 2032 return true; 2033 } 2034 2035 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2036 MachineInstr &MI, MachineRegisterInfo &MRI, 2037 MachineIRBuilder &B) const { 2038 Register DstReg = MI.getOperand(0).getReg(); 2039 LLT Ty = MRI.getType(DstReg); 2040 unsigned AS = Ty.getAddressSpace(); 2041 2042 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2043 MachineFunction &MF = B.getMF(); 2044 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2045 2046 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2047 if (!MFI->isEntryFunction()) { 2048 const Function &Fn = MF.getFunction(); 2049 DiagnosticInfoUnsupported BadLDSDecl( 2050 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2051 DS_Warning); 2052 Fn.getContext().diagnose(BadLDSDecl); 2053 2054 // We currently don't have a way to correctly allocate LDS objects that 2055 // aren't directly associated with a kernel. We do force inlining of 2056 // functions that use local objects. However, if these dead functions are 2057 // not eliminated, we don't want a compile time error. Just emit a warning 2058 // and a trap, since there should be no callable path here. 2059 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2060 B.buildUndef(DstReg); 2061 MI.eraseFromParent(); 2062 return true; 2063 } 2064 2065 // TODO: We could emit code to handle the initialization somewhere. 
2066     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2067       const SITargetLowering *TLI = ST.getTargetLowering();
2068       if (!TLI->shouldUseLDSConstAddress(GV)) {
2069         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2070         return true; // Leave in place.
2071       }
2072
2073       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2074       MI.eraseFromParent();
2075       return true;
2076     }
2077
2078     const Function &Fn = MF.getFunction();
2079     DiagnosticInfoUnsupported BadInit(
2080       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2081     Fn.getContext().diagnose(BadInit);
2082     return true;
2083   }
2084
2085   const SITargetLowering *TLI = ST.getTargetLowering();
2086
2087   if (TLI->shouldEmitFixup(GV)) {
2088     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2089     MI.eraseFromParent();
2090     return true;
2091   }
2092
2093   if (TLI->shouldEmitPCReloc(GV)) {
2094     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2095     MI.eraseFromParent();
2096     return true;
2097   }
2098
2099   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2100   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2101
2102   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2103     MachinePointerInfo::getGOT(MF),
2104     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2105     MachineMemOperand::MOInvariant,
2106     8 /*Size*/, Align(8));
2107
2108   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2109
2110   if (Ty.getSizeInBits() == 32) {
2111     // Truncate if this is a 32-bit constant address.
2112     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2113     B.buildExtract(DstReg, Load, 0);
2114   } else
2115     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2116
2117   MI.eraseFromParent();
2118   return true;
2119 }
2120
2121 bool AMDGPULegalizerInfo::legalizeLoad(
2122   MachineInstr &MI, MachineRegisterInfo &MRI,
2123   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2124   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2125   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2126   Observer.changingInstr(MI);
2127   MI.getOperand(1).setReg(Cast.getReg(0));
2128   Observer.changedInstr(MI);
2129   return true;
2130 }
2131
2132 bool AMDGPULegalizerInfo::legalizeFMad(
2133   MachineInstr &MI, MachineRegisterInfo &MRI,
2134   MachineIRBuilder &B) const {
2135   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2136   assert(Ty.isScalar());
2137
2138   MachineFunction &MF = B.getMF();
2139   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2140
2141   // TODO: Always legal with future ftz flag.
2142   // FIXME: Do we need just output?
2143 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2144 return true; 2145 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2146 return true; 2147 2148 MachineIRBuilder HelperBuilder(MI); 2149 GISelObserverWrapper DummyObserver; 2150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2152 } 2153 2154 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2155 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2156 Register DstReg = MI.getOperand(0).getReg(); 2157 Register PtrReg = MI.getOperand(1).getReg(); 2158 Register CmpVal = MI.getOperand(2).getReg(); 2159 Register NewVal = MI.getOperand(3).getReg(); 2160 2161 assert(SITargetLowering::isFlatGlobalAddrSpace( 2162 MRI.getType(PtrReg).getAddressSpace()) && 2163 "this should not have been custom lowered"); 2164 2165 LLT ValTy = MRI.getType(CmpVal); 2166 LLT VecTy = LLT::vector(2, ValTy); 2167 2168 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2169 2170 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2171 .addDef(DstReg) 2172 .addUse(PtrReg) 2173 .addUse(PackedVal) 2174 .setMemRefs(MI.memoperands()); 2175 2176 MI.eraseFromParent(); 2177 return true; 2178 } 2179 2180 bool AMDGPULegalizerInfo::legalizeFlog( 2181 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2182 Register Dst = MI.getOperand(0).getReg(); 2183 Register Src = MI.getOperand(1).getReg(); 2184 LLT Ty = B.getMRI()->getType(Dst); 2185 unsigned Flags = MI.getFlags(); 2186 2187 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2188 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2189 2190 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2191 MI.eraseFromParent(); 2192 return true; 2193 } 2194 2195 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2196 MachineIRBuilder &B) const { 2197 Register Dst = MI.getOperand(0).getReg(); 2198 Register Src = MI.getOperand(1).getReg(); 2199 unsigned Flags = MI.getFlags(); 2200 LLT Ty = B.getMRI()->getType(Dst); 2201 2202 auto K = B.buildFConstant(Ty, numbers::log2e); 2203 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2204 B.buildFExp2(Dst, Mul, Flags); 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2210 MachineIRBuilder &B) const { 2211 Register Dst = MI.getOperand(0).getReg(); 2212 Register Src0 = MI.getOperand(1).getReg(); 2213 Register Src1 = MI.getOperand(2).getReg(); 2214 unsigned Flags = MI.getFlags(); 2215 LLT Ty = B.getMRI()->getType(Dst); 2216 const LLT S16 = LLT::scalar(16); 2217 const LLT S32 = LLT::scalar(32); 2218 2219 if (Ty == S32) { 2220 auto Log = B.buildFLog2(S32, Src0, Flags); 2221 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2222 .addUse(Log.getReg(0)) 2223 .addUse(Src1) 2224 .setMIFlags(Flags); 2225 B.buildFExp2(Dst, Mul, Flags); 2226 } else if (Ty == S16) { 2227 // There's no f16 fmul_legacy, so we need to convert for it. 
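    // As in the f32 path above, this computes pow(x, y) = exp2(y * log2(x));
    // the operands are extended to f32 so the multiply can use fmul_legacy,
    // then the product is truncated back to f16 before the exp2.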
2228 auto Log = B.buildFLog2(S16, Src0, Flags); 2229 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2230 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2231 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2232 .addUse(Ext0.getReg(0)) 2233 .addUse(Ext1.getReg(0)) 2234 .setMIFlags(Flags); 2235 2236 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2237 } else 2238 return false; 2239 2240 MI.eraseFromParent(); 2241 return true; 2242 } 2243 2244 // Find a source register, ignoring any possible source modifiers. 2245 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2246 Register ModSrc = OrigSrc; 2247 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2248 ModSrc = SrcFNeg->getOperand(1).getReg(); 2249 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2250 ModSrc = SrcFAbs->getOperand(1).getReg(); 2251 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2252 ModSrc = SrcFAbs->getOperand(1).getReg(); 2253 return ModSrc; 2254 } 2255 2256 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2257 MachineRegisterInfo &MRI, 2258 MachineIRBuilder &B) const { 2259 2260 const LLT S1 = LLT::scalar(1); 2261 const LLT S64 = LLT::scalar(64); 2262 Register Dst = MI.getOperand(0).getReg(); 2263 Register OrigSrc = MI.getOperand(1).getReg(); 2264 unsigned Flags = MI.getFlags(); 2265 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2266 "this should not have been custom lowered"); 2267 2268 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2269 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2270 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2271 // V_FRACT bug is: 2272 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2273 // 2274 // Convert floor(x) to (x - fract(x)) 2275 2276 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2277 .addUse(OrigSrc) 2278 .setMIFlags(Flags); 2279 2280 // Give source modifier matching some assistance before obscuring a foldable 2281 // pattern. 2282 2283 // TODO: We can avoid the neg on the fract? The input sign to fract 2284 // shouldn't matter? 2285 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2286 2287 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2288 2289 Register Min = MRI.createGenericVirtualRegister(S64); 2290 2291 // We don't need to concern ourselves with the snan handling difference, so 2292 // use the one which will directly select. 2293 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2294 if (MFI->getMode().IEEE) 2295 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2296 else 2297 B.buildFMinNum(Min, Fract, Const, Flags); 2298 2299 Register CorrectedFract = Min; 2300 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2301 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2302 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2303 } 2304 2305 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2306 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2307 2308 MI.eraseFromParent(); 2309 return true; 2310 } 2311 2312 // Turn an illegal packed v2s16 build vector into bit operations. 2313 // TODO: This should probably be a bitcast action in LegalizerHelper. 
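// Concretely, a G_BUILD_VECTOR of two s16 values x and y is rewritten below as
// bitcast(merge_values(x, y) : s32) : <2 x s16>.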
2314 bool AMDGPULegalizerInfo::legalizeBuildVector( 2315 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2316 Register Dst = MI.getOperand(0).getReg(); 2317 const LLT S32 = LLT::scalar(32); 2318 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2319 2320 Register Src0 = MI.getOperand(1).getReg(); 2321 Register Src1 = MI.getOperand(2).getReg(); 2322 assert(MRI.getType(Src0) == LLT::scalar(16)); 2323 2324 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2325 B.buildBitcast(Dst, Merge); 2326 2327 MI.eraseFromParent(); 2328 return true; 2329 } 2330 2331 // Return the use branch instruction, otherwise null if the usage is invalid. 2332 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2333 MachineRegisterInfo &MRI, 2334 MachineInstr *&Br, 2335 MachineBasicBlock *&UncondBrTarget) { 2336 Register CondDef = MI.getOperand(0).getReg(); 2337 if (!MRI.hasOneNonDBGUse(CondDef)) 2338 return nullptr; 2339 2340 MachineBasicBlock *Parent = MI.getParent(); 2341 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2342 if (UseMI.getParent() != Parent || 2343 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2344 return nullptr; 2345 2346 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2347 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2348 if (Next == Parent->end()) { 2349 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2350 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2351 return nullptr; 2352 UncondBrTarget = &*NextMBB; 2353 } else { 2354 if (Next->getOpcode() != AMDGPU::G_BR) 2355 return nullptr; 2356 Br = &*Next; 2357 UncondBrTarget = Br->getOperand(0).getMBB(); 2358 } 2359 2360 return &UseMI; 2361 } 2362 2363 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2364 MachineRegisterInfo &MRI, 2365 Register LiveIn, 2366 Register PhyReg) const { 2367 assert(PhyReg.isPhysical() && "Physical register expected"); 2368 2369 // Insert the live-in copy, if required, by defining destination virtual 2370 // register. 2371 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2372   if (!MRI.getVRegDef(LiveIn)) {
2373     // FIXME: Should have scoped insert pt
2374     MachineBasicBlock &OrigInsBB = B.getMBB();
2375     auto OrigInsPt = B.getInsertPt();
2376
2377     MachineBasicBlock &EntryMBB = B.getMF().front();
2378     EntryMBB.addLiveIn(PhyReg);
2379     B.setInsertPt(EntryMBB, EntryMBB.begin());
2380     B.buildCopy(LiveIn, PhyReg);
2381
2382     B.setInsertPt(OrigInsBB, OrigInsPt);
2383   }
2384
2385   return LiveIn;
2386 }
2387
2388 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2389                                                 MachineRegisterInfo &MRI,
2390                                                 Register PhyReg, LLT Ty,
2391                                                 bool InsertLiveInCopy) const {
2392   assert(PhyReg.isPhysical() && "Physical register expected");
2393
2394   // Get or create the virtual live-in register.
2395   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2396   if (!LiveIn) {
2397     LiveIn = MRI.createGenericVirtualRegister(Ty);
2398     MRI.addLiveIn(PhyReg, LiveIn);
2399   }
2400
2401   // When the copy that is actually required goes from the virtual register to
2402   // the physical register (and is inserted later), there is no need to insert
2403   // a live-in copy from the physical register to the virtual register here.
2404   if (!InsertLiveInCopy)
2405     return LiveIn;
2406
2407   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2408 }
2409
2410 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2411   MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2412   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2413   const ArgDescriptor *Arg;
2414   const TargetRegisterClass *RC;
2415   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2416   if (!Arg) {
2417     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2418     return nullptr;
2419   }
2420   return Arg;
2421 }
2422
2423 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2424                                          const ArgDescriptor *Arg) const {
2425   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2426     return false; // TODO: Handle these
2427
2428   Register SrcReg = Arg->getRegister();
2429   assert(SrcReg.isPhysical() && "Physical register expected");
2430   assert(DstReg.isVirtual() && "Virtual register expected");
2431
2432   MachineRegisterInfo &MRI = *B.getMRI();
2433
2434   LLT Ty = MRI.getType(DstReg);
2435   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2436
2437   if (Arg->isMasked()) {
2438     // TODO: Should we try to emit this once in the entry block?
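    // Worked example with illustrative values: an argument packed in bits
    // [19:10] of the incoming register has Mask = 0x000ffc00, giving
    // Shift = 10, so the code below produces (LiveIn >> 10) & 0x3ff.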
2439     const LLT S32 = LLT::scalar(32);
2440     const unsigned Mask = Arg->getMask();
2441     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2442
2443     Register AndMaskSrc = LiveIn;
2444
2445     if (Shift != 0) {
2446       auto ShiftAmt = B.buildConstant(S32, Shift);
2447       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2448     }
2449
2450     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2451   } else {
2452     B.buildCopy(DstReg, LiveIn);
2453   }
2454
2455   return true;
2456 }
2457
2458 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2459   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2460   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2461
2462   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2463   if (!Arg)
2464     return false;
2465
2466   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2467     return false;
2468
2469   MI.eraseFromParent();
2470   return true;
2471 }
2472
2473 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2474                                        MachineRegisterInfo &MRI,
2475                                        MachineIRBuilder &B) const {
2476   Register Dst = MI.getOperand(0).getReg();
2477   LLT DstTy = MRI.getType(Dst);
2478   LLT S16 = LLT::scalar(16);
2479   LLT S32 = LLT::scalar(32);
2480   LLT S64 = LLT::scalar(64);
2481
2482   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2483     return true;
2484
2485   if (DstTy == S16)
2486     return legalizeFDIV16(MI, MRI, B);
2487   if (DstTy == S32)
2488     return legalizeFDIV32(MI, MRI, B);
2489   if (DstTy == S64)
2490     return legalizeFDIV64(MI, MRI, B);
2491
2492   return false;
2493 }
2494
2495 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2496   const LLT S32 = LLT::scalar(32);
2497
2498   auto Cvt0 = B.buildUITOFP(S32, Src);
2499   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2500   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2501   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2502   return B.buildFPTOUI(S32, Mul).getReg(0);
2503 }
2504
2505 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2506                                                   Register DstReg,
2507                                                   Register Num,
2508                                                   Register Den,
2509                                                   bool IsRem) const {
2510   const LLT S1 = LLT::scalar(1);
2511   const LLT S32 = LLT::scalar(32);
2512
2513   // RCP = URECIP(Den) = 2^32 / Den + e
2514   // e is rounding error.
2515   auto RCP = buildDivRCP(B, Den);
2516
2517   // RCP_LO = mul(RCP, Den)
2518   auto RCP_LO = B.buildMul(S32, RCP, Den);
2519
2520   // RCP_HI = mulhu(RCP, Den)
2521   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2522
2523   // NEG_RCP_LO = -RCP_LO
2524   auto Zero = B.buildConstant(S32, 0);
2525   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2526
2527   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2528   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2529   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2530
2531   // Calculate the rounding error from the URECIP instruction
2532   // E = mulhu(ABS_RCP_LO, RCP)
2533   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2534
2535   // RCP_A_E = RCP + E
2536   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2537
2538   // RCP_S_E = RCP - E
2539   auto RCP_S_E = B.buildSub(S32, RCP, E);
2540
2541   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2542   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2543
2544   // Quotient = mulhu(Tmp0, Num)
2545   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2546
2547   // Num_S_Remainder = Quotient * Den
2548   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2549
2550   // Remainder = Num - Num_S_Remainder
2551   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2552
2553   // Remainder_GE_Den = Remainder >= Den
2554   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2555
2556   // Remainder_GE_Zero = Num >= Num_S_Remainder
2557   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2558                                        Num, Num_S_Remainder);
2559
2560   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2561   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2562
2563   // Calculate Division result:
2564
2565   // Quotient_A_One = Quotient + 1
2566   auto One = B.buildConstant(S32, 1);
2567   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2568
2569   // Quotient_S_One = Quotient - 1
2570   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2571
2572   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2573   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2574
2575   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2576   if (IsRem) {
2577     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2578
2579     // Calculate Rem result:
2580     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2581
2582     // Remainder_A_Den = Remainder + Den
2583     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2584
2585     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2586     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2587
2588     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2589     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2590   } else {
2591     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2592   }
2593 }
2594
2595 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2596                                               MachineRegisterInfo &MRI,
2597                                               MachineIRBuilder &B) const {
2598   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2599   Register DstReg = MI.getOperand(0).getReg();
2600   Register Num = MI.getOperand(1).getReg();
2601   Register Den = MI.getOperand(2).getReg();
2602   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2603   MI.eraseFromParent();
2604   return true;
2605 }
2606
2607 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2608 //
2609 // Return lo, hi of result
2610 //
2611 // %cvt.lo = G_UITOFP Val.lo
2612 // %cvt.hi = G_UITOFP Val.hi
2613 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2614 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2615 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2616 // %mul2 = G_FMUL %mul1, 2**(-32)
2617 // %trunc = G_INTRINSIC_TRUNC %mul2
2618 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2619 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2620 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2621                                                        Register Val) {
2622   const LLT S32 = LLT::scalar(32);
2623   auto Unmerge = B.buildUnmerge(S32, Val);
2624
2625   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2626   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2627
2628   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2629     B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2630
2631   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2632   auto Mul1 =
2633     B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2634
2635   // 2**(-32)
2636   auto Mul2 =
2637     B.buildFMul(S32, Mul1, B.buildFConstant(S32,
BitsToFloat(0x2f800000))); 2638 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2639 2640 // -(2**32) 2641 auto Mad2 = B.buildFMAD(S32, Trunc, 2642 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2643 2644 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2645 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2646 2647 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2648 } 2649 2650 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2651 MachineRegisterInfo &MRI, 2652 MachineIRBuilder &B) const { 2653 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2654 const LLT S32 = LLT::scalar(32); 2655 const LLT S64 = LLT::scalar(64); 2656 const LLT S1 = LLT::scalar(1); 2657 Register Numer = MI.getOperand(1).getReg(); 2658 Register Denom = MI.getOperand(2).getReg(); 2659 Register RcpLo, RcpHi; 2660 2661 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2662 2663 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2664 2665 auto Zero64 = B.buildConstant(S64, 0); 2666 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2667 2668 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2669 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2670 2671 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2672 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2673 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2674 2675 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2676 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2677 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2678 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2679 2680 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2681 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2682 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2683 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2684 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2685 2686 auto Zero32 = B.buildConstant(S32, 0); 2687 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2688 auto Add2_HiC = 2689 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2690 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2691 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2692 2693 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2694 Register NumerLo = UnmergeNumer.getReg(0); 2695 Register NumerHi = UnmergeNumer.getReg(1); 2696 2697 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2698 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2699 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2700 Register Mul3_Lo = UnmergeMul3.getReg(0); 2701 Register Mul3_Hi = UnmergeMul3.getReg(1); 2702 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2703 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2704 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2705 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2706 2707 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2708 Register DenomLo = UnmergeDenom.getReg(0); 2709 Register DenomHi = UnmergeDenom.getReg(1); 2710 2711 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2712 auto C1 = B.buildSExt(S32, CmpHi); 2713 2714 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2715 auto C2 = B.buildSExt(S32, CmpLo); 2716 2717 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2718 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2719 2720 // TODO: Here and below portions of the code can be enclosed into if/endif. 2721 // Currently control flow is unconditional and we have 4 selects after 2722 // potential endif to substitute PHIs. 
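  // Rough sketch of the straight-line correction below: the reciprocal-based
  // quotient estimate MulHi3 may be short of the true quotient (by at most two,
  // hence the two correction steps), so quotient+1 / quotient+2 and
  // remainder-Denom / remainder-2*Denom are all computed unconditionally and
  // the C3 / C6 comparisons pick the right values at the end.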
2723
2724   // if C3 != 0 ...
2725   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2726   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2727   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2728   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2729
2730   auto One64 = B.buildConstant(S64, 1);
2731   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2732
2733   auto C4 =
2734     B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2735   auto C5 =
2736     B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2737   auto C6 = B.buildSelect(
2738     S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2739
2740   // if (C6 != 0)
2741   auto Add4 = B.buildAdd(S64, Add3, One64);
2742   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2743
2744   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2745   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2746   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2747
2748   // endif C6
2749   // endif C3
2750
2751   if (IsDiv) {
2752     auto Sel1 = B.buildSelect(
2753       S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2754     B.buildSelect(MI.getOperand(0),
2755                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2756   } else {
2757     auto Sel2 = B.buildSelect(
2758       S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2759     B.buildSelect(MI.getOperand(0),
2760                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2761   }
2762
2763   MI.eraseFromParent();
2764   return true;
2765 }
2766
2767 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2768                                             MachineRegisterInfo &MRI,
2769                                             MachineIRBuilder &B) const {
2770   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2771   if (Ty == LLT::scalar(32))
2772     return legalizeUDIV_UREM32(MI, MRI, B);
2773   if (Ty == LLT::scalar(64))
2774     return legalizeUDIV_UREM64(MI, MRI, B);
2775   return false;
2776 }
2777
2778 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2779                                               MachineRegisterInfo &MRI,
2780                                               MachineIRBuilder &B) const {
2781   const LLT S32 = LLT::scalar(32);
2782
2783   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2784   Register DstReg = MI.getOperand(0).getReg();
2785   Register LHS = MI.getOperand(1).getReg();
2786   Register RHS = MI.getOperand(2).getReg();
2787
2788   auto ThirtyOne = B.buildConstant(S32, 31);
2789   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2790   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2791
2792   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2793   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2794
2795   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2796   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2797
2798   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2799   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2800
2801   if (IsRem) {
2802     auto RSign = LHSign; // Remainder sign is the same as LHS
2803     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2804     B.buildSub(DstReg, UDivRem, RSign);
2805   } else {
2806     auto DSign = B.buildXor(S32, LHSign, RHSign);
2807     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2808     B.buildSub(DstReg, UDivRem, DSign);
2809   }
2810
2811   MI.eraseFromParent();
2812   return true;
2813 }
2814
2815 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2816                                             MachineRegisterInfo &MRI,
2817                                             MachineIRBuilder &B) const {
2818   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2819     return legalizeSDIV_SREM32(MI, MRI, B);
2820   return false;
2821 }
2822
2823 bool
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2824 MachineRegisterInfo &MRI, 2825 MachineIRBuilder &B) const { 2826 Register Res = MI.getOperand(0).getReg(); 2827 Register LHS = MI.getOperand(1).getReg(); 2828 Register RHS = MI.getOperand(2).getReg(); 2829 2830 uint16_t Flags = MI.getFlags(); 2831 2832 LLT ResTy = MRI.getType(Res); 2833 LLT S32 = LLT::scalar(32); 2834 LLT S64 = LLT::scalar(64); 2835 2836 const MachineFunction &MF = B.getMF(); 2837 bool Unsafe = 2838 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2839 2840 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2841 return false; 2842 2843 if (!Unsafe && ResTy == S32 && 2844 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2845 return false; 2846 2847 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2848 // 1 / x -> RCP(x) 2849 if (CLHS->isExactlyValue(1.0)) { 2850 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2851 .addUse(RHS) 2852 .setMIFlags(Flags); 2853 2854 MI.eraseFromParent(); 2855 return true; 2856 } 2857 2858 // -1 / x -> RCP( FNEG(x) ) 2859 if (CLHS->isExactlyValue(-1.0)) { 2860 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2861 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2862 .addUse(FNeg.getReg(0)) 2863 .setMIFlags(Flags); 2864 2865 MI.eraseFromParent(); 2866 return true; 2867 } 2868 } 2869 2870 // x / y -> x * (1.0 / y) 2871 if (Unsafe) { 2872 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2873 .addUse(RHS) 2874 .setMIFlags(Flags); 2875 B.buildFMul(Res, LHS, RCP, Flags); 2876 2877 MI.eraseFromParent(); 2878 return true; 2879 } 2880 2881 return false; 2882 } 2883 2884 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2885 MachineRegisterInfo &MRI, 2886 MachineIRBuilder &B) const { 2887 Register Res = MI.getOperand(0).getReg(); 2888 Register LHS = MI.getOperand(1).getReg(); 2889 Register RHS = MI.getOperand(2).getReg(); 2890 2891 uint16_t Flags = MI.getFlags(); 2892 2893 LLT S16 = LLT::scalar(16); 2894 LLT S32 = LLT::scalar(32); 2895 2896 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2897 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2898 2899 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2900 .addUse(RHSExt.getReg(0)) 2901 .setMIFlags(Flags); 2902 2903 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2904 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2905 2906 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2907 .addUse(RDst.getReg(0)) 2908 .addUse(RHS) 2909 .addUse(LHS) 2910 .setMIFlags(Flags); 2911 2912 MI.eraseFromParent(); 2913 return true; 2914 } 2915 2916 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2917 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2918 static void toggleSPDenormMode(bool Enable, 2919 MachineIRBuilder &B, 2920 const GCNSubtarget &ST, 2921 AMDGPU::SIModeRegisterDefaults Mode) { 2922 // Set SP denorm mode to this value. 2923 unsigned SPDenormMode = 2924 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2925 2926 if (ST.hasDenormModeInst()) { 2927 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2928 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2929 2930 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2931 B.buildInstr(AMDGPU::S_DENORM_MODE) 2932 .addImm(NewDenormModeValue); 2933 2934 } else { 2935 // Select FP32 bit field in mode register. 
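    // This is hwreg(HW_REG_MODE, 4, 2): a two-bit field (width - 1 == 1) at
    // bit offset 4 of the MODE register, i.e. the FP_DENORM bits controlling
    // FP32 denormals, leaving the FP64/FP16 field at bits [7:6] untouched.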
2936 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2937 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2938 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2939 2940 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2941 .addImm(SPDenormMode) 2942 .addImm(SPDenormModeBitField); 2943 } 2944 } 2945 2946 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2947 MachineRegisterInfo &MRI, 2948 MachineIRBuilder &B) const { 2949 Register Res = MI.getOperand(0).getReg(); 2950 Register LHS = MI.getOperand(1).getReg(); 2951 Register RHS = MI.getOperand(2).getReg(); 2952 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2953 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2954 2955 uint16_t Flags = MI.getFlags(); 2956 2957 LLT S32 = LLT::scalar(32); 2958 LLT S1 = LLT::scalar(1); 2959 2960 auto One = B.buildFConstant(S32, 1.0f); 2961 2962 auto DenominatorScaled = 2963 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2964 .addUse(LHS) 2965 .addUse(RHS) 2966 .addImm(0) 2967 .setMIFlags(Flags); 2968 auto NumeratorScaled = 2969 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2970 .addUse(LHS) 2971 .addUse(RHS) 2972 .addImm(1) 2973 .setMIFlags(Flags); 2974 2975 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2976 .addUse(DenominatorScaled.getReg(0)) 2977 .setMIFlags(Flags); 2978 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2979 2980 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2981 // aren't modeled as reading it. 2982 if (!Mode.allFP32Denormals()) 2983 toggleSPDenormMode(true, B, ST, Mode); 2984 2985 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2986 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2987 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2988 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2989 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2990 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2991 2992 if (!Mode.allFP32Denormals()) 2993 toggleSPDenormMode(false, B, ST, Mode); 2994 2995 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2996 .addUse(Fma4.getReg(0)) 2997 .addUse(Fma1.getReg(0)) 2998 .addUse(Fma3.getReg(0)) 2999 .addUse(NumeratorScaled.getReg(1)) 3000 .setMIFlags(Flags); 3001 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3003 .addUse(Fmas.getReg(0)) 3004 .addUse(RHS) 3005 .addUse(LHS) 3006 .setMIFlags(Flags); 3007 3008 MI.eraseFromParent(); 3009 return true; 3010 } 3011 3012 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3013 MachineRegisterInfo &MRI, 3014 MachineIRBuilder &B) const { 3015 Register Res = MI.getOperand(0).getReg(); 3016 Register LHS = MI.getOperand(1).getReg(); 3017 Register RHS = MI.getOperand(2).getReg(); 3018 3019 uint16_t Flags = MI.getFlags(); 3020 3021 LLT S64 = LLT::scalar(64); 3022 LLT S1 = LLT::scalar(1); 3023 3024 auto One = B.buildFConstant(S64, 1.0); 3025 3026 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3027 .addUse(LHS) 3028 .addUse(RHS) 3029 .addImm(0) 3030 .setMIFlags(Flags); 3031 3032 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3033 3034 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3035 .addUse(DivScale0.getReg(0)) 3036 .setMIFlags(Flags); 3037 3038 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3039 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3040 auto Fma2 = B.buildFMA(S64, 
NegDivScale0, Fma1, One, Flags); 3041 3042 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3043 .addUse(LHS) 3044 .addUse(RHS) 3045 .addImm(1) 3046 .setMIFlags(Flags); 3047 3048 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3049 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3050 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3051 3052 Register Scale; 3053 if (!ST.hasUsableDivScaleConditionOutput()) { 3054 // Workaround a hardware bug on SI where the condition output from div_scale 3055 // is not usable. 3056 3057 LLT S32 = LLT::scalar(32); 3058 3059 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3060 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3061 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3062 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3063 3064 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3065 Scale1Unmerge.getReg(1)); 3066 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3067 Scale0Unmerge.getReg(1)); 3068 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3069 } else { 3070 Scale = DivScale1.getReg(1); 3071 } 3072 3073 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3074 .addUse(Fma4.getReg(0)) 3075 .addUse(Fma3.getReg(0)) 3076 .addUse(Mul.getReg(0)) 3077 .addUse(Scale) 3078 .setMIFlags(Flags); 3079 3080 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3081 .addUse(Fmas.getReg(0)) 3082 .addUse(RHS) 3083 .addUse(LHS) 3084 .setMIFlags(Flags); 3085 3086 MI.eraseFromParent(); 3087 return true; 3088 } 3089 3090 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3091 MachineRegisterInfo &MRI, 3092 MachineIRBuilder &B) const { 3093 Register Res = MI.getOperand(0).getReg(); 3094 Register LHS = MI.getOperand(2).getReg(); 3095 Register RHS = MI.getOperand(3).getReg(); 3096 uint16_t Flags = MI.getFlags(); 3097 3098 LLT S32 = LLT::scalar(32); 3099 LLT S1 = LLT::scalar(1); 3100 3101 auto Abs = B.buildFAbs(S32, RHS, Flags); 3102 const APFloat C0Val(1.0f); 3103 3104 auto C0 = B.buildConstant(S32, 0x6f800000); 3105 auto C1 = B.buildConstant(S32, 0x2f800000); 3106 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3107 3108 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3109 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3110 3111 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3112 3113 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3114 .addUse(Mul0.getReg(0)) 3115 .setMIFlags(Flags); 3116 3117 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3118 3119 B.buildFMul(Res, Sel, Mul1, Flags); 3120 3121 MI.eraseFromParent(); 3122 return true; 3123 } 3124 3125 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3126 MachineRegisterInfo &MRI, 3127 MachineIRBuilder &B) const { 3128 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3129 if (!MFI->isEntryFunction()) { 3130 return legalizePreloadedArgIntrin(MI, MRI, B, 3131 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3132 } 3133 3134 uint64_t Offset = 3135 ST.getTargetLowering()->getImplicitParameterOffset( 3136 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3137 Register DstReg = MI.getOperand(0).getReg(); 3138 LLT DstTy = MRI.getType(DstReg); 3139 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3140 3141 const ArgDescriptor *Arg; 3142 const TargetRegisterClass *RC; 3143 std::tie(Arg, RC) 3144 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3145 
if (!Arg) 3146 return false; 3147 3148 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3149 if (!loadInputValue(KernargPtrReg, B, Arg)) 3150 return false; 3151 3152 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3153 MI.eraseFromParent(); 3154 return true; 3155 } 3156 3157 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3158 MachineRegisterInfo &MRI, 3159 MachineIRBuilder &B, 3160 unsigned AddrSpace) const { 3161 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3162 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3163 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3164 MI.eraseFromParent(); 3165 return true; 3166 } 3167 3168 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3169 // offset (the offset that is included in bounds checking and swizzling, to be 3170 // split between the instruction's voffset and immoffset fields) and soffset 3171 // (the offset that is excluded from bounds checking and swizzling, to go in 3172 // the instruction's soffset field). This function takes the first kind of 3173 // offset and figures out how to split it between voffset and immoffset. 3174 std::tuple<Register, unsigned, unsigned> 3175 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3176 Register OrigOffset) const { 3177 const unsigned MaxImm = 4095; 3178 Register BaseReg; 3179 unsigned TotalConstOffset; 3180 MachineInstr *OffsetDef; 3181 const LLT S32 = LLT::scalar(32); 3182 3183 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3184 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3185 3186 unsigned ImmOffset = TotalConstOffset; 3187 3188 // If the immediate value is too big for the immoffset field, put the value 3189 // and -4096 into the immoffset field so that the value that is copied/added 3190 // for the voffset field is a multiple of 4096, and it stands more chance 3191 // of being CSEd with the copy/add for another similar load/store. 3192 // However, do not do that rounding down to a multiple of 4096 if that is a 3193 // negative number, as it appears to be illegal to have a negative offset 3194 // in the vgpr, even if adding the immediate offset makes it positive. 3195 unsigned Overflow = ImmOffset & ~MaxImm; 3196 ImmOffset -= Overflow; 3197 if ((int32_t)Overflow < 0) { 3198 Overflow += ImmOffset; 3199 ImmOffset = 0; 3200 } 3201 3202 if (Overflow != 0) { 3203 if (!BaseReg) { 3204 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3205 } else { 3206 auto OverflowVal = B.buildConstant(S32, Overflow); 3207 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3208 } 3209 } 3210 3211 if (!BaseReg) 3212 BaseReg = B.buildConstant(S32, 0).getReg(0); 3213 3214 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3215 } 3216 3217 /// Handle register layout difference for f16 images for some subtargets. 
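/// For example, with unpacked D16 VMEM instructions a <4 x s16> data operand
/// is rewritten below into four s16 values any-extended to s32 and rebuilt as
/// a <4 x s32>, i.e. one 16-bit element per 32-bit register.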
3218 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3219 MachineRegisterInfo &MRI, 3220 Register Reg) const { 3221 if (!ST.hasUnpackedD16VMem()) 3222 return Reg; 3223 3224 const LLT S16 = LLT::scalar(16); 3225 const LLT S32 = LLT::scalar(32); 3226 LLT StoreVT = MRI.getType(Reg); 3227 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3228 3229 auto Unmerge = B.buildUnmerge(S16, Reg); 3230 3231 SmallVector<Register, 4> WideRegs; 3232 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3233 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3234 3235 int NumElts = StoreVT.getNumElements(); 3236 3237 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3238 } 3239 3240 Register AMDGPULegalizerInfo::fixStoreSourceType( 3241 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3242 MachineRegisterInfo *MRI = B.getMRI(); 3243 LLT Ty = MRI->getType(VData); 3244 3245 const LLT S16 = LLT::scalar(16); 3246 3247 // Fixup illegal register types for i8 stores. 3248 if (Ty == LLT::scalar(8) || Ty == S16) { 3249 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3250 return AnyExt; 3251 } 3252 3253 if (Ty.isVector()) { 3254 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3255 if (IsFormat) 3256 return handleD16VData(B, *MRI, VData); 3257 } 3258 } 3259 3260 return VData; 3261 } 3262 3263 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3264 MachineRegisterInfo &MRI, 3265 MachineIRBuilder &B, 3266 bool IsTyped, 3267 bool IsFormat) const { 3268 Register VData = MI.getOperand(1).getReg(); 3269 LLT Ty = MRI.getType(VData); 3270 LLT EltTy = Ty.getScalarType(); 3271 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3272 const LLT S32 = LLT::scalar(32); 3273 3274 VData = fixStoreSourceType(B, VData, IsFormat); 3275 Register RSrc = MI.getOperand(2).getReg(); 3276 3277 MachineMemOperand *MMO = *MI.memoperands_begin(); 3278 const int MemSize = MMO->getSize(); 3279 3280 unsigned ImmOffset; 3281 unsigned TotalOffset; 3282 3283 // The typed intrinsics add an immediate after the registers. 3284 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3285 3286 // The struct intrinsic variants add one additional operand over raw. 3287 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3288 Register VIndex; 3289 int OpOffset = 0; 3290 if (HasVIndex) { 3291 VIndex = MI.getOperand(3).getReg(); 3292 OpOffset = 1; 3293 } 3294 3295 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3296 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3297 3298 unsigned Format = 0; 3299 if (IsTyped) { 3300 Format = MI.getOperand(5 + OpOffset).getImm(); 3301 ++OpOffset; 3302 } 3303 3304 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3305 3306 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3307 if (TotalOffset != 0) 3308 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3309 3310 unsigned Opc; 3311 if (IsTyped) { 3312 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3313 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3314 } else if (IsFormat) { 3315 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3316 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3317 } else { 3318 switch (MemSize) { 3319 case 1: 3320 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3321 break; 3322 case 2: 3323 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3324 break; 3325 default: 3326 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3327 break; 3328 } 3329 } 3330 3331 if (!VIndex) 3332 VIndex = B.buildConstant(S32, 0).getReg(0); 3333 3334 auto MIB = B.buildInstr(Opc) 3335 .addUse(VData) // vdata 3336 .addUse(RSrc) // rsrc 3337 .addUse(VIndex) // vindex 3338 .addUse(VOffset) // voffset 3339 .addUse(SOffset) // soffset 3340 .addImm(ImmOffset); // offset(imm) 3341 3342 if (IsTyped) 3343 MIB.addImm(Format); 3344 3345 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3346 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3347 .addMemOperand(MMO); 3348 3349 MI.eraseFromParent(); 3350 return true; 3351 } 3352 3353 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3354 MachineRegisterInfo &MRI, 3355 MachineIRBuilder &B, 3356 bool IsFormat, 3357 bool IsTyped) const { 3358 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3359 MachineMemOperand *MMO = *MI.memoperands_begin(); 3360 const int MemSize = MMO->getSize(); 3361 const LLT S32 = LLT::scalar(32); 3362 3363 Register Dst = MI.getOperand(0).getReg(); 3364 Register RSrc = MI.getOperand(2).getReg(); 3365 3366 // The typed intrinsics add an immediate after the registers. 3367 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3368 3369 // The struct intrinsic variants add one additional operand over raw. 3370 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3371 Register VIndex; 3372 int OpOffset = 0; 3373 if (HasVIndex) { 3374 VIndex = MI.getOperand(3).getReg(); 3375 OpOffset = 1; 3376 } 3377 3378 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3379 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3380 3381 unsigned Format = 0; 3382 if (IsTyped) { 3383 Format = MI.getOperand(5 + OpOffset).getImm(); 3384 ++OpOffset; 3385 } 3386 3387 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3388 unsigned ImmOffset; 3389 unsigned TotalOffset; 3390 3391 LLT Ty = MRI.getType(Dst); 3392 LLT EltTy = Ty.getScalarType(); 3393 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3394 const bool Unpacked = ST.hasUnpackedD16VMem(); 3395 3396 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3397 if (TotalOffset != 0) 3398 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3399 3400 unsigned Opc; 3401 3402 if (IsTyped) { 3403 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3404 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3405 } else if (IsFormat) { 3406 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back down to
    // the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to the original 16-bit vector result.
      // FIXME: G_TRUNC should work, but legalization currently fails.
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3477 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3478 B.buildInstr(Opc) 3479 .addDef(MI.getOperand(0).getReg()) 3480 .addUse(MI.getOperand(2).getReg()) 3481 .addUse(MI.getOperand(3).getReg()) 3482 .cloneMemRefs(MI); 3483 MI.eraseFromParent(); 3484 return true; 3485 } 3486 3487 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3488 switch (IntrID) { 3489 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3490 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3491 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3492 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3493 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3494 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3495 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3496 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3497 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3498 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3499 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3501 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3502 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3503 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3504 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3505 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3506 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3507 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3508 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3509 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3510 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3511 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3512 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3513 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3514 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3516 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3517 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3518 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3519 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3520 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3521 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3522 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3523 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3524 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3525 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3526 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3527 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3528 default: 3529 llvm_unreachable("unhandled atomic opcode"); 3530 } 3531 } 3532 3533 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3534 MachineIRBuilder &B, 3535 Intrinsic::ID IID) const { 3536 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3537 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3538 3539 Register Dst = MI.getOperand(0).getReg(); 3540 Register VData = MI.getOperand(2).getReg(); 3541 3542 Register CmpVal; 3543 int OpOffset = 0; 3544 3545 if (IsCmpSwap) { 3546 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3547 ++OpOffset; 3548 } 3549 3550 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3551 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3552 3553 // The struct intrinsic variants add one additional operand over raw. 
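  // For reference, the operand order assumed by the indices below is (the
  // result and the intrinsic ID come first):
  //   raw:    dst, intrin-id, vdata, [cmp,] rsrc, voffset, soffset, aux
  //   struct: dst, intrin-id, vdata, [cmp,] rsrc, vindex, voffset, soffset, aux
  // so the total operand count is what distinguishes the two forms.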
3554 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3555 Register VIndex; 3556 if (HasVIndex) { 3557 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3558 ++OpOffset; 3559 } 3560 3561 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3562 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3563 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3564 3565 MachineMemOperand *MMO = *MI.memoperands_begin(); 3566 3567 unsigned ImmOffset; 3568 unsigned TotalOffset; 3569 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3570 if (TotalOffset != 0) 3571 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3572 3573 if (!VIndex) 3574 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3575 3576 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3577 .addDef(Dst) 3578 .addUse(VData); // vdata 3579 3580 if (IsCmpSwap) 3581 MIB.addReg(CmpVal); 3582 3583 MIB.addUse(RSrc) // rsrc 3584 .addUse(VIndex) // vindex 3585 .addUse(VOffset) // voffset 3586 .addUse(SOffset) // soffset 3587 .addImm(ImmOffset) // offset(imm) 3588 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3589 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3590 .addMemOperand(MMO); 3591 3592 MI.eraseFromParent(); 3593 return true; 3594 } 3595 3596 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3597 /// vector with s16 typed elements. 3598 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3599 SmallVectorImpl<Register> &PackedAddrs, 3600 int AddrIdx, int DimIdx, int NumVAddrs, 3601 int NumGradients) { 3602 const LLT S16 = LLT::scalar(16); 3603 const LLT V2S16 = LLT::vector(2, 16); 3604 3605 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3606 MachineOperand &SrcOp = MI.getOperand(I); 3607 if (!SrcOp.isReg()) 3608 continue; // _L to _LZ may have eliminated this. 3609 3610 Register AddrReg = SrcOp.getReg(); 3611 3612 if (I < DimIdx) { 3613 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3614 PackedAddrs.push_back(AddrReg); 3615 } else { 3616 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3617 // derivatives dx/dh and dx/dv are packed with undef. 3618 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3619 ((NumGradients / 2) % 2 == 1 && 3620 (I == DimIdx + (NumGradients / 2) - 1 || 3621 I == DimIdx + NumGradients - 1)) || 3622 // Check for _L to _LZ optimization 3623 !MI.getOperand(I + 1).isReg()) { 3624 PackedAddrs.push_back( 3625 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3626 .getReg(0)); 3627 } else { 3628 PackedAddrs.push_back( 3629 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3630 .getReg(0)); 3631 ++I; 3632 } 3633 } 3634 } 3635 } 3636 3637 /// Convert from separate vaddr components to a single vector address register, 3638 /// and replace the remaining operands with $noreg. 3639 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3640 int DimIdx, int NumVAddrs) { 3641 const LLT S32 = LLT::scalar(32); 3642 3643 SmallVector<Register, 8> AddrRegs; 3644 for (int I = 0; I != NumVAddrs; ++I) { 3645 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3646 if (SrcOp.isReg()) { 3647 AddrRegs.push_back(SrcOp.getReg()); 3648 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3649 } 3650 } 3651 3652 int NumAddrRegs = AddrRegs.size(); 3653 if (NumAddrRegs != 1) { 3654 // Round up to 8 elements for v5-v7 3655 // FIXME: Missing intermediate sized register classes and instructions. 
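    // For example (illustrative only), a 5-register address is padded with
    // three implicit_def values so an <8 x s32> build_vector can be formed:
    //   %undef:_(s32) = G_IMPLICIT_DEF
    //   %vaddr:_(<8 x s32>) = G_BUILD_VECTOR %a0, %a1, %a2, %a3, %a4,
    //                                        %undef, %undef, %undef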
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16-bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: This feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
3827 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3828 return false; 3829 3830 if (NumVAddrs > 1) { 3831 SmallVector<Register, 4> PackedRegs; 3832 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3833 NumGradients); 3834 3835 if (!UseNSA && PackedRegs.size() > 1) { 3836 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3837 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3838 PackedRegs[0] = Concat.getReg(0); 3839 PackedRegs.resize(1); 3840 } 3841 3842 const int NumPacked = PackedRegs.size(); 3843 for (int I = 0; I != NumVAddrs; ++I) { 3844 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3845 if (!SrcOp.isReg()) { 3846 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3847 continue; 3848 } 3849 3850 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3851 3852 if (I < NumPacked) 3853 SrcOp.setReg(PackedRegs[I]); 3854 else 3855 SrcOp.setReg(AMDGPU::NoRegister); 3856 } 3857 } 3858 } else if (!UseNSA && NumVAddrs > 1) { 3859 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3860 } 3861 3862 3863 if (BaseOpcode->Store) { // No TFE for stores? 3864 // TODO: Handle dmask trim 3865 Register VData = MI.getOperand(1).getReg(); 3866 LLT Ty = MRI->getType(VData); 3867 if (!Ty.isVector() || Ty.getElementType() != S16) 3868 return true; 3869 3870 Register RepackedReg = handleD16VData(B, *MRI, VData); 3871 if (RepackedReg != VData) { 3872 MI.getOperand(1).setReg(RepackedReg); 3873 } 3874 3875 return true; 3876 } 3877 3878 Register DstReg = MI.getOperand(0).getReg(); 3879 LLT Ty = MRI->getType(DstReg); 3880 const LLT EltTy = Ty.getScalarType(); 3881 const bool IsD16 = Ty.getScalarType() == S16; 3882 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3883 3884 // Confirm that the return type is large enough for the dmask specified 3885 if (NumElts < DMaskLanes) 3886 return false; 3887 3888 if (NumElts > 4 || DMaskLanes > 4) 3889 return false; 3890 3891 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3892 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3893 3894 // The raw dword aligned data component of the load. The only legal cases 3895 // where this matters should be when using the packed D16 format, for 3896 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3897 LLT RoundedTy; 3898 3899 // S32 vector to to cover all data, plus TFE result element. 3900 LLT TFETy; 3901 3902 // Register type to use for each loaded component. Will be S32 or V2S16. 3903 LLT RegTy; 3904 3905 if (IsD16 && ST.hasUnpackedD16VMem()) { 3906 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3907 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3908 RegTy = S32; 3909 } else { 3910 unsigned EltSize = EltTy.getSizeInBits(); 3911 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3912 unsigned RoundedSize = 32 * RoundedElts; 3913 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3914 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3915 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3916 } 3917 3918 // The return type does not need adjustment. 3919 // TODO: Should we change s16 case to s32 or <2 x s16>? 3920 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3921 return true; 3922 3923 Register Dst1Reg; 3924 3925 // Insert after the instruction. 3926 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3927 3928 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3929 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3930 const LLT LoadResultTy = IsTFE ? 
TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
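  // A <3 x s16> result only covers one and a half 32-bit registers, so (as a
  // sketch of one common shape of the code below) the repacked pieces are
  // padded with undef up to <6 x s16>, concatenated, and then unmerged so only
  // the low <3 x s16> is kept:
  //   %cat:_(<6 x s16>) = G_CONCAT_VECTORS %r0:_(<2 x s16>), %r1:_(<2 x s16>),
  //                                        %undef:_(<2 x s16>)
  //   %dst:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %cat:_(<6 x s16>)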
4028 const LLT V3S16 = LLT::vector(3, 16); 4029 if (Ty == V3S16) { 4030 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4031 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4032 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4033 return true; 4034 } 4035 4036 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4037 B.buildConcatVectors(DstReg, ResultRegs); 4038 return true; 4039 } 4040 4041 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4042 MachineInstr &MI, MachineIRBuilder &B, 4043 GISelChangeObserver &Observer) const { 4044 Register Dst = MI.getOperand(0).getReg(); 4045 LLT Ty = B.getMRI()->getType(Dst); 4046 unsigned Size = Ty.getSizeInBits(); 4047 MachineFunction &MF = B.getMF(); 4048 4049 Observer.changingInstr(MI); 4050 4051 // FIXME: We don't really need this intermediate instruction. The intrinsic 4052 // should be fixed to have a memory operand. Since it's readnone, we're not 4053 // allowed to add one. 4054 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4055 MI.RemoveOperand(1); // Remove intrinsic ID 4056 4057 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4058 // TODO: Should this use datalayout alignment? 4059 const unsigned MemSize = (Size + 7) / 8; 4060 const Align MemAlign(4); 4061 MachineMemOperand *MMO = MF.getMachineMemOperand( 4062 MachinePointerInfo(), 4063 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4064 MachineMemOperand::MOInvariant, 4065 MemSize, MemAlign); 4066 MI.addMemOperand(MF, MMO); 4067 4068 // There are no 96-bit result scalar loads, but widening to 128-bit should 4069 // always be legal. We may need to restore this to a 96-bit result if it turns 4070 // out this needs to be converted to a vector load during RegBankSelect. 4071 if (!isPowerOf2_32(Size)) { 4072 LegalizerHelper Helper(MF, *this, Observer, B); 4073 4074 if (Ty.isVector()) 4075 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4076 else 4077 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4078 } 4079 4080 Observer.changedInstr(MI); 4081 return true; 4082 } 4083 4084 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4085 MachineRegisterInfo &MRI, 4086 MachineIRBuilder &B) const { 4087 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4088 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4089 !ST.isTrapHandlerEnabled()) { 4090 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4091 } else { 4092 // Pass queue pointer to trap handler as input, and insert trap instruction 4093 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4094 const ArgDescriptor *Arg = 4095 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4096 if (!Arg) 4097 return false; 4098 MachineRegisterInfo &MRI = *B.getMRI(); 4099 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4100 Register LiveIn = getLiveInRegister( 4101 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4102 /*InsertLiveInCopy=*/false); 4103 if (!loadInputValue(LiveIn, B, Arg)) 4104 return false; 4105 B.buildCopy(SGPR01, LiveIn); 4106 B.buildInstr(AMDGPU::S_TRAP) 4107 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4108 .addReg(SGPR01, RegState::Implicit); 4109 } 4110 4111 MI.eraseFromParent(); 4112 return true; 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4116 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4117 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4118 // accordingly 4119 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4120 !ST.isTrapHandlerEnabled()) { 4121 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4122 "debugtrap handler not supported", 4123 MI.getDebugLoc(), DS_Warning); 4124 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4125 Ctx.diagnose(NoTrap); 4126 } else { 4127 // Insert debug-trap instruction 4128 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4129 } 4130 4131 MI.eraseFromParent(); 4132 return true; 4133 } 4134 4135 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4136 MachineIRBuilder &B, 4137 GISelChangeObserver &Observer) const { 4138 MachineRegisterInfo &MRI = *B.getMRI(); 4139 4140 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4141 auto IntrID = MI.getIntrinsicID(); 4142 switch (IntrID) { 4143 case Intrinsic::amdgcn_if: 4144 case Intrinsic::amdgcn_else: { 4145 MachineInstr *Br = nullptr; 4146 MachineBasicBlock *UncondBrTarget = nullptr; 4147 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4148 const SIRegisterInfo *TRI 4149 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4150 4151 Register Def = MI.getOperand(1).getReg(); 4152 Register Use = MI.getOperand(3).getReg(); 4153 4154 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4155 if (IntrID == Intrinsic::amdgcn_if) { 4156 B.buildInstr(AMDGPU::SI_IF) 4157 .addDef(Def) 4158 .addUse(Use) 4159 .addMBB(UncondBrTarget); 4160 } else { 4161 B.buildInstr(AMDGPU::SI_ELSE) 4162 .addDef(Def) 4163 .addUse(Use) 4164 .addMBB(UncondBrTarget) 4165 .addImm(0); 4166 } 4167 4168 if (Br) { 4169 Br->getOperand(0).setMBB(CondBrTarget); 4170 } else { 4171 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4172 // since we're swapping branch targets it needs to be reinserted. 4173 // FIXME: IRTranslator should probably not do this 4174 B.buildBr(*CondBrTarget); 4175 } 4176 4177 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4178 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4179 MI.eraseFromParent(); 4180 BrCond->eraseFromParent(); 4181 return true; 4182 } 4183 4184 return false; 4185 } 4186 case Intrinsic::amdgcn_loop: { 4187 MachineInstr *Br = nullptr; 4188 MachineBasicBlock *UncondBrTarget = nullptr; 4189 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4190 const SIRegisterInfo *TRI 4191 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4192 4193 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4194 Register Reg = MI.getOperand(2).getReg(); 4195 B.buildInstr(AMDGPU::SI_LOOP) 4196 .addUse(Reg) 4197 .addMBB(UncondBrTarget); 4198 4199 if (Br) 4200 Br->getOperand(0).setMBB(CondBrTarget); 4201 else 4202 B.buildBr(*CondBrTarget); 4203 4204 MI.eraseFromParent(); 4205 BrCond->eraseFromParent(); 4206 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4207 return true; 4208 } 4209 4210 return false; 4211 } 4212 case Intrinsic::amdgcn_kernarg_segment_ptr: 4213 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4214 // This only makes sense to call in a kernel, so just lower to null. 
4215 B.buildConstant(MI.getOperand(0).getReg(), 0); 4216 MI.eraseFromParent(); 4217 return true; 4218 } 4219 4220 return legalizePreloadedArgIntrin( 4221 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4222 case Intrinsic::amdgcn_implicitarg_ptr: 4223 return legalizeImplicitArgPtr(MI, MRI, B); 4224 case Intrinsic::amdgcn_workitem_id_x: 4225 return legalizePreloadedArgIntrin(MI, MRI, B, 4226 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4227 case Intrinsic::amdgcn_workitem_id_y: 4228 return legalizePreloadedArgIntrin(MI, MRI, B, 4229 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4230 case Intrinsic::amdgcn_workitem_id_z: 4231 return legalizePreloadedArgIntrin(MI, MRI, B, 4232 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4233 case Intrinsic::amdgcn_workgroup_id_x: 4234 return legalizePreloadedArgIntrin(MI, MRI, B, 4235 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4236 case Intrinsic::amdgcn_workgroup_id_y: 4237 return legalizePreloadedArgIntrin(MI, MRI, B, 4238 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4239 case Intrinsic::amdgcn_workgroup_id_z: 4240 return legalizePreloadedArgIntrin(MI, MRI, B, 4241 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4242 case Intrinsic::amdgcn_dispatch_ptr: 4243 return legalizePreloadedArgIntrin(MI, MRI, B, 4244 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4245 case Intrinsic::amdgcn_queue_ptr: 4246 return legalizePreloadedArgIntrin(MI, MRI, B, 4247 AMDGPUFunctionArgInfo::QUEUE_PTR); 4248 case Intrinsic::amdgcn_implicit_buffer_ptr: 4249 return legalizePreloadedArgIntrin( 4250 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4251 case Intrinsic::amdgcn_dispatch_id: 4252 return legalizePreloadedArgIntrin(MI, MRI, B, 4253 AMDGPUFunctionArgInfo::DISPATCH_ID); 4254 case Intrinsic::amdgcn_fdiv_fast: 4255 return legalizeFDIVFastIntrin(MI, MRI, B); 4256 case Intrinsic::amdgcn_is_shared: 4257 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4258 case Intrinsic::amdgcn_is_private: 4259 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4260 case Intrinsic::amdgcn_wavefrontsize: { 4261 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4262 MI.eraseFromParent(); 4263 return true; 4264 } 4265 case Intrinsic::amdgcn_s_buffer_load: 4266 return legalizeSBufferLoad(MI, B, Observer); 4267 case Intrinsic::amdgcn_raw_buffer_store: 4268 case Intrinsic::amdgcn_struct_buffer_store: 4269 return legalizeBufferStore(MI, MRI, B, false, false); 4270 case Intrinsic::amdgcn_raw_buffer_store_format: 4271 case Intrinsic::amdgcn_struct_buffer_store_format: 4272 return legalizeBufferStore(MI, MRI, B, false, true); 4273 case Intrinsic::amdgcn_raw_tbuffer_store: 4274 case Intrinsic::amdgcn_struct_tbuffer_store: 4275 return legalizeBufferStore(MI, MRI, B, true, true); 4276 case Intrinsic::amdgcn_raw_buffer_load: 4277 case Intrinsic::amdgcn_struct_buffer_load: 4278 return legalizeBufferLoad(MI, MRI, B, false, false); 4279 case Intrinsic::amdgcn_raw_buffer_load_format: 4280 case Intrinsic::amdgcn_struct_buffer_load_format: 4281 return legalizeBufferLoad(MI, MRI, B, true, false); 4282 case Intrinsic::amdgcn_raw_tbuffer_load: 4283 case Intrinsic::amdgcn_struct_tbuffer_load: 4284 return legalizeBufferLoad(MI, MRI, B, true, true); 4285 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4286 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4287 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4288 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4289 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4290 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4291 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4292 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4293 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4294 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4295 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4296 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4297 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4298 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4299 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4300 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4301 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4302 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4303 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4304 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4305 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4306 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4307 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4308 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4309 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4310 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4311 return legalizeBufferAtomic(MI, B, IntrID); 4312 case Intrinsic::amdgcn_atomic_inc: 4313 return legalizeAtomicIncDec(MI, B, true); 4314 case Intrinsic::amdgcn_atomic_dec: 4315 return legalizeAtomicIncDec(MI, B, false); 4316 case Intrinsic::trap: 4317 return legalizeTrapIntrinsic(MI, MRI, B); 4318 case Intrinsic::debugtrap: 4319 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4320 default: { 4321 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4322 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4323 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4324 return true; 4325 } 4326 } 4327 4328 return true; 4329 } 4330