//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

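// Reduce the number of vector elements so that each resulting piece fits in at
// most 64 bits; e.g. an 80-bit <5 x s16> is split across two pieces and rounds
// up to <3 x s16>.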
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

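// Build the legalization rules for the given subtarget. Within each
// LegalizeRuleSet the rules are tried in the order they are added, so the
// clamps and widenings below only fire when the earlier legalFor/customFor
// clauses did not match.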
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

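  // 32 and 64-bit integer division/remainder is expanded in legalizeUDIV_UREM
  // and legalizeSDIV_SREM (see legalizeCustom below).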
  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

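  // 32-bit integer sources convert directly. 64-bit integer sources are
  // expanded in legalizeITOFP, which converts the two 32-bit halves separately
  // and combines them as ldexp(hi, 32) + lo.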
  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

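  // exp2/log2 are kept legal; exp, log, log10 and pow are custom lowered in
  // terms of them (see legalizeFlog, legalizeFExp and legalizeFPow).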
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

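  // Return true if a load or store must be split into multiple memory
  // operations: vector extloads, accesses wider than the address space allows,
  // awkward register counts (3 dwords without dwordx3 support, or a
  // non-power-of-2 count), and under-aligned accesses the target cannot handle.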
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

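  // Build G_LOAD and G_STORE with a shared rule set; the splitting decisions
  // above apply to both, and a store-only rule below narrows wide scalar
  // truncating stores.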
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
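  // (Uniform selects use SCC, which is copied as a 32-bit register, while
  // divergent selects use an s1 condition in the VCC bank; see the G_ICMP
  // rules above.)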
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

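// Return a register holding the 32-bit aperture (the high half of a flat
// address) for the given LDS or private address space, either from the
// hardware aperture registers or by loading it from the queue pointer.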
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

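// Custom lowering for G_ADDRSPACE_CAST. No-op casts become bitcasts, casts
// involving the 32-bit constant address space extract or rebuild the low half,
// and flat <-> local/private casts compare against the segment's null value
// and splice the 32-bit offset together with the segment aperture.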
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

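// Expand G_FRINT for f64: adding and then subtracting a copysign'd 2^52 rounds
// the value to the nearest integer; sources whose magnitude is too large to
// have a fractional part are passed through unchanged.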
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

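// Extract the 11-bit exponent field from the high 32 bits of an f64 value and
// subtract the 1023 bias.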
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1740 B.buildFPTOSI(S32, FloorMul) : 1741 B.buildFPTOUI(S32, FloorMul); 1742 auto Lo = B.buildFPTOUI(S32, Fma); 1743 1744 B.buildMerge(Dst, { Lo, Hi }); 1745 MI.eraseFromParent(); 1746 1747 return true; 1748 } 1749 1750 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1751 MachineInstr &MI, MachineRegisterInfo &MRI, 1752 MachineIRBuilder &B) const { 1753 MachineFunction &MF = B.getMF(); 1754 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1755 1756 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1757 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1758 1759 // With ieee_mode disabled, the instructions have the correct behavior 1760 // already for G_FMINNUM/G_FMAXNUM 1761 if (!MFI->getMode().IEEE) 1762 return !IsIEEEOp; 1763 1764 if (IsIEEEOp) 1765 return true; 1766 1767 MachineIRBuilder HelperBuilder(MI); 1768 GISelObserverWrapper DummyObserver; 1769 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1770 HelperBuilder.setInstr(MI); 1771 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1772 } 1773 1774 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1775 MachineInstr &MI, MachineRegisterInfo &MRI, 1776 MachineIRBuilder &B) const { 1777 // TODO: Should move some of this into LegalizerHelper. 1778 1779 // TODO: Promote dynamic indexing of s16 to s32 1780 1781 // FIXME: Artifact combiner probably should have replaced the truncated 1782 // constant before this, so we shouldn't need 1783 // getConstantVRegValWithLookThrough. 1784 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1785 MI.getOperand(2).getReg(), MRI); 1786 if (!IdxVal) // Dynamic case will be selected to register indexing. 1787 return true; 1788 1789 Register Dst = MI.getOperand(0).getReg(); 1790 Register Vec = MI.getOperand(1).getReg(); 1791 1792 LLT VecTy = MRI.getType(Vec); 1793 LLT EltTy = VecTy.getElementType(); 1794 assert(EltTy == MRI.getType(Dst)); 1795 1796 B.setInstr(MI); 1797 1798 if (IdxVal->Value < VecTy.getNumElements()) 1799 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1800 else 1801 B.buildUndef(Dst); 1802 1803 MI.eraseFromParent(); 1804 return true; 1805 } 1806 1807 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1808 MachineInstr &MI, MachineRegisterInfo &MRI, 1809 MachineIRBuilder &B) const { 1810 // TODO: Should move some of this into LegalizerHelper. 1811 1812 // TODO: Promote dynamic indexing of s16 to s32 1813 1814 // FIXME: Artifact combiner probably should have replaced the truncated 1815 // constant before this, so we shouldn't need 1816 // getConstantVRegValWithLookThrough. 1817 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1818 MI.getOperand(3).getReg(), MRI); 1819 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1820 return true; 1821 1822 Register Dst = MI.getOperand(0).getReg(); 1823 Register Vec = MI.getOperand(1).getReg(); 1824 Register Ins = MI.getOperand(2).getReg(); 1825 1826 LLT VecTy = MRI.getType(Vec); 1827 LLT EltTy = VecTy.getElementType(); 1828 assert(EltTy == MRI.getType(Ins)); 1829 1830 B.setInstr(MI); 1831 1832 if (IdxVal->Value < VecTy.getNumElements()) 1833 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1834 else 1835 B.buildUndef(Dst); 1836 1837 MI.eraseFromParent(); 1838 return true; 1839 } 1840 1841 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1842 MachineInstr &MI, MachineRegisterInfo &MRI, 1843 MachineIRBuilder &B) const { 1844 const LLT V2S16 = LLT::vector(2, 16); 1845 1846 Register Dst = MI.getOperand(0).getReg(); 1847 Register Src0 = MI.getOperand(1).getReg(); 1848 LLT DstTy = MRI.getType(Dst); 1849 LLT SrcTy = MRI.getType(Src0); 1850 1851 if (SrcTy == V2S16 && DstTy == V2S16 && 1852 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1853 return true; 1854 1855 MachineIRBuilder HelperBuilder(MI); 1856 GISelObserverWrapper DummyObserver; 1857 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1858 HelperBuilder.setInstr(MI); 1859 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1860 } 1861 1862 bool AMDGPULegalizerInfo::legalizeSinCos( 1863 MachineInstr &MI, MachineRegisterInfo &MRI, 1864 MachineIRBuilder &B) const { 1865 B.setInstr(MI); 1866 1867 Register DstReg = MI.getOperand(0).getReg(); 1868 Register SrcReg = MI.getOperand(1).getReg(); 1869 LLT Ty = MRI.getType(DstReg); 1870 unsigned Flags = MI.getFlags(); 1871 1872 Register TrigVal; 1873 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1874 if (ST.hasTrigReducedRange()) { 1875 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1876 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1877 .addUse(MulVal.getReg(0)) 1878 .setMIFlags(Flags).getReg(0); 1879 } else 1880 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1881 1882 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1883 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1884 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1885 .addUse(TrigVal) 1886 .setMIFlags(Flags); 1887 MI.eraseFromParent(); 1888 return true; 1889 } 1890 1891 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1892 Register DstReg, LLT PtrTy, 1893 MachineIRBuilder &B, const GlobalValue *GV, 1894 unsigned Offset, unsigned GAFlags) const { 1895 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1896 // to the following code sequence: 1897 // 1898 // For constant address space: 1899 // s_getpc_b64 s[0:1] 1900 // s_add_u32 s0, s0, $symbol 1901 // s_addc_u32 s1, s1, 0 1902 // 1903 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1904 // a fixup or relocation is emitted to replace $symbol with a literal 1905 // constant, which is a pc-relative offset from the encoding of the $symbol 1906 // operand to the global variable. 
1907 // 1908 // For global address space: 1909 // s_getpc_b64 s[0:1] 1910 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1911 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1912 // 1913 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1914 // fixups or relocations are emitted to replace $symbol@*@lo and 1915 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1916 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1917 // operand to the global variable. 1918 // 1919 // What we want here is an offset from the value returned by s_getpc 1920 // (which is the address of the s_add_u32 instruction) to the global 1921 // variable, but since the encoding of $symbol starts 4 bytes after the start 1922 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1923 // small. This requires us to add 4 to the global variable offset in order to 1924 // compute the correct address. 1925 1926 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1927 1928 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1929 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1930 1931 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1932 .addDef(PCReg); 1933 1934 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1935 if (GAFlags == SIInstrInfo::MO_NONE) 1936 MIB.addImm(0); 1937 else 1938 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1939 1940 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1941 1942 if (PtrTy.getSizeInBits() == 32) 1943 B.buildExtract(DstReg, PCReg, 0); 1944 return true; 1945 } 1946 1947 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1948 MachineInstr &MI, MachineRegisterInfo &MRI, 1949 MachineIRBuilder &B) const { 1950 Register DstReg = MI.getOperand(0).getReg(); 1951 LLT Ty = MRI.getType(DstReg); 1952 unsigned AS = Ty.getAddressSpace(); 1953 1954 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1955 MachineFunction &MF = B.getMF(); 1956 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1957 B.setInstr(MI); 1958 1959 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1960 if (!MFI->isEntryFunction()) { 1961 const Function &Fn = MF.getFunction(); 1962 DiagnosticInfoUnsupported BadLDSDecl( 1963 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1964 DS_Warning); 1965 Fn.getContext().diagnose(BadLDSDecl); 1966 1967 // We currently don't have a way to correctly allocate LDS objects that 1968 // aren't directly associated with a kernel. We do force inlining of 1969 // functions that use local objects. However, if these dead functions are 1970 // not eliminated, we don't want a compile time error. Just emit a warning 1971 // and a trap, since there should be no callable path here. 1972 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1973 B.buildUndef(DstReg); 1974 MI.eraseFromParent(); 1975 return true; 1976 } 1977 1978 // TODO: We could emit code to handle the initialization somewhere. 
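    // LDS globals can only be zero or undef initialized. For those we either
    // keep the symbolic address and mark it for an absolute 32-bit relocation,
    // or fold the object to the static LDS offset the function info assigns to
    // it. Globals with a real initializer are diagnosed below.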
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(
          DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                         *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
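  // G_FMAD is only kept if the hardware mad/mac instructions are usable, i.e.
  // when denormals for the result type are being flushed; otherwise fall back
  // to the generic fmul + fadd expansion in the LegalizerHelper.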
2057 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2058 return true; 2059 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2060 return true; 2061 2062 MachineIRBuilder HelperBuilder(MI); 2063 GISelObserverWrapper DummyObserver; 2064 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2065 HelperBuilder.setInstr(MI); 2066 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2067 } 2068 2069 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2070 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2071 Register DstReg = MI.getOperand(0).getReg(); 2072 Register PtrReg = MI.getOperand(1).getReg(); 2073 Register CmpVal = MI.getOperand(2).getReg(); 2074 Register NewVal = MI.getOperand(3).getReg(); 2075 2076 assert(SITargetLowering::isFlatGlobalAddrSpace( 2077 MRI.getType(PtrReg).getAddressSpace()) && 2078 "this should not have been custom lowered"); 2079 2080 LLT ValTy = MRI.getType(CmpVal); 2081 LLT VecTy = LLT::vector(2, ValTy); 2082 2083 B.setInstr(MI); 2084 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2085 2086 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2087 .addDef(DstReg) 2088 .addUse(PtrReg) 2089 .addUse(PackedVal) 2090 .setMemRefs(MI.memoperands()); 2091 2092 MI.eraseFromParent(); 2093 return true; 2094 } 2095 2096 bool AMDGPULegalizerInfo::legalizeFlog( 2097 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2098 Register Dst = MI.getOperand(0).getReg(); 2099 Register Src = MI.getOperand(1).getReg(); 2100 LLT Ty = B.getMRI()->getType(Dst); 2101 unsigned Flags = MI.getFlags(); 2102 B.setInstr(MI); 2103 2104 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2105 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2106 2107 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2108 MI.eraseFromParent(); 2109 return true; 2110 } 2111 2112 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2113 MachineIRBuilder &B) const { 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Src = MI.getOperand(1).getReg(); 2116 unsigned Flags = MI.getFlags(); 2117 LLT Ty = B.getMRI()->getType(Dst); 2118 B.setInstr(MI); 2119 2120 auto K = B.buildFConstant(Ty, numbers::log2e); 2121 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2122 B.buildFExp2(Dst, Mul, Flags); 2123 MI.eraseFromParent(); 2124 return true; 2125 } 2126 2127 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2128 MachineIRBuilder &B) const { 2129 Register Dst = MI.getOperand(0).getReg(); 2130 Register Src0 = MI.getOperand(1).getReg(); 2131 Register Src1 = MI.getOperand(2).getReg(); 2132 unsigned Flags = MI.getFlags(); 2133 LLT Ty = B.getMRI()->getType(Dst); 2134 B.setInstr(MI); 2135 const LLT S16 = LLT::scalar(16); 2136 const LLT S32 = LLT::scalar(32); 2137 2138 if (Ty == S32) { 2139 auto Log = B.buildFLog2(S32, Src0, Flags); 2140 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2141 .addUse(Log.getReg(0)) 2142 .addUse(Src1) 2143 .setMIFlags(Flags); 2144 B.buildFExp2(Dst, Mul, Flags); 2145 } else if (Ty == S16) { 2146 // There's no f16 fmul_legacy, so we need to convert for it. 
2147 auto Log = B.buildFLog2(S16, Src0, Flags); 2148 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2149 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2150 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2151 .addUse(Ext0.getReg(0)) 2152 .addUse(Ext1.getReg(0)) 2153 .setMIFlags(Flags); 2154 2155 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2156 } else 2157 return false; 2158 2159 MI.eraseFromParent(); 2160 return true; 2161 } 2162 2163 // Find a source register, ignoring any possible source modifiers. 2164 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2165 Register ModSrc = OrigSrc; 2166 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2167 ModSrc = SrcFNeg->getOperand(1).getReg(); 2168 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2169 ModSrc = SrcFAbs->getOperand(1).getReg(); 2170 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2171 ModSrc = SrcFAbs->getOperand(1).getReg(); 2172 return ModSrc; 2173 } 2174 2175 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2176 MachineRegisterInfo &MRI, 2177 MachineIRBuilder &B) const { 2178 B.setInstr(MI); 2179 2180 const LLT S1 = LLT::scalar(1); 2181 const LLT S64 = LLT::scalar(64); 2182 Register Dst = MI.getOperand(0).getReg(); 2183 Register OrigSrc = MI.getOperand(1).getReg(); 2184 unsigned Flags = MI.getFlags(); 2185 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2186 "this should not have been custom lowered"); 2187 2188 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2189 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2190 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2191 // V_FRACT bug is: 2192 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2193 // 2194 // Convert floor(x) to (x - fract(x)) 2195 2196 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2197 .addUse(OrigSrc) 2198 .setMIFlags(Flags); 2199 2200 // Give source modifier matching some assistance before obscuring a foldable 2201 // pattern. 2202 2203 // TODO: We can avoid the neg on the fract? The input sign to fract 2204 // shouldn't matter? 2205 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2206 2207 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2208 2209 Register Min = MRI.createGenericVirtualRegister(S64); 2210 2211 // We don't need to concern ourselves with the snan handling difference, so 2212 // use the one which will directly select. 2213 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2214 if (MFI->getMode().IEEE) 2215 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2216 else 2217 B.buildFMinNum(Min, Fract, Const, Flags); 2218 2219 Register CorrectedFract = Min; 2220 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2221 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2222 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2223 } 2224 2225 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2226 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2227 2228 MI.eraseFromParent(); 2229 return true; 2230 } 2231 2232 // Turn an illegal packed v2s16 build vector into bit operations. 2233 // TODO: This should probably be a bitcast action in LegalizerHelper. 
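// The two s16 sources are concatenated into a single s32 with G_MERGE_VALUES
// and the result is bitcast back to v2s16, so no real 16-bit register
// operations are needed.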
2234 bool AMDGPULegalizerInfo::legalizeBuildVector( 2235 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2236 Register Dst = MI.getOperand(0).getReg(); 2237 const LLT S32 = LLT::scalar(32); 2238 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2239 2240 Register Src0 = MI.getOperand(1).getReg(); 2241 Register Src1 = MI.getOperand(2).getReg(); 2242 assert(MRI.getType(Src0) == LLT::scalar(16)); 2243 2244 B.setInstr(MI); 2245 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2246 B.buildBitcast(Dst, Merge); 2247 2248 MI.eraseFromParent(); 2249 return true; 2250 } 2251 2252 // Return the use branch instruction, otherwise null if the usage is invalid. 2253 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2254 MachineRegisterInfo &MRI, 2255 MachineInstr *&Br, 2256 MachineBasicBlock *&UncondBrTarget) { 2257 Register CondDef = MI.getOperand(0).getReg(); 2258 if (!MRI.hasOneNonDBGUse(CondDef)) 2259 return nullptr; 2260 2261 MachineBasicBlock *Parent = MI.getParent(); 2262 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2263 if (UseMI.getParent() != Parent || 2264 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2265 return nullptr; 2266 2267 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2268 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2269 if (Next == Parent->end()) { 2270 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2271 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2272 return nullptr; 2273 UncondBrTarget = &*NextMBB; 2274 } else { 2275 if (Next->getOpcode() != AMDGPU::G_BR) 2276 return nullptr; 2277 Br = &*Next; 2278 UncondBrTarget = Br->getOperand(0).getMBB(); 2279 } 2280 2281 return &UseMI; 2282 } 2283 2284 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2285 MachineRegisterInfo &MRI, 2286 Register LiveIn, 2287 Register PhyReg) const { 2288 assert(PhyReg.isPhysical() && "Physical register expected"); 2289 2290 // Insert the live-in copy, if required, by defining destination virtual 2291 // register. 2292 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
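  // If the virtual register already has a definition, the copy from the
  // physical argument register was emitted earlier; otherwise mark the
  // physical register live-in to the entry block and emit the copy at its
  // beginning.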
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // register to virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
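    // A masked argument shares its register with other values (e.g. packed
    // work-item/work-group IDs), so extract the field: shift right by the
    // number of trailing zeros in the mask, then AND with the shifted mask.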
2360 const LLT S32 = LLT::scalar(32); 2361 const unsigned Mask = Arg->getMask(); 2362 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2363 2364 Register AndMaskSrc = LiveIn; 2365 2366 if (Shift != 0) { 2367 auto ShiftAmt = B.buildConstant(S32, Shift); 2368 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2369 } 2370 2371 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2372 } else { 2373 B.buildCopy(DstReg, LiveIn); 2374 } 2375 2376 return true; 2377 } 2378 2379 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2380 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2381 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2382 B.setInstr(MI); 2383 2384 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2385 if (!Arg) 2386 return false; 2387 2388 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2389 return false; 2390 2391 MI.eraseFromParent(); 2392 return true; 2393 } 2394 2395 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2396 MachineRegisterInfo &MRI, 2397 MachineIRBuilder &B) const { 2398 B.setInstr(MI); 2399 Register Dst = MI.getOperand(0).getReg(); 2400 LLT DstTy = MRI.getType(Dst); 2401 LLT S16 = LLT::scalar(16); 2402 LLT S32 = LLT::scalar(32); 2403 LLT S64 = LLT::scalar(64); 2404 2405 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2406 return true; 2407 2408 if (DstTy == S16) 2409 return legalizeFDIV16(MI, MRI, B); 2410 if (DstTy == S32) 2411 return legalizeFDIV32(MI, MRI, B); 2412 if (DstTy == S64) 2413 return legalizeFDIV64(MI, MRI, B); 2414 2415 return false; 2416 } 2417 2418 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2419 const LLT S32 = LLT::scalar(32); 2420 2421 auto Cvt0 = B.buildUITOFP(S32, Src); 2422 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2423 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2424 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2425 return B.buildFPTOUI(S32, Mul).getReg(0); 2426 } 2427 2428 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2429 Register DstReg, 2430 Register Num, 2431 Register Den, 2432 bool IsRem) const { 2433 const LLT S1 = LLT::scalar(1); 2434 const LLT S32 = LLT::scalar(32); 2435 2436 // RCP = URECIP(Den) = 2^32 / Den + e 2437 // e is rounding error. 2438 auto RCP = buildDivRCP(B, Den); 2439 2440 // RCP_LO = mul(RCP, Den) 2441 auto RCP_LO = B.buildMul(S32, RCP, Den); 2442 2443 // RCP_HI = mulhu (RCP, Den) */ 2444 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2445 2446 // NEG_RCP_LO = -RCP_LO 2447 auto Zero = B.buildConstant(S32, 0); 2448 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2449 2450 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2451 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2452 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2453 2454 // Calculate the rounding error from the URECIP instruction 2455 // E = mulhu(ABS_RCP_LO, RCP) 2456 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2457 2458 // RCP_A_E = RCP + E 2459 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2460 2461 // RCP_S_E = RCP - E 2462 auto RCP_S_E = B.buildSub(S32, RCP, E); 2463 2464 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2562 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2563 2564 // -(2**32) 2565 auto Mad2 = B.buildFMAD(S32, Trunc, 2566 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2567 2568 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2569 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2570 2571 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2572 } 2573 2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2575 MachineRegisterInfo &MRI, 2576 MachineIRBuilder &B) const { 2577 B.setInstr(MI); 2578 2579 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2580 const LLT S32 = LLT::scalar(32); 2581 const LLT S64 = LLT::scalar(64); 2582 const LLT S1 = LLT::scalar(1); 2583 Register Numer = MI.getOperand(1).getReg(); 2584 Register Denom = MI.getOperand(2).getReg(); 2585 Register RcpLo, RcpHi; 2586 2587 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2588 2589 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2590 2591 auto Zero64 = B.buildConstant(S64, 0); 2592 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2593 2594 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2595 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2596 2597 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2598 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2599 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2600 2601 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2602 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2603 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2604 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2605 2606 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2607 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2608 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2609 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2610 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2611 2612 auto Zero32 = B.buildConstant(S32, 0); 2613 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2614 auto Add2_HiC = 2615 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2616 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2617 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2618 2619 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2620 Register NumerLo = UnmergeNumer.getReg(0); 2621 Register NumerHi = UnmergeNumer.getReg(1); 2622 2623 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2624 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2625 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2626 Register Mul3_Lo = UnmergeMul3.getReg(0); 2627 Register Mul3_Hi = UnmergeMul3.getReg(1); 2628 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2629 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2630 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2631 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2632 2633 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2634 Register DenomLo = UnmergeDenom.getReg(0); 2635 Register DenomHi = UnmergeDenom.getReg(1); 2636 2637 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2638 auto C1 = B.buildSExt(S32, CmpHi); 2639 2640 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2641 auto C2 = B.buildSExt(S32, CmpLo); 2642 2643 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2644 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2645 2646 // TODO: Here and below portions of the code can be enclosed into if/endif. 
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2746 return legalizeSDIV_SREM32(MI, MRI, B); 2747 return false; 2748 } 2749 2750 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2751 MachineRegisterInfo &MRI, 2752 MachineIRBuilder &B) const { 2753 Register Res = MI.getOperand(0).getReg(); 2754 Register LHS = MI.getOperand(1).getReg(); 2755 Register RHS = MI.getOperand(2).getReg(); 2756 2757 uint16_t Flags = MI.getFlags(); 2758 2759 LLT ResTy = MRI.getType(Res); 2760 LLT S32 = LLT::scalar(32); 2761 LLT S64 = LLT::scalar(64); 2762 2763 const MachineFunction &MF = B.getMF(); 2764 bool Unsafe = 2765 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2766 2767 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2768 return false; 2769 2770 if (!Unsafe && ResTy == S32 && 2771 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2772 return false; 2773 2774 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2775 // 1 / x -> RCP(x) 2776 if (CLHS->isExactlyValue(1.0)) { 2777 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2778 .addUse(RHS) 2779 .setMIFlags(Flags); 2780 2781 MI.eraseFromParent(); 2782 return true; 2783 } 2784 2785 // -1 / x -> RCP( FNEG(x) ) 2786 if (CLHS->isExactlyValue(-1.0)) { 2787 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2788 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2789 .addUse(FNeg.getReg(0)) 2790 .setMIFlags(Flags); 2791 2792 MI.eraseFromParent(); 2793 return true; 2794 } 2795 } 2796 2797 // x / y -> x * (1.0 / y) 2798 if (Unsafe) { 2799 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2800 .addUse(RHS) 2801 .setMIFlags(Flags); 2802 B.buildFMul(Res, LHS, RCP, Flags); 2803 2804 MI.eraseFromParent(); 2805 return true; 2806 } 2807 2808 return false; 2809 } 2810 2811 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2812 MachineRegisterInfo &MRI, 2813 MachineIRBuilder &B) const { 2814 B.setInstr(MI); 2815 Register Res = MI.getOperand(0).getReg(); 2816 Register LHS = MI.getOperand(1).getReg(); 2817 Register RHS = MI.getOperand(2).getReg(); 2818 2819 uint16_t Flags = MI.getFlags(); 2820 2821 LLT S16 = LLT::scalar(16); 2822 LLT S32 = LLT::scalar(32); 2823 2824 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2825 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2826 2827 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2828 .addUse(RHSExt.getReg(0)) 2829 .setMIFlags(Flags); 2830 2831 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2832 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2833 2834 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2835 .addUse(RDst.getReg(0)) 2836 .addUse(RHS) 2837 .addUse(LHS) 2838 .setMIFlags(Flags); 2839 2840 MI.eraseFromParent(); 2841 return true; 2842 } 2843 2844 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2845 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2846 static void toggleSPDenormMode(bool Enable, 2847 MachineIRBuilder &B, 2848 const GCNSubtarget &ST, 2849 AMDGPU::SIModeRegisterDefaults Mode) { 2850 // Set SP denorm mode to this value. 2851 unsigned SPDenormMode = 2852 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2853 2854 if (ST.hasDenormModeInst()) { 2855 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
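    // The S_DENORM_MODE immediate packs the FP32 denorm mode in bits [1:0] and
    // the FP64/FP16 mode in bits [3:2], so keep the default DP value and only
    // change the SP field.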
2856 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2857 2858 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2859 B.buildInstr(AMDGPU::S_DENORM_MODE) 2860 .addImm(NewDenormModeValue); 2861 2862 } else { 2863 // Select FP32 bit field in mode register. 2864 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2865 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2866 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2867 2868 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2869 .addImm(SPDenormMode) 2870 .addImm(SPDenormModeBitField); 2871 } 2872 } 2873 2874 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2875 MachineRegisterInfo &MRI, 2876 MachineIRBuilder &B) const { 2877 B.setInstr(MI); 2878 Register Res = MI.getOperand(0).getReg(); 2879 Register LHS = MI.getOperand(1).getReg(); 2880 Register RHS = MI.getOperand(2).getReg(); 2881 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2882 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2883 2884 uint16_t Flags = MI.getFlags(); 2885 2886 LLT S32 = LLT::scalar(32); 2887 LLT S1 = LLT::scalar(1); 2888 2889 auto One = B.buildFConstant(S32, 1.0f); 2890 2891 auto DenominatorScaled = 2892 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2893 .addUse(LHS) 2894 .addUse(RHS) 2895 .addImm(0) 2896 .setMIFlags(Flags); 2897 auto NumeratorScaled = 2898 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2899 .addUse(LHS) 2900 .addUse(RHS) 2901 .addImm(1) 2902 .setMIFlags(Flags); 2903 2904 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2905 .addUse(DenominatorScaled.getReg(0)) 2906 .setMIFlags(Flags); 2907 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2908 2909 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2910 // aren't modeled as reading it. 
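  // If FP32 denormals are flushed by default, temporarily enable them around
  // the FMA-based refinement below; the scaled intermediate values may be
  // denormal and must not be flushed for the result to be correct.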
2911 if (!Mode.allFP32Denormals()) 2912 toggleSPDenormMode(true, B, ST, Mode); 2913 2914 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2915 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2916 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2917 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2918 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2919 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2920 2921 if (!Mode.allFP32Denormals()) 2922 toggleSPDenormMode(false, B, ST, Mode); 2923 2924 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2925 .addUse(Fma4.getReg(0)) 2926 .addUse(Fma1.getReg(0)) 2927 .addUse(Fma3.getReg(0)) 2928 .addUse(NumeratorScaled.getReg(1)) 2929 .setMIFlags(Flags); 2930 2931 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2932 .addUse(Fmas.getReg(0)) 2933 .addUse(RHS) 2934 .addUse(LHS) 2935 .setMIFlags(Flags); 2936 2937 MI.eraseFromParent(); 2938 return true; 2939 } 2940 2941 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2942 MachineRegisterInfo &MRI, 2943 MachineIRBuilder &B) const { 2944 B.setInstr(MI); 2945 Register Res = MI.getOperand(0).getReg(); 2946 Register LHS = MI.getOperand(1).getReg(); 2947 Register RHS = MI.getOperand(2).getReg(); 2948 2949 uint16_t Flags = MI.getFlags(); 2950 2951 LLT S64 = LLT::scalar(64); 2952 LLT S1 = LLT::scalar(1); 2953 2954 auto One = B.buildFConstant(S64, 1.0); 2955 2956 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2957 .addUse(LHS) 2958 .addUse(RHS) 2959 .addImm(0) 2960 .setMIFlags(Flags); 2961 2962 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2963 2964 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2965 .addUse(DivScale0.getReg(0)) 2966 .setMIFlags(Flags); 2967 2968 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2969 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2970 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2971 2972 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2973 .addUse(LHS) 2974 .addUse(RHS) 2975 .addImm(1) 2976 .setMIFlags(Flags); 2977 2978 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2979 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2980 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2981 2982 Register Scale; 2983 if (!ST.hasUsableDivScaleConditionOutput()) { 2984 // Workaround a hardware bug on SI where the condition output from div_scale 2985 // is not usable. 
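    // Reconstruct the condition by comparing the high halves of the operands
    // with the high halves of the div_scale results to work out which operand
    // was scaled, and combine the two checks with xor.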
2986 2987 LLT S32 = LLT::scalar(32); 2988 2989 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2990 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2991 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2992 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2993 2994 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2995 Scale1Unmerge.getReg(1)); 2996 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2997 Scale0Unmerge.getReg(1)); 2998 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2999 } else { 3000 Scale = DivScale1.getReg(1); 3001 } 3002 3003 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3004 .addUse(Fma4.getReg(0)) 3005 .addUse(Fma3.getReg(0)) 3006 .addUse(Mul.getReg(0)) 3007 .addUse(Scale) 3008 .setMIFlags(Flags); 3009 3010 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3011 .addUse(Fmas.getReg(0)) 3012 .addUse(RHS) 3013 .addUse(LHS) 3014 .setMIFlags(Flags); 3015 3016 MI.eraseFromParent(); 3017 return true; 3018 } 3019 3020 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3021 MachineRegisterInfo &MRI, 3022 MachineIRBuilder &B) const { 3023 B.setInstr(MI); 3024 Register Res = MI.getOperand(0).getReg(); 3025 Register LHS = MI.getOperand(2).getReg(); 3026 Register RHS = MI.getOperand(3).getReg(); 3027 uint16_t Flags = MI.getFlags(); 3028 3029 LLT S32 = LLT::scalar(32); 3030 LLT S1 = LLT::scalar(1); 3031 3032 auto Abs = B.buildFAbs(S32, RHS, Flags); 3033 const APFloat C0Val(1.0f); 3034 3035 auto C0 = B.buildConstant(S32, 0x6f800000); 3036 auto C1 = B.buildConstant(S32, 0x2f800000); 3037 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3038 3039 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3040 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3041 3042 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3043 3044 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3045 .addUse(Mul0.getReg(0)) 3046 .setMIFlags(Flags); 3047 3048 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3049 3050 B.buildFMul(Res, Sel, Mul1, Flags); 3051 3052 MI.eraseFromParent(); 3053 return true; 3054 } 3055 3056 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3057 MachineRegisterInfo &MRI, 3058 MachineIRBuilder &B) const { 3059 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3060 if (!MFI->isEntryFunction()) { 3061 return legalizePreloadedArgIntrin(MI, MRI, B, 3062 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3063 } 3064 3065 B.setInstr(MI); 3066 3067 uint64_t Offset = 3068 ST.getTargetLowering()->getImplicitParameterOffset( 3069 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3070 Register DstReg = MI.getOperand(0).getReg(); 3071 LLT DstTy = MRI.getType(DstReg); 3072 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3073 3074 const ArgDescriptor *Arg; 3075 const TargetRegisterClass *RC; 3076 std::tie(Arg, RC) 3077 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3078 if (!Arg) 3079 return false; 3080 3081 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3082 if (!loadInputValue(KernargPtrReg, B, Arg)) 3083 return false; 3084 3085 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3086 MI.eraseFromParent(); 3087 return true; 3088 } 3089 3090 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3091 MachineRegisterInfo &MRI, 3092 MachineIRBuilder &B, 3093 unsigned AddrSpace) const { 3094 B.setInstr(MI); 3095 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3096 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3097 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3098 MI.eraseFromParent(); 3099 return true; 3100 } 3101 3102 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3103 // offset (the offset that is included in bounds checking and swizzling, to be 3104 // split between the instruction's voffset and immoffset fields) and soffset 3105 // (the offset that is excluded from bounds checking and swizzling, to go in 3106 // the instruction's soffset field). This function takes the first kind of 3107 // offset and figures out how to split it between voffset and immoffset. 3108 std::tuple<Register, unsigned, unsigned> 3109 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3110 Register OrigOffset) const { 3111 const unsigned MaxImm = 4095; 3112 Register BaseReg; 3113 unsigned TotalConstOffset; 3114 MachineInstr *OffsetDef; 3115 const LLT S32 = LLT::scalar(32); 3116 3117 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3118 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3119 3120 unsigned ImmOffset = TotalConstOffset; 3121 3122 // If the immediate value is too big for the immoffset field, put the value 3123 // and -4096 into the immoffset field so that the value that is copied/added 3124 // for the voffset field is a multiple of 4096, and it stands more chance 3125 // of being CSEd with the copy/add for another similar load/store. 3126 // However, do not do that rounding down to a multiple of 4096 if that is a 3127 // negative number, as it appears to be illegal to have a negative offset 3128 // in the vgpr, even if adding the immediate offset makes it positive. 3129 unsigned Overflow = ImmOffset & ~MaxImm; 3130 ImmOffset -= Overflow; 3131 if ((int32_t)Overflow < 0) { 3132 Overflow += ImmOffset; 3133 ImmOffset = 0; 3134 } 3135 3136 if (Overflow != 0) { 3137 if (!BaseReg) { 3138 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3139 } else { 3140 auto OverflowVal = B.buildConstant(S32, Overflow); 3141 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3142 } 3143 } 3144 3145 if (!BaseReg) 3146 BaseReg = B.buildConstant(S32, 0).getReg(0); 3147 3148 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3149 } 3150 3151 /// Handle register layout difference for f16 images for some subtargets. 3152 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3153 MachineRegisterInfo &MRI, 3154 Register Reg) const { 3155 if (!ST.hasUnpackedD16VMem()) 3156 return Reg; 3157 3158 const LLT S16 = LLT::scalar(16); 3159 const LLT S32 = LLT::scalar(32); 3160 LLT StoreVT = MRI.getType(Reg); 3161 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3162 3163 auto Unmerge = B.buildUnmerge(S16, Reg); 3164 3165 SmallVector<Register, 4> WideRegs; 3166 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3167 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3168 3169 int NumElts = StoreVT.getNumElements(); 3170 3171 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3172 } 3173 3174 Register AMDGPULegalizerInfo::fixStoreSourceType( 3175 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3176 MachineRegisterInfo *MRI = B.getMRI(); 3177 LLT Ty = MRI->getType(VData); 3178 3179 const LLT S16 = LLT::scalar(16); 3180 3181 // Fixup illegal register types for i8 stores. 
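  // Buffer stores operate on 32-bit based registers, so any-extend sub-32-bit
  // scalars. Small f16 vectors for the format variants may also need to be
  // widened per-element when the subtarget uses unpacked D16 memory
  // instructions (see handleD16VData).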
3182 if (Ty == LLT::scalar(8) || Ty == S16) { 3183 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3184 return AnyExt; 3185 } 3186 3187 if (Ty.isVector()) { 3188 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3189 if (IsFormat) 3190 return handleD16VData(B, *MRI, VData); 3191 } 3192 } 3193 3194 return VData; 3195 } 3196 3197 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3198 MachineRegisterInfo &MRI, 3199 MachineIRBuilder &B, 3200 bool IsTyped, 3201 bool IsFormat) const { 3202 B.setInstr(MI); 3203 3204 Register VData = MI.getOperand(1).getReg(); 3205 LLT Ty = MRI.getType(VData); 3206 LLT EltTy = Ty.getScalarType(); 3207 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3208 const LLT S32 = LLT::scalar(32); 3209 3210 VData = fixStoreSourceType(B, VData, IsFormat); 3211 Register RSrc = MI.getOperand(2).getReg(); 3212 3213 MachineMemOperand *MMO = *MI.memoperands_begin(); 3214 const int MemSize = MMO->getSize(); 3215 3216 unsigned ImmOffset; 3217 unsigned TotalOffset; 3218 3219 // The typed intrinsics add an immediate after the registers. 3220 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3221 3222 // The struct intrinsic variants add one additional operand over raw. 3223 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3224 Register VIndex; 3225 int OpOffset = 0; 3226 if (HasVIndex) { 3227 VIndex = MI.getOperand(3).getReg(); 3228 OpOffset = 1; 3229 } 3230 3231 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3232 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3233 3234 unsigned Format = 0; 3235 if (IsTyped) { 3236 Format = MI.getOperand(5 + OpOffset).getImm(); 3237 ++OpOffset; 3238 } 3239 3240 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3241 3242 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3243 if (TotalOffset != 0) 3244 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3245 3246 unsigned Opc; 3247 if (IsTyped) { 3248 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3249 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3250 } else if (IsFormat) { 3251 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3252 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3253 } else { 3254 switch (MemSize) { 3255 case 1: 3256 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3257 break; 3258 case 2: 3259 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3260 break; 3261 default: 3262 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3263 break; 3264 } 3265 } 3266 3267 if (!VIndex) 3268 VIndex = B.buildConstant(S32, 0).getReg(0); 3269 3270 auto MIB = B.buildInstr(Opc) 3271 .addUse(VData) // vdata 3272 .addUse(RSrc) // rsrc 3273 .addUse(VIndex) // vindex 3274 .addUse(VOffset) // voffset 3275 .addUse(SOffset) // soffset 3276 .addImm(ImmOffset); // offset(imm) 3277 3278 if (IsTyped) 3279 MIB.addImm(Format); 3280 3281 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3282 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3283 .addMemOperand(MMO); 3284 3285 MI.eraseFromParent(); 3286 return true; 3287 } 3288 3289 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3290 MachineRegisterInfo &MRI, 3291 MachineIRBuilder &B, 3292 bool IsFormat, 3293 bool IsTyped) const { 3294 B.setInstr(MI); 3295 3296 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
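  // The raw and struct variants share this path; the struct forms carry an
  // extra vindex operand and the typed (tbuffer) forms an extra format
  // immediate, so the presence of vindex is inferred from the total operand
  // count below.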
3297 MachineMemOperand *MMO = *MI.memoperands_begin(); 3298 const int MemSize = MMO->getSize(); 3299 const LLT S32 = LLT::scalar(32); 3300 3301 Register Dst = MI.getOperand(0).getReg(); 3302 Register RSrc = MI.getOperand(2).getReg(); 3303 3304 // The typed intrinsics add an immediate after the registers. 3305 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3306 3307 // The struct intrinsic variants add one additional operand over raw. 3308 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3309 Register VIndex; 3310 int OpOffset = 0; 3311 if (HasVIndex) { 3312 VIndex = MI.getOperand(3).getReg(); 3313 OpOffset = 1; 3314 } 3315 3316 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3317 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3318 3319 unsigned Format = 0; 3320 if (IsTyped) { 3321 Format = MI.getOperand(5 + OpOffset).getImm(); 3322 ++OpOffset; 3323 } 3324 3325 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3326 unsigned ImmOffset; 3327 unsigned TotalOffset; 3328 3329 LLT Ty = MRI.getType(Dst); 3330 LLT EltTy = Ty.getScalarType(); 3331 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3332 const bool Unpacked = ST.hasUnpackedD16VMem(); 3333 3334 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3335 if (TotalOffset != 0) 3336 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3337 3338 unsigned Opc; 3339 3340 if (IsTyped) { 3341 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3342 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3343 } else if (IsFormat) { 3344 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3345 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3346 } else { 3347 switch (MemSize) { 3348 case 1: 3349 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3350 break; 3351 case 2: 3352 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3353 break; 3354 default: 3355 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3356 break; 3357 } 3358 } 3359 3360 Register LoadDstReg; 3361 3362 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3363 LLT UnpackedTy = Ty.changeElementSize(32); 3364 3365 if (IsExtLoad) 3366 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3367 else if (Unpacked && IsD16 && Ty.isVector()) 3368 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3369 else 3370 LoadDstReg = Dst; 3371 3372 if (!VIndex) 3373 VIndex = B.buildConstant(S32, 0).getReg(0); 3374 3375 auto MIB = B.buildInstr(Opc) 3376 .addDef(LoadDstReg) // vdata 3377 .addUse(RSrc) // rsrc 3378 .addUse(VIndex) // vindex 3379 .addUse(VOffset) // voffset 3380 .addUse(SOffset) // soffset 3381 .addImm(ImmOffset); // offset(imm) 3382 3383 if (IsTyped) 3384 MIB.addImm(Format); 3385 3386 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3387 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3388 .addMemOperand(MMO); 3389 3390 if (LoadDstReg != Dst) { 3391 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3392 3393 // Widen result for extending loads was widened. 
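// A compiled-out decision-table sketch of how the intermediate load
// destination is chosen a few lines above: sub-dword scalar loads go through
// an s32 temporary and a truncate, d16 vector loads on unpacked subtargets go
// through an <N x s32> temporary and a repack, everything else loads directly
// into the original destination. The enum and helper names are illustrative.
#if 0
#include <cassert>

enum class BufferLoadTemp {
  Direct,      // load straight into the original destination
  WidenedS32,  // sub-dword scalar: load into s32, then truncate
  UnpackedD16  // d16 vector on unpacked subtargets: <N x s32>, then repack
};

static BufferLoadTemp classifyBufferLoadTemp(bool IsD16, bool IsVector,
                                             unsigned MemSizeBytes,
                                             bool HasUnpackedD16) {
  // Mirrors: IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector())
  const bool IsExtLoad = (!IsD16 && MemSizeBytes < 4) || (IsD16 && !IsVector);
  if (IsExtLoad)
    return BufferLoadTemp::WidenedS32;
  if (HasUnpackedD16 && IsD16 && IsVector)
    return BufferLoadTemp::UnpackedD16;
  return BufferLoadTemp::Direct;
}

static void classifyBufferLoadTempExamples() {
  // ubyte/ushort loads widen to s32 and truncate afterwards.
  assert(classifyBufferLoadTemp(false, false, 1, false) ==
         BufferLoadTemp::WidenedS32);
  // <4 x s16> format load on an unpacked-d16 subtarget loads <4 x s32>.
  assert(classifyBufferLoadTemp(true, true, 8, true) ==
         BufferLoadTemp::UnpackedD16);
  // <4 x s32> load needs no temporary at all.
  assert(classifyBufferLoadTemp(false, true, 16, false) ==
         BufferLoadTemp::Direct);
}
#endif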
3394 if (IsExtLoad) 3395 B.buildTrunc(Dst, LoadDstReg); 3396 else { 3397 // Repack to original 16-bit vector result 3398 // FIXME: G_TRUNC should work, but legalization currently fails 3399 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3400 SmallVector<Register, 4> Repack; 3401 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3402 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3403 B.buildMerge(Dst, Repack); 3404 } 3405 } 3406 3407 MI.eraseFromParent(); 3408 return true; 3409 } 3410 3411 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3412 MachineIRBuilder &B, 3413 bool IsInc) const { 3414 B.setInstr(MI); 3415 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3416 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3417 B.buildInstr(Opc) 3418 .addDef(MI.getOperand(0).getReg()) 3419 .addUse(MI.getOperand(2).getReg()) 3420 .addUse(MI.getOperand(3).getReg()) 3421 .cloneMemRefs(MI); 3422 MI.eraseFromParent(); 3423 return true; 3424 } 3425 3426 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3427 switch (IntrID) { 3428 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3429 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3430 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3431 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3432 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3433 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3434 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3435 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3436 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3437 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3438 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3439 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3440 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3441 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3442 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3443 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3444 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3445 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3446 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3447 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3448 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3449 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3450 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3451 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3452 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3453 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3454 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3455 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3456 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3457 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3458 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3459 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3460 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3461 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3462 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3463 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3464 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3465 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3466 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3467 default: 3468 llvm_unreachable("unhandled atomic opcode"); 3469 } 3470 } 3471 3472 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3473 MachineIRBuilder &B, 3474 Intrinsic::ID IID) const { 3475 B.setInstr(MI); 3476 3477 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3478 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3479 3480 Register Dst = MI.getOperand(0).getReg(); 3481 Register VData = 
MI.getOperand(2).getReg(); 3482 3483 Register CmpVal; 3484 int OpOffset = 0; 3485 3486 if (IsCmpSwap) { 3487 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3488 ++OpOffset; 3489 } 3490 3491 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3492 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3493 3494 // The struct intrinsic variants add one additional operand over raw. 3495 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3496 Register VIndex; 3497 if (HasVIndex) { 3498 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3499 ++OpOffset; 3500 } 3501 3502 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3503 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3504 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3505 3506 MachineMemOperand *MMO = *MI.memoperands_begin(); 3507 3508 unsigned ImmOffset; 3509 unsigned TotalOffset; 3510 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3511 if (TotalOffset != 0) 3512 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3513 3514 if (!VIndex) 3515 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3516 3517 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3518 .addDef(Dst) 3519 .addUse(VData); // vdata 3520 3521 if (IsCmpSwap) 3522 MIB.addReg(CmpVal); 3523 3524 MIB.addUse(RSrc) // rsrc 3525 .addUse(VIndex) // vindex 3526 .addUse(VOffset) // voffset 3527 .addUse(SOffset) // soffset 3528 .addImm(ImmOffset) // offset(imm) 3529 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3530 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3531 .addMemOperand(MMO); 3532 3533 MI.eraseFromParent(); 3534 return true; 3535 } 3536 3537 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3538 /// vector with s16 typed elements. 3539 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3540 SmallVectorImpl<Register> &PackedAddrs, 3541 int AddrIdx, int DimIdx, int NumVAddrs, 3542 int NumGradients) { 3543 const LLT S16 = LLT::scalar(16); 3544 const LLT V2S16 = LLT::vector(2, 16); 3545 3546 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3547 MachineOperand &SrcOp = MI.getOperand(I); 3548 if (!SrcOp.isReg()) 3549 continue; // _L to _LZ may have eliminated this. 3550 3551 Register AddrReg = SrcOp.getReg(); 3552 3553 if (I < DimIdx) { 3554 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3555 PackedAddrs.push_back(AddrReg); 3556 } else { 3557 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3558 // derivatives dx/dh and dx/dv are packed with undef. 3559 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3560 ((NumGradients / 2) % 2 == 1 && 3561 (I == DimIdx + (NumGradients / 2) - 1 || 3562 I == DimIdx + NumGradients - 1)) || 3563 // Check for _L to _LZ optimization 3564 !MI.getOperand(I + 1).isReg()) { 3565 PackedAddrs.push_back( 3566 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3567 .getReg(0)); 3568 } else { 3569 PackedAddrs.push_back( 3570 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3571 .getReg(0)); 3572 ++I; 3573 } 3574 } 3575 } 3576 } 3577 3578 /// Convert from separate vaddr components to a single vector address register, 3579 /// and replace the remaining operands with $noreg. 
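// A compiled-out model of the pairing decision in packImageA16AddressToDwords
// above, restricted to the coordinate/gradient region and assuming no extra
// leading arguments (AddrIdx == DimIdx) and no operands removed by the _L to
// _LZ optimization. Each 16-bit component is packed with its successor into
// one dword, except that the last component of each gradient half (when the
// half has an odd count) and the final odd coordinate are padded with an
// undef lane, represented here by -1. Names are illustrative only.
#if 0
#include <cassert>
#include <utility>
#include <vector>

static std::vector<std::pair<int, int>>
pairA16Addresses(int NumVAddrs, int NumGradients) {
  std::vector<std::pair<int, int>> Pairs;
  for (int I = 0; I < NumVAddrs; ++I) {
    const bool IsLast = I + 1 >= NumVAddrs;
    const bool EndsGradientHalf =
        (NumGradients / 2) % 2 == 1 &&
        (I == (NumGradients / 2) - 1 || I == NumGradients - 1);
    if (IsLast || EndsGradientHalf) {
      Pairs.push_back({I, -1});    // pad the high lane with undef
    } else {
      Pairs.push_back({I, I + 1}); // pack two components per dword
      ++I;                         // the successor has been consumed
    }
  }
  return Pairs;
}

static void pairA16AddressesExample() {
  // 2D sample.d: dx/dh, dy/dh, dx/dv, dy/dv, x, y -> 3 fully packed dwords.
  auto P2D = pairA16Addresses(/*NumVAddrs=*/6, /*NumGradients=*/4);
  assert(P2D.size() == 3 && P2D[2].first == 4 && P2D[2].second == 5);
  // 1D sample.d: dx/dh, dx/dv, x -> every component is padded with undef.
  auto P1D = pairA16Addresses(/*NumVAddrs=*/3, /*NumGradients=*/2);
  assert(P1D.size() == 3 && P1D[0].second == -1 && P1D[2].second == -1);
}
#endif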
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or to directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ?
      -1 : getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // We are expecting to get an error flag since TFE is on and dmask is 0.
  // Force dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
3756 // 3757 // However, we currently have no way of hinting to the register allocator 3758 // that MIMG addresses should be placed contiguously when it is possible to 3759 // do so, so force non-NSA for the common 2-address case as a heuristic. 3760 // 3761 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3762 // allocation when possible. 3763 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3764 3765 // Rewrite the addressing register layout before doing anything else. 3766 if (IsA16) { 3767 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3768 // should be introduced. 3769 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3770 return false; 3771 3772 if (NumVAddrs > 1) { 3773 SmallVector<Register, 4> PackedRegs; 3774 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3775 NumGradients); 3776 3777 if (!UseNSA && PackedRegs.size() > 1) { 3778 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3779 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3780 PackedRegs[0] = Concat.getReg(0); 3781 PackedRegs.resize(1); 3782 } 3783 3784 const int NumPacked = PackedRegs.size(); 3785 for (int I = 0; I != NumVAddrs; ++I) { 3786 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3787 if (!SrcOp.isReg()) { 3788 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3789 continue; 3790 } 3791 3792 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3793 3794 if (I < NumPacked) 3795 SrcOp.setReg(PackedRegs[I]); 3796 else 3797 SrcOp.setReg(AMDGPU::NoRegister); 3798 } 3799 } 3800 } else if (!UseNSA && NumVAddrs > 1) { 3801 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3802 } 3803 3804 3805 if (BaseOpcode->Store) { // No TFE for stores? 3806 // TODO: Handle dmask trim 3807 Register VData = MI.getOperand(1).getReg(); 3808 LLT Ty = MRI->getType(VData); 3809 if (!Ty.isVector() || Ty.getElementType() != S16) 3810 return true; 3811 3812 B.setInstr(MI); 3813 3814 Register RepackedReg = handleD16VData(B, *MRI, VData); 3815 if (RepackedReg != VData) { 3816 MI.getOperand(1).setReg(RepackedReg); 3817 } 3818 3819 return true; 3820 } 3821 3822 Register DstReg = MI.getOperand(0).getReg(); 3823 LLT Ty = MRI->getType(DstReg); 3824 const LLT EltTy = Ty.getScalarType(); 3825 const bool IsD16 = Ty.getScalarType() == S16; 3826 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3827 3828 // Confirm that the return type is large enough for the dmask specified 3829 if (NumElts < DMaskLanes) 3830 return false; 3831 3832 if (NumElts > 4 || DMaskLanes > 4) 3833 return false; 3834 3835 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3836 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3837 3838 // The raw dword aligned data component of the load. The only legal cases 3839 // where this matters should be when using the packed D16 format, for 3840 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3841 LLT RoundedTy; 3842 3843 // S32 vector to to cover all data, plus TFE result element. 3844 LLT TFETy; 3845 3846 // Register type to use for each loaded component. Will be S32 or V2S16. 
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
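// A compiled-out sketch of the arithmetic behind RoundedTy and TFETy above:
// round the dmask-trimmed result up to a whole number of dwords, and add one
// extra dword for the TFE error flag. The struct and helper names are
// illustrative only; the unpacked-d16 case assumes every 16-bit component
// occupies a full dword, as in the code above.
#if 0
#include <cassert>

struct LoadResultShape {
  unsigned RoundedNumElts; // element count of RoundedTy
  unsigned TFENumDwords;   // element count of TFETy (always 32-bit elements)
};

static LoadResultShape computeLoadResultShape(unsigned AdjustedNumElts,
                                              unsigned EltSizeBits,
                                              bool UnpackedD16) {
  if (EltSizeBits == 16 && UnpackedD16) {
    // Unpacked layout: every 16-bit component occupies a full dword.
    return {AdjustedNumElts, AdjustedNumElts + 1};
  }
  const unsigned RoundedDwords = (AdjustedNumElts * EltSizeBits + 31) / 32;
  const unsigned RoundedSize = 32 * RoundedDwords;
  return {RoundedSize / EltSizeBits, RoundedDwords + 1};
}

static void computeLoadResultShapeExamples() {
  // Packed d16 load of 3 components rounds up to <4 x s16> (2 dwords),
  // 3 dwords with TFE.
  LoadResultShape S0 = computeLoadResultShape(3, 16, /*UnpackedD16=*/false);
  assert(S0.RoundedNumElts == 4 && S0.TFENumDwords == 3);
  // Unpacked d16 load of 4 components: <4 x s32>, 5 dwords with TFE.
  LoadResultShape S1 = computeLoadResultShape(4, 16, /*UnpackedD16=*/true);
  assert(S1.RoundedNumElts == 4 && S1.TFENumDwords == 5);
  // 32-bit load of 2 components needs no rounding: 2 dwords, 3 with TFE.
  LoadResultShape S2 = computeLoadResultShape(2, 32, /*UnpackedD16=*/false);
  assert(S2.RoundedNumElts == 2 && S2.TFENumDwords == 3);
}
#endif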
3943 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3944 for (Register &Reg : ResultRegs) 3945 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3946 } else if (ST.hasUnpackedD16VMem()) { 3947 for (Register &Reg : ResultRegs) 3948 Reg = B.buildTrunc(S16, Reg).getReg(0); 3949 } 3950 } 3951 3952 auto padWithUndef = [&](LLT Ty, int NumElts) { 3953 if (NumElts == 0) 3954 return; 3955 Register Undef = B.buildUndef(Ty).getReg(0); 3956 for (int I = 0; I != NumElts; ++I) 3957 ResultRegs.push_back(Undef); 3958 }; 3959 3960 // Pad out any elements eliminated due to the dmask. 3961 LLT ResTy = MRI->getType(ResultRegs[0]); 3962 if (!ResTy.isVector()) { 3963 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3964 B.buildBuildVector(DstReg, ResultRegs); 3965 return true; 3966 } 3967 3968 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3969 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3970 3971 // Deal with the one annoying legal case. 3972 const LLT V3S16 = LLT::vector(3, 16); 3973 if (Ty == V3S16) { 3974 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 3975 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 3976 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 3977 return true; 3978 } 3979 3980 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 3981 B.buildConcatVectors(DstReg, ResultRegs); 3982 return true; 3983 } 3984 3985 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3986 MachineInstr &MI, MachineIRBuilder &B, 3987 GISelChangeObserver &Observer) const { 3988 Register Dst = MI.getOperand(0).getReg(); 3989 LLT Ty = B.getMRI()->getType(Dst); 3990 unsigned Size = Ty.getSizeInBits(); 3991 MachineFunction &MF = B.getMF(); 3992 3993 Observer.changingInstr(MI); 3994 3995 // FIXME: We don't really need this intermediate instruction. The intrinsic 3996 // should be fixed to have a memory operand. Since it's readnone, we're not 3997 // allowed to add one. 3998 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 3999 MI.RemoveOperand(1); // Remove intrinsic ID 4000 4001 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4002 // TODO: Should this use datalayout alignment? 4003 const unsigned MemSize = (Size + 7) / 8; 4004 const Align MemAlign(4); 4005 MachineMemOperand *MMO = MF.getMachineMemOperand( 4006 MachinePointerInfo(), 4007 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4008 MachineMemOperand::MOInvariant, 4009 MemSize, MemAlign); 4010 MI.addMemOperand(MF, MMO); 4011 4012 // There are no 96-bit result scalar loads, but widening to 128-bit should 4013 // always be legal. We may need to restore this to a 96-bit result if it turns 4014 // out this needs to be converted to a vector load during RegBankSelect. 4015 if (!isPowerOf2_32(Size)) { 4016 LegalizerHelper Helper(MF, *this, Observer, B); 4017 B.setInstr(MI); 4018 4019 if (Ty.isVector()) 4020 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4021 else 4022 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4023 } 4024 4025 Observer.changedInstr(MI); 4026 return true; 4027 } 4028 4029 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4030 MachineRegisterInfo &MRI, 4031 MachineIRBuilder &B) const { 4032 B.setInstr(MI); 4033 4034 // Is non-HSA path or trap-handler disabled? 
If so, insert an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
4123 // FIXME: IRTranslator should probably not do this 4124 B.buildBr(*CondBrTarget); 4125 } 4126 4127 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4128 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4129 MI.eraseFromParent(); 4130 BrCond->eraseFromParent(); 4131 return true; 4132 } 4133 4134 return false; 4135 } 4136 case Intrinsic::amdgcn_loop: { 4137 MachineInstr *Br = nullptr; 4138 MachineBasicBlock *UncondBrTarget = nullptr; 4139 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4140 const SIRegisterInfo *TRI 4141 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4142 4143 B.setInstr(*BrCond); 4144 4145 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4146 Register Reg = MI.getOperand(2).getReg(); 4147 B.buildInstr(AMDGPU::SI_LOOP) 4148 .addUse(Reg) 4149 .addMBB(UncondBrTarget); 4150 4151 if (Br) 4152 Br->getOperand(0).setMBB(CondBrTarget); 4153 else 4154 B.buildBr(*CondBrTarget); 4155 4156 MI.eraseFromParent(); 4157 BrCond->eraseFromParent(); 4158 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4159 return true; 4160 } 4161 4162 return false; 4163 } 4164 case Intrinsic::amdgcn_kernarg_segment_ptr: 4165 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4166 B.setInstr(MI); 4167 // This only makes sense to call in a kernel, so just lower to null. 4168 B.buildConstant(MI.getOperand(0).getReg(), 0); 4169 MI.eraseFromParent(); 4170 return true; 4171 } 4172 4173 return legalizePreloadedArgIntrin( 4174 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4175 case Intrinsic::amdgcn_implicitarg_ptr: 4176 return legalizeImplicitArgPtr(MI, MRI, B); 4177 case Intrinsic::amdgcn_workitem_id_x: 4178 return legalizePreloadedArgIntrin(MI, MRI, B, 4179 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4180 case Intrinsic::amdgcn_workitem_id_y: 4181 return legalizePreloadedArgIntrin(MI, MRI, B, 4182 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4183 case Intrinsic::amdgcn_workitem_id_z: 4184 return legalizePreloadedArgIntrin(MI, MRI, B, 4185 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4186 case Intrinsic::amdgcn_workgroup_id_x: 4187 return legalizePreloadedArgIntrin(MI, MRI, B, 4188 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4189 case Intrinsic::amdgcn_workgroup_id_y: 4190 return legalizePreloadedArgIntrin(MI, MRI, B, 4191 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4192 case Intrinsic::amdgcn_workgroup_id_z: 4193 return legalizePreloadedArgIntrin(MI, MRI, B, 4194 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4195 case Intrinsic::amdgcn_dispatch_ptr: 4196 return legalizePreloadedArgIntrin(MI, MRI, B, 4197 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4198 case Intrinsic::amdgcn_queue_ptr: 4199 return legalizePreloadedArgIntrin(MI, MRI, B, 4200 AMDGPUFunctionArgInfo::QUEUE_PTR); 4201 case Intrinsic::amdgcn_implicit_buffer_ptr: 4202 return legalizePreloadedArgIntrin( 4203 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4204 case Intrinsic::amdgcn_dispatch_id: 4205 return legalizePreloadedArgIntrin(MI, MRI, B, 4206 AMDGPUFunctionArgInfo::DISPATCH_ID); 4207 case Intrinsic::amdgcn_fdiv_fast: 4208 return legalizeFDIVFastIntrin(MI, MRI, B); 4209 case Intrinsic::amdgcn_is_shared: 4210 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4211 case Intrinsic::amdgcn_is_private: 4212 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4213 case Intrinsic::amdgcn_wavefrontsize: { 4214 B.setInstr(MI); 4215 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4216 MI.eraseFromParent(); 4217 return true; 4218 } 4219 case 
Intrinsic::amdgcn_s_buffer_load: 4220 return legalizeSBufferLoad(MI, B, Observer); 4221 case Intrinsic::amdgcn_raw_buffer_store: 4222 case Intrinsic::amdgcn_struct_buffer_store: 4223 return legalizeBufferStore(MI, MRI, B, false, false); 4224 case Intrinsic::amdgcn_raw_buffer_store_format: 4225 case Intrinsic::amdgcn_struct_buffer_store_format: 4226 return legalizeBufferStore(MI, MRI, B, false, true); 4227 case Intrinsic::amdgcn_raw_tbuffer_store: 4228 case Intrinsic::amdgcn_struct_tbuffer_store: 4229 return legalizeBufferStore(MI, MRI, B, true, true); 4230 case Intrinsic::amdgcn_raw_buffer_load: 4231 case Intrinsic::amdgcn_struct_buffer_load: 4232 return legalizeBufferLoad(MI, MRI, B, false, false); 4233 case Intrinsic::amdgcn_raw_buffer_load_format: 4234 case Intrinsic::amdgcn_struct_buffer_load_format: 4235 return legalizeBufferLoad(MI, MRI, B, true, false); 4236 case Intrinsic::amdgcn_raw_tbuffer_load: 4237 case Intrinsic::amdgcn_struct_tbuffer_load: 4238 return legalizeBufferLoad(MI, MRI, B, true, true); 4239 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4240 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4241 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4242 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4243 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4244 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4245 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4246 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4247 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4248 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4249 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4250 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4251 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4252 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4253 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4254 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4255 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4256 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4257 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4258 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4259 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4260 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4261 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4262 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4263 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4264 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4265 return legalizeBufferAtomic(MI, B, IntrID); 4266 case Intrinsic::amdgcn_atomic_inc: 4267 return legalizeAtomicIncDec(MI, B, true); 4268 case Intrinsic::amdgcn_atomic_dec: 4269 return legalizeAtomicIncDec(MI, B, false); 4270 case Intrinsic::trap: 4271 return legalizeTrapIntrinsic(MI, MRI, B); 4272 case Intrinsic::debugtrap: 4273 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4274 default: { 4275 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4276 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4277 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4278 return true; 4279 } 4280 } 4281 4282 return true; 4283 } 4284
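// A compiled-out model of the buffer store dispatch in legalizeIntrinsic
// above: raw and struct flavors of the same intrinsic share one legalization
// routine and are distinguished only by the (IsTyped, IsFormat) flags passed
// to it. The enum, struct, and helper names are illustrative only.
#if 0
#include <cassert>

enum class BufferKind { Plain, Format, Typed };

struct BufferLegalizeFlags {
  bool IsTyped;
  bool IsFormat;
};

static BufferLegalizeFlags getBufferStoreFlags(BufferKind Kind) {
  switch (Kind) {
  case BufferKind::Plain:  return {false, false}; // *_buffer_store
  case BufferKind::Format: return {false, true};  // *_buffer_store_format
  case BufferKind::Typed:  return {true, true};   // *_tbuffer_store
  }
  return {false, false};
}

static void getBufferStoreFlagsExample() {
  // raw.tbuffer.store and struct.tbuffer.store both take the typed path.
  BufferLegalizeFlags F = getBufferStoreFlags(BufferKind::Typed);
  assert(F.IsTyped && F.IsFormat);
}
#endif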