//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


// True if the type at \p TypeIdx is at most \p MaxSize bits wide and its
// scalar (or element) size is a multiple of 32.
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

// True if the type at \p TypeIdx is a vector with an odd number of
// sub-32-bit elements.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

// Mutation that widens the vector at \p TypeIdx by a single element.
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

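// Build the legalization rules for all generic opcodes. The rules are keyed
// off the subtarget, so e.g. 16-bit and packed 16-bit operations are only
// marked legal when the subtarget actually provides those instructions.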
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                                         const GCNTargetMachine &TM) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);
  const LLT V8S16 = LLT::vector(8, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  setAction({G_BRCOND, S1}, Legal);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

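  // G_FSQRT is only legal at the FP widths the subtarget supports natively;
  // other scalar sizes are clamped into that range and vectors are
  // scalarized.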
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
    .legalFor({S32, S64})
    .scalarize(0);


  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64);
  // TODO: Scalarize

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);


  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

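  // Loads and stores: scalars wider than their memory size are narrowed to
  // 32 bits, 96-bit vector accesses are split in half on subtargets older
  // than Sea Islands, and the remaining cases are checked against the memory
  // sizes the hardware handles in the legalIf predicate below.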
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=, &ST](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          // XXX hasLoadX3
          return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);

        case 256:
        case 512:
          // TODO: constant loads
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({
                         {S32, GlobalPtr, 8, 8},
                         {S32, GlobalPtr, 16, 8},
                         {S32, LocalPtr, 8, 8},
                         {S32, LocalPtr, 16, 8},
                         {S32, PrivatePtr, 8, 8},
                         {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S32, S64)
    .fewerElementsIf(
      [=](const LegalityQuery &Query) {
        if (Query.Types[1].isVector())
          return true;

        LLT Ty = Query.Types[0];

        // FIXME: Hack until odd splits handled
        return Ty.isVector() &&
               (Ty.getScalarSizeInBits() > 32 || Ty.getNumElements() % 2 != 0);
      },
      scalarize(0))
    // FIXME: Handle 16-bit vectors better
    .fewerElementsIf(
      [=](const LegalityQuery &Query) {
        return Query.Types[0].isVector() &&
               Query.Types[0].getElementType().getSizeInBits() < 32;
      },
      scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                   .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

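  // G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT share the same rules; only
  // the operand position of the vector and element types differs between the
  // two opcodes.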
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  // FIXME: Doesn't handle extract of illegal sizes.
  getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];
        const LLT &Ty1 = Query.Types[1];
        return (Ty0.getSizeInBits() % 16 == 0) &&
               (Ty1.getSizeInBits() % 16 == 0);
      })
    .widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty1 = Query.Types[1];
        return (Ty1.getScalarSizeInBits() < 16);
      },
      LegalizeMutations::widenScalarOrEltToNextPow2(1, 16));

  // TODO: vectors of pointers
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    // FIXME: Sort of a hack to make progress on other legalizations.
    .legalIf([=](const LegalityQuery &Query) {
        return Query.Types[0].getScalarSizeInBits() <= 32 ||
               Query.Types[0].getScalarSizeInBits() == 64;
      });

  // TODO: Support any combination of v2s32
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalFor({{V4S32, V2S32},
               {V8S32, V2S32},
               {V8S32, V4S32},
               {V4S64, V2S64},
               {V4S16, V2S16},
               {V8S16, V2S16},
               {V8S16, V4S16},
               {LLT::vector(4, LocalPtr), LLT::vector(2, LocalPtr)},
               {LLT::vector(4, PrivatePtr), LLT::vector(2, PrivatePtr)}});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

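// Return a register holding the 32-bit high half (the aperture) of the flat
// address range for the LOCAL or PRIVATE address space \p AS, read either
// from the hardware aperture registers or from the queue pointer.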
unsigned AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    unsigned ShiftAmt = MRI.createGenericVirtualRegister(S32);
    unsigned ApertureReg = MRI.createGenericVirtualRegister(S32);
    unsigned GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    MIRBuilder.buildConstant(ShiftAmt, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt);

    return ApertureReg;
  }

  unsigned QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  unsigned LoadResult = MRI.createGenericVirtualRegister(S32);
  unsigned LoadAddr = AMDGPU::NoRegister;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

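// Lower G_ADDRSPACE_CAST. No-op casts are turned into G_BITCAST. Casts from
// flat to local/private extract the low 32 bits and select against the
// segment null value; casts the other way rebuild the flat pointer from the
// low 32 bits and the segment aperture, again guarded by a null check.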
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    unsigned SegmentNullReg = MRI.createGenericVirtualRegister(DstTy);
    unsigned FlatNullReg = MRI.createGenericVirtualRegister(SrcTy);

    MIRBuilder.buildConstant(SegmentNullReg, NullVal);
    MIRBuilder.buildConstant(FlatNullReg, 0);

    unsigned PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNullReg);
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNullReg);

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  unsigned FlatNullReg = MRI.createGenericVirtualRegister(DstTy);
  unsigned SegmentNullReg = MRI.createGenericVirtualRegister(SrcTy);
  MIRBuilder.buildConstant(SegmentNullReg, TM.getNullPointerValue(SrcAS));
  MIRBuilder.buildConstant(FlatNullReg, TM.getNullPointerValue(DestAS));

  unsigned ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNullReg);

  unsigned BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  unsigned SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNullReg);

  MI.eraseFromParent();
  return true;
}