1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPULegalizerInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "SIMachineFunctionInfo.h" 18 19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 20 #include "llvm/CodeGen/TargetOpcodes.h" 21 #include "llvm/CodeGen/ValueTypes.h" 22 #include "llvm/IR/DerivedTypes.h" 23 #include "llvm/IR/Type.h" 24 #include "llvm/Support/Debug.h" 25 26 using namespace llvm; 27 using namespace LegalizeActions; 28 using namespace LegalizeMutations; 29 using namespace LegalityPredicates; 30 31 32 static LegalityPredicate isMultiple32(unsigned TypeIdx, 33 unsigned MaxSize = 512) { 34 return [=](const LegalityQuery &Query) { 35 const LLT Ty = Query.Types[TypeIdx]; 36 const LLT EltTy = Ty.getScalarType(); 37 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 38 }; 39 } 40 41 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 42 return [=](const LegalityQuery &Query) { 43 const LLT Ty = Query.Types[TypeIdx]; 44 return Ty.isVector() && 45 Ty.getNumElements() % 2 != 0 && 46 Ty.getElementType().getSizeInBits() < 32; 47 }; 48 } 49 50 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 51 return [=](const LegalityQuery &Query) { 52 const LLT Ty = Query.Types[TypeIdx]; 53 const LLT EltTy = Ty.getElementType(); 54 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 55 }; 56 } 

// Build the AMDGPU legalization rule tables.
//
// NOTE: within each getActionDefinitionsBuilder(...) chain, rule order is
// significant — earlier clauses take precedence over later ones — so the
// clauses below must not be reordered.
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                                         const GCNTargetMachine &TM) {
  using namespace TargetOpcode;

  // Construct a pointer LLT for an address space using the target's pointer
  // width for that space.
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);
  const LLT V8S16 = LLT::vector(8, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  // Code pointers use the flat address space.
  const LLT CodePtr = FlatPtr;

  const LLT AddrSpaces[] = {
    GlobalPtr,
    ConstantPtr,
    LocalPtr,
    FlatPtr,
    PrivatePtr
  };

  setAction({G_BRCOND, S1}, Legal);

  // Integer arithmetic is only natively 32-bit; everything else is clamped
  // down to s32 and scalarized.
  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .scalarize(0);

  // Carry/borrow-producing and consuming add/sub: s32 value with an s1
  // carry-out/in.
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16});

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    // Round odd sub-32-bit-element vectors up to an even element count so
    // the result is 32-bit aligned.
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // FP arithmetic: s32/s64 always; s16 (and v2s16 with packed math) are
  // added below depending on subtarget features.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
    .legalFor({S32, S64})
    .scalarize(0);

  // G_GEP is legal for every address space with a matching-width index.
  for (LLT PtrTy : AddrSpaces) {
    LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
    setAction({G_GEP, PtrTy}, Legal);
    setAction({G_GEP, 1, IdxTy}, Legal);
  }

  // FIXME: When RegBankSelect inserts copies, it will only create new
  // registers with scalar types.  This means we can end up with
  // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer
  // operands.  In assert builds, the instruction selector will assert if it
  // sees a generic instruction which isn't legal, so we need to tell it that
  // scalar types are legal for pointer operands
  setAction({G_GEP, S64}, Legal);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64);
  // TODO: Scalarize

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);


  // Predicate comparing the bit widths of two type indices; used to pick the
  // widen/narrow direction for the int<->ptr conversions below.
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct({GlobalPtr, ConstantPtr, FlatPtr}, {S64})
    .legalForCartesianProduct({LocalPtr, PrivatePtr}, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer operand to the pointer's width.
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct({GlobalPtr, ConstantPtr, FlatPtr}, {S64})
    .legalForCartesianProduct({LocalPtr, PrivatePtr}, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer result to the pointer's width.
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();  // Handled in legalizeAddrSpaceCast below.
  }

  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    // Break wide extending loads down to 32-bit pieces.
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    // 96-bit vector accesses are not available before CI; split to v2s32.
    .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=, &ST](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          // XXX hasLoadX3
          return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);

        case 256:
        case 512:
          // TODO: constant loads
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // Sub-32-bit extending loads: legal from global/local/private (and flat
  // when available), lowered otherwise.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                   .legalForTypesWithMemSize({
                       {S32, GlobalPtr, 8},
                       {S32, GlobalPtr, 16},
                       {S32, LocalPtr, 8},
                       {S32, LocalPtr, 16},
                       {S32, PrivatePtr, 8},
                       {S32, PrivatePtr, 16}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemSize({{S32, FlatPtr, 8},
                                       {S32, FlatPtr, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S32, S64)
    .fewerElementsIf(
      [=](const LegalityQuery &Query) {
        if (Query.Types[1].isVector())
          return true;

        LLT Ty = Query.Types[0];

        // FIXME: Hack until odd splits handled
        return Ty.isVector() &&
          (Ty.getScalarSizeInBits() > 32 || Ty.getNumElements() % 2 != 0);
      },
      scalarize(0))
    // FIXME: Handle 16-bit vectors better
    .fewerElementsIf(
      [=](const LegalityQuery &Query) {
        return Query.Types[0].isVector() &&
               Query.Types[0].getElementType().getSizeInBits() < 32;},
      scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  // Vector element access: the vector operand is index 1 for extract and
  // index 0 for insert; the element index is always operand 2.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
            VecTy.getSizeInBits() <= 512 &&
            IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  // Reject extracts whose result type differs from the vector element type.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  // FIXME: Doesn't handle extract of illegal sizes.
  getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];
        const LLT &Ty1 = Query.Types[1];
        return (Ty0.getSizeInBits() % 16 == 0) &&
               (Ty1.getSizeInBits() % 16 == 0);
      })
    .widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty1 = Query.Types[1];
        return (Ty1.getScalarSizeInBits() < 16);
      },
      LegalizeMutations::widenScalarOrEltToNextPow2(1, 16));

  // TODO: vectors of pointers
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    // FIXME: Sort of a hack to make progress on other legalizations.
    .legalIf([=](const LegalityQuery &Query) {
      return Query.Types[0].getScalarSizeInBits() <= 32 ||
             Query.Types[0].getScalarSizeInBits() == 64;
    });

  // TODO: Support any combination of v2s32
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalFor({{V4S32, V2S32},
               {V8S32, V2S32},
               {V8S32, V4S32},
               {V4S64, V2S64},
               {V4S16, V2S16},
               {V8S16, V2S16},
               {V8S16, V4S16},
               {LLT::vector(4, LocalPtr), LLT::vector(2, LocalPtr)},
               {LLT::vector(4, PrivatePtr), LLT::vector(2, PrivatePtr)}});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    // True for vectors whose element size is outside [8, 64] bits or is not
    // a power of 2 — such vectors are broken down to scalars below.
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

/// Dispatch point for opcodes marked .custom() above.  Returns true if the
/// instruction was legalized (or erased), false if it is not handled here.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

/// Emit code producing the 32-bit high half (aperture base) of the flat
/// address range for the given segment address space (LOCAL or PRIVATE).
/// On subtargets with aperture registers this reads the value via
/// S_GETREG_B32; otherwise it is loaded from the queue descriptor.
/// Returns the virtual register holding the s32 aperture value.
unsigned AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id/offset/width fields into the S_GETREG_B32 immediate.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    unsigned ShiftAmt = MRI.createGenericVirtualRegister(S32);
    unsigned ApertureReg = MRI.createGenericVirtualRegister(S32);
    unsigned GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Shift the field read by getreg into the high bits of the aperture.
    MIRBuilder.buildConstant(ShiftAmt, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt);

    return ApertureReg;
  }

  unsigned QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  unsigned LoadResult = MRI.createGenericVirtualRegister(S32);
  unsigned LoadAddr = AMDGPU::NoRegister;

  // Load the 32-bit aperture value from queue_ptr + StructOffset.
  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

/// Custom-legalize G_ADDRSPACE_CAST.
///
/// No-op casts become G_BITCAST.  Flat -> local/private extracts the low
/// 32 bits; local/private -> flat merges the 32-bit pointer with the segment
/// aperture high half.  Both directions select the target's null value when
/// the source pointer equals the source space's null value.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same representation in both spaces; just reinterpret the bits.
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    unsigned SegmentNullReg = MRI.createGenericVirtualRegister(DstTy);
    unsigned FlatNullReg = MRI.createGenericVirtualRegister(SrcTy);

    MIRBuilder.buildConstant(SegmentNullReg, NullVal);
    MIRBuilder.buildConstant(FlatNullReg, 0);

    unsigned PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    // Null maps to null; everything else keeps its low half.
    unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNullReg);
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNullReg);

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  unsigned FlatNullReg = MRI.createGenericVirtualRegister(DstTy);
  unsigned SegmentNullReg = MRI.createGenericVirtualRegister(SrcTy);
  MIRBuilder.buildConstant(SegmentNullReg, TM.getNullPointerValue(SrcAS));
  MIRBuilder.buildConstant(FlatNullReg, TM.getNullPointerValue(DestAS));

  unsigned ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNullReg);

  unsigned BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  unsigned SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNullReg);

  MI.eraseFromParent();
  return true;
}