1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
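// (The read2 offsets are expressed in units of the element size, 4 bytes for
// ds_read2_b32, so byte offsets 16 and 32 become offset0:4 and offset1:8.)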
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
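// (In this example the two bases differ by 0x1800 - 0x1000 = 0x800 = 2048
// bytes, which fits in the instruction's immediate offset field, so the second
// base computation becomes redundant.)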
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads has offsets that are too large to fit in the 8-bit
55 // offset fields, but are close enough together to fit in 8 bits, we can add
56 // to the base pointer and use the new reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66
67 using namespace llvm;
68
69 #define DEBUG_TYPE "si-load-store-opt"
70
71 namespace {
72 enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 BUFFER_LOAD,
78 BUFFER_STORE,
79 MIMG,
80 TBUFFER_LOAD,
81 TBUFFER_STORE,
82 GLOBAL_LOAD_SADDR,
83 GLOBAL_STORE_SADDR,
84 FLAT_LOAD,
85 FLAT_STORE,
86 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
87 GLOBAL_STORE // any CombineInfo, they are only ever returned by
88 // getCommonInstClass.
89 };
90
91 struct AddressRegs {
92 unsigned char NumVAddrs = 0;
93 bool SBase = false;
94 bool SRsrc = false;
95 bool SOffset = false;
96 bool SAddr = false;
97 bool VAddr = false;
98 bool Addr = false;
99 bool SSamp = false;
100 };
101
102 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
103 const unsigned MaxAddressRegs = 12 + 1 + 1;
104
105 class SILoadStoreOptimizer : public MachineFunctionPass {
106 struct CombineInfo {
107 MachineBasicBlock::iterator I;
108 unsigned EltSize;
109 unsigned Offset;
110 unsigned Width;
111 unsigned Format;
112 unsigned BaseOff;
113 unsigned DMask;
114 InstClassEnum InstClass;
115 unsigned CPol = 0;
116 bool IsAGPR;
117 bool UseST64;
118 int AddrIdx[MaxAddressRegs];
119 const MachineOperand *AddrReg[MaxAddressRegs];
120 unsigned NumAddresses;
121 unsigned Order;
122
123 bool hasSameBaseAddress(const MachineInstr &MI) {
124 for (unsigned i = 0; i < NumAddresses; i++) {
125 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
126
127 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
128 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
129 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
130 return false;
131 }
132 continue;
133 }
134
135 // Check same base pointer. Be careful of subregisters, which can occur
136 // with vectors of pointers.
137 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
138 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
139 return false;
140 }
141 }
142 return true;
143 }
144
145 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
146 for (unsigned i = 0; i < NumAddresses; ++i) {
147 const MachineOperand *AddrOp = AddrReg[i];
148 // Immediates are always OK.
149 if (AddrOp->isImm())
150 continue;
151
152 // Don't try to merge addresses that aren't either immediates or registers.
153 // TODO: Should be possible to merge FrameIndexes and maybe some other
154 // non-register operands.
155 if (!AddrOp->isReg())
156 return false;
157
158 // TODO: We should be able to merge physical reg addresses.
159 if (AddrOp->getReg().isPhysical())
160 return false;
161
162 // If an address has only one use then there will be no other
163 // instructions with the same address, so we can't merge this one.
164 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
165 return false;
166 }
167 return true;
168 }
169
170 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
171
172 // Compare by pointer order.
173 bool operator<(const CombineInfo& Other) const {
174 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
175 }
176 };
177
178 struct BaseRegisters {
179 Register LoReg;
180 Register HiReg;
181
182 unsigned LoSubReg = 0;
183 unsigned HiSubReg = 0;
184 };
185
186 struct MemAddress {
187 BaseRegisters Base;
188 int64_t Offset = 0;
189 };
190
191 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
192
193 private:
194 const GCNSubtarget *STM = nullptr;
195 const SIInstrInfo *TII = nullptr;
196 const SIRegisterInfo *TRI = nullptr;
197 MachineRegisterInfo *MRI = nullptr;
198 AliasAnalysis *AA = nullptr;
199 bool OptimizeAgain;
200
201 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
202 const DenseSet<Register> &ARegUses,
203 const MachineInstr &A, const MachineInstr &B) const;
204 static bool dmasksCanBeCombined(const CombineInfo &CI,
205 const SIInstrInfo &TII,
206 const CombineInfo &Paired);
207 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
208 CombineInfo &Paired, bool Modify = false);
209 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
210 const CombineInfo &Paired);
211 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
212 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
213 const CombineInfo &Paired);
214 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
215 const CombineInfo &Paired);
216 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
217
218 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
219
220 unsigned read2Opcode(unsigned EltSize) const;
221 unsigned read2ST64Opcode(unsigned EltSize) const;
222 MachineBasicBlock::iterator
223 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
224 MachineBasicBlock::iterator InsertBefore);
225
226 unsigned write2Opcode(unsigned EltSize) const;
227 unsigned write2ST64Opcode(unsigned EltSize) const;
228 MachineBasicBlock::iterator
229 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
230 MachineBasicBlock::iterator InsertBefore);
231 MachineBasicBlock::iterator
232 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
233 MachineBasicBlock::iterator InsertBefore);
234 MachineBasicBlock::iterator
235 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
236 MachineBasicBlock::iterator InsertBefore);
237 MachineBasicBlock::iterator
238 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
239 MachineBasicBlock::iterator InsertBefore);
240 MachineBasicBlock::iterator
241 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
242 MachineBasicBlock::iterator InsertBefore);
243 MachineBasicBlock::iterator
244 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245 MachineBasicBlock::iterator InsertBefore);
246 MachineBasicBlock::iterator
247 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248 MachineBasicBlock::iterator InsertBefore);
249 MachineBasicBlock::iterator
250 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
251 MachineBasicBlock::iterator InsertBefore);
252 MachineBasicBlock::iterator
253 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
254 MachineBasicBlock::iterator InsertBefore);
255
256 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
257 int32_t NewOffset) const;
258 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
259 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
260 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
261 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
262 /// Promotes constant offset to the immediate by adjusting the base. It
263 /// tries to use a base from the nearby instructions that allows it to have
264 /// a 13-bit constant offset which gets promoted to the immediate.
265 bool promoteConstantOffsetToImm(MachineInstr &CI,
266 MemInfoMap &Visited,
267 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
268 void addInstToMergeableList(const CombineInfo &CI,
269 std::list<std::list<CombineInfo> > &MergeableInsts) const;
270
271 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
272 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
273 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
274 std::list<std::list<CombineInfo>> &MergeableInsts) const;
275
276 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
277 const CombineInfo &Paired);
278
279 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
280 const CombineInfo &Paired);
281
282 public:
283 static char ID;
284
285 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
286 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
287 }
288
289 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
290 bool &OptimizeListAgain);
291 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
292
293 bool runOnMachineFunction(MachineFunction &MF) override;
294
295 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
296
297 void getAnalysisUsage(AnalysisUsage &AU) const override {
298 AU.setPreservesCFG();
299 AU.addRequired<AAResultsWrapperPass>();
300
301 MachineFunctionPass::getAnalysisUsage(AU);
302 }
303
304 MachineFunctionProperties getRequiredProperties() const override {
305 return MachineFunctionProperties()
306 .set(MachineFunctionProperties::Property::IsSSA);
307 }
308 };
309
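/// Returns the number of elements (dwords for most opcodes handled here, or
/// enabled DMask components for image loads) accessed by \p MI, or 0 if the
/// opcode is not handled by this pass.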
310 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
311 const unsigned Opc = MI.getOpcode();
312
313 if (TII.isMUBUF(Opc)) {
314 // FIXME: Handle d16 correctly
315 return AMDGPU::getMUBUFElements(Opc);
316 }
317 if (TII.isMIMG(MI)) {
318 uint64_t DMaskImm =
319 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
320 return countPopulation(DMaskImm);
321 }
322 if (TII.isMTBUF(Opc)) {
323 return AMDGPU::getMTBUFElements(Opc);
324 }
325
326 switch (Opc) {
327 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
328 case AMDGPU::GLOBAL_LOAD_DWORD:
329 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
330 case AMDGPU::GLOBAL_STORE_DWORD:
331 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
332 case AMDGPU::FLAT_LOAD_DWORD:
333 case AMDGPU::FLAT_STORE_DWORD:
334 return 1;
335 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
336 case AMDGPU::GLOBAL_LOAD_DWORDX2:
337 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
338 case AMDGPU::GLOBAL_STORE_DWORDX2:
339 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
340 case AMDGPU::FLAT_LOAD_DWORDX2:
341 case AMDGPU::FLAT_STORE_DWORDX2:
342 return 2;
343 case AMDGPU::GLOBAL_LOAD_DWORDX3:
344 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
345 case AMDGPU::GLOBAL_STORE_DWORDX3:
346 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
347 case AMDGPU::FLAT_LOAD_DWORDX3:
348 case AMDGPU::FLAT_STORE_DWORDX3:
349 return 3;
350 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
351 case AMDGPU::GLOBAL_LOAD_DWORDX4:
352 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
353 case AMDGPU::GLOBAL_STORE_DWORDX4:
354 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
355 case AMDGPU::FLAT_LOAD_DWORDX4:
356 case AMDGPU::FLAT_STORE_DWORDX4:
357 return 4;
358 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
359 return 8;
360 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
361 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
362 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
363 case AMDGPU::DS_WRITE_B32_gfx9:
364 return 1;
365 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
366 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
367 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
368 case AMDGPU::DS_WRITE_B64_gfx9:
369 return 2;
370 default:
371 return 0;
372 }
373 }
374
375 /// Maps instruction opcode to enum InstClassEnum.
376 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
377 switch (Opc) {
378 default:
379 if (TII.isMUBUF(Opc)) {
380 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
381 default:
382 return UNKNOWN;
383 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
384 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
385 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
386 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
387 return BUFFER_LOAD;
388 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
389 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
390 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
391 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
392 return BUFFER_STORE;
393 }
394 }
395 if (TII.isMIMG(Opc)) {
396 // Ignore instructions encoded without vaddr.
397 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
398 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
399 return UNKNOWN;
400 // Ignore BVH instructions
401 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
402 return UNKNOWN;
403 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
404 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
405 TII.isGather4(Opc))
406 return UNKNOWN;
407 return MIMG;
408 }
409 if (TII.isMTBUF(Opc)) {
410 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
411 default:
412 return UNKNOWN;
413 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
414 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
415 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
416 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
417 return TBUFFER_LOAD;
418 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
419 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
420 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
421 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
422 return TBUFFER_STORE;
423 }
424 }
425 return UNKNOWN;
426 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
427 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
428 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
429 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
430 return S_BUFFER_LOAD_IMM;
431 case AMDGPU::DS_READ_B32:
432 case AMDGPU::DS_READ_B32_gfx9:
433 case AMDGPU::DS_READ_B64:
434 case AMDGPU::DS_READ_B64_gfx9:
435 return DS_READ;
436 case AMDGPU::DS_WRITE_B32:
437 case AMDGPU::DS_WRITE_B32_gfx9:
438 case AMDGPU::DS_WRITE_B64:
439 case AMDGPU::DS_WRITE_B64_gfx9:
440 return DS_WRITE;
441 case AMDGPU::GLOBAL_LOAD_DWORD:
442 case AMDGPU::GLOBAL_LOAD_DWORDX2:
443 case AMDGPU::GLOBAL_LOAD_DWORDX3:
444 case AMDGPU::GLOBAL_LOAD_DWORDX4:
445 case AMDGPU::FLAT_LOAD_DWORD:
446 case AMDGPU::FLAT_LOAD_DWORDX2:
447 case AMDGPU::FLAT_LOAD_DWORDX3:
448 case AMDGPU::FLAT_LOAD_DWORDX4:
449 return FLAT_LOAD;
450 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
451 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
452 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
453 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
454 return GLOBAL_LOAD_SADDR;
455 case AMDGPU::GLOBAL_STORE_DWORD:
456 case AMDGPU::GLOBAL_STORE_DWORDX2:
457 case AMDGPU::GLOBAL_STORE_DWORDX3:
458 case AMDGPU::GLOBAL_STORE_DWORDX4:
459 case AMDGPU::FLAT_STORE_DWORD:
460 case AMDGPU::FLAT_STORE_DWORDX2:
461 case AMDGPU::FLAT_STORE_DWORDX3:
462 case AMDGPU::FLAT_STORE_DWORDX4:
463 return FLAT_STORE;
464 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
465 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
466 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
467 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
468 return GLOBAL_STORE_SADDR;
469 }
470 }
471
472 /// Determines instruction subclass from opcode. Only instructions
473 /// of the same subclass can be merged together. The merged instruction may have
474 /// a different subclass but must have the same class.
475 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
476 switch (Opc) {
477 default:
478 if (TII.isMUBUF(Opc))
479 return AMDGPU::getMUBUFBaseOpcode(Opc);
480 if (TII.isMIMG(Opc)) {
481 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
482 assert(Info);
483 return Info->BaseOpcode;
484 }
485 if (TII.isMTBUF(Opc))
486 return AMDGPU::getMTBUFBaseOpcode(Opc);
487 return -1;
488 case AMDGPU::DS_READ_B32:
489 case AMDGPU::DS_READ_B32_gfx9:
490 case AMDGPU::DS_READ_B64:
491 case AMDGPU::DS_READ_B64_gfx9:
492 case AMDGPU::DS_WRITE_B32:
493 case AMDGPU::DS_WRITE_B32_gfx9:
494 case AMDGPU::DS_WRITE_B64:
495 case AMDGPU::DS_WRITE_B64_gfx9:
496 return Opc;
497 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
498 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
500 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
501 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
502 case AMDGPU::GLOBAL_LOAD_DWORD:
503 case AMDGPU::GLOBAL_LOAD_DWORDX2:
504 case AMDGPU::GLOBAL_LOAD_DWORDX3:
505 case AMDGPU::GLOBAL_LOAD_DWORDX4:
506 case AMDGPU::FLAT_LOAD_DWORD:
507 case AMDGPU::FLAT_LOAD_DWORDX2:
508 case AMDGPU::FLAT_LOAD_DWORDX3:
509 case AMDGPU::FLAT_LOAD_DWORDX4:
510 return AMDGPU::FLAT_LOAD_DWORD;
511 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
512 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
513 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
514 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
515 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
516 case AMDGPU::GLOBAL_STORE_DWORD:
517 case AMDGPU::GLOBAL_STORE_DWORDX2:
518 case AMDGPU::GLOBAL_STORE_DWORDX3:
519 case AMDGPU::GLOBAL_STORE_DWORDX4:
520 case AMDGPU::FLAT_STORE_DWORD:
521 case AMDGPU::FLAT_STORE_DWORDX2:
522 case AMDGPU::FLAT_STORE_DWORDX3:
523 case AMDGPU::FLAT_STORE_DWORDX4:
524 return AMDGPU::FLAT_STORE_DWORD;
525 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
526 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
527 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
528 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
529 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
530 }
531 }
532
533 // GLOBAL loads and stores are classified as FLAT initially. If both combined
534 // instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
535 // If either or both instructions are non-segment-specific FLAT, the resulting
536 // combined operation will be FLAT, potentially promoting one of the GLOBAL
537 // operations to FLAT.
538 // For other instructions, return the original class unmodified.
539 InstClassEnum
540 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
541 const CombineInfo &Paired) {
542 assert(CI.InstClass == Paired.InstClass);
543
544 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
545 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
546 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
547
548 return CI.InstClass;
549 }
550
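/// Describes which address operands (vaddr, srsrc, soffset, saddr, ...) the
/// given opcode uses, so that the addresses of two instructions can be
/// collected and compared.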
551 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
552 AddressRegs Result;
553
554 if (TII.isMUBUF(Opc)) {
555 if (AMDGPU::getMUBUFHasVAddr(Opc))
556 Result.VAddr = true;
557 if (AMDGPU::getMUBUFHasSrsrc(Opc))
558 Result.SRsrc = true;
559 if (AMDGPU::getMUBUFHasSoffset(Opc))
560 Result.SOffset = true;
561
562 return Result;
563 }
564
565 if (TII.isMIMG(Opc)) {
566 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
567 if (VAddr0Idx >= 0) {
568 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
569 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
570 } else {
571 Result.VAddr = true;
572 }
573 Result.SRsrc = true;
574 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
575 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
576 Result.SSamp = true;
577
578 return Result;
579 }
580 if (TII.isMTBUF(Opc)) {
581 if (AMDGPU::getMTBUFHasVAddr(Opc))
582 Result.VAddr = true;
583 if (AMDGPU::getMTBUFHasSrsrc(Opc))
584 Result.SRsrc = true;
585 if (AMDGPU::getMTBUFHasSoffset(Opc))
586 Result.SOffset = true;
587
588 return Result;
589 }
590
591 switch (Opc) {
592 default:
593 return Result;
594 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
595 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
596 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
597 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
598 Result.SBase = true;
599 return Result;
600 case AMDGPU::DS_READ_B32:
601 case AMDGPU::DS_READ_B64:
602 case AMDGPU::DS_READ_B32_gfx9:
603 case AMDGPU::DS_READ_B64_gfx9:
604 case AMDGPU::DS_WRITE_B32:
605 case AMDGPU::DS_WRITE_B64:
606 case AMDGPU::DS_WRITE_B32_gfx9:
607 case AMDGPU::DS_WRITE_B64_gfx9:
608 Result.Addr = true;
609 return Result;
610 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
611 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
612 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
613 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
614 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
615 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
616 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
617 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
618 Result.SAddr = true;
619 LLVM_FALLTHROUGH;
620 case AMDGPU::GLOBAL_LOAD_DWORD:
621 case AMDGPU::GLOBAL_LOAD_DWORDX2:
622 case AMDGPU::GLOBAL_LOAD_DWORDX3:
623 case AMDGPU::GLOBAL_LOAD_DWORDX4:
624 case AMDGPU::GLOBAL_STORE_DWORD:
625 case AMDGPU::GLOBAL_STORE_DWORDX2:
626 case AMDGPU::GLOBAL_STORE_DWORDX3:
627 case AMDGPU::GLOBAL_STORE_DWORDX4:
628 case AMDGPU::FLAT_LOAD_DWORD:
629 case AMDGPU::FLAT_LOAD_DWORDX2:
630 case AMDGPU::FLAT_LOAD_DWORDX3:
631 case AMDGPU::FLAT_LOAD_DWORDX4:
632 case AMDGPU::FLAT_STORE_DWORD:
633 case AMDGPU::FLAT_STORE_DWORDX2:
634 case AMDGPU::FLAT_STORE_DWORDX3:
635 case AMDGPU::FLAT_STORE_DWORDX4:
636 Result.VAddr = true;
637 return Result;
638 }
639 }
640
641 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
642 const SILoadStoreOptimizer &LSO) {
643 I = MI;
644 unsigned Opc = MI->getOpcode();
645 InstClass = getInstClass(Opc, *LSO.TII);
646
647 if (InstClass == UNKNOWN)
648 return;
649
650 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
651
652 switch (InstClass) {
653 case DS_READ:
654 EltSize =
655 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
656 : 4;
657 break;
658 case DS_WRITE:
659 EltSize =
660 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
661 : 4;
662 break;
663 case S_BUFFER_LOAD_IMM:
664 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
665 break;
666 default:
667 EltSize = 4;
668 break;
669 }
670
671 if (InstClass == MIMG) {
672 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
673 // Offset is not considered for MIMG instructions.
674 Offset = 0;
675 } else {
676 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
677 Offset = I->getOperand(OffsetIdx).getImm();
678 }
679
680 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
681 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
682
683 Width = getOpcodeWidth(*I, *LSO.TII);
684
685 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
686 Offset &= 0xffff;
687 } else if (InstClass != MIMG) {
688 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
689 }
690
691 AddressRegs Regs = getRegs(Opc, *LSO.TII);
692
693 NumAddresses = 0;
694 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
695 AddrIdx[NumAddresses++] =
696 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
697 if (Regs.Addr)
698 AddrIdx[NumAddresses++] =
699 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
700 if (Regs.SBase)
701 AddrIdx[NumAddresses++] =
702 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
703 if (Regs.SRsrc)
704 AddrIdx[NumAddresses++] =
705 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
706 if (Regs.SOffset)
707 AddrIdx[NumAddresses++] =
708 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
709 if (Regs.SAddr)
710 AddrIdx[NumAddresses++] =
711 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
712 if (Regs.VAddr)
713 AddrIdx[NumAddresses++] =
714 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
715 if (Regs.SSamp)
716 AddrIdx[NumAddresses++] =
717 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
718 assert(NumAddresses <= MaxAddressRegs);
719
720 for (unsigned J = 0; J < NumAddresses; J++)
721 AddrReg[J] = &I->getOperand(AddrIdx[J]);
722 }
723
724 } // end anonymous namespace.
725
726 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
727 "SI Load Store Optimizer", false, false)
728 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
729 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
730 false, false)
731
732 char SILoadStoreOptimizer::ID = 0;
733
734 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
735
736 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
737 return new SILoadStoreOptimizer();
738 }
739
740 static void addDefsUsesToList(const MachineInstr &MI,
741 DenseSet<Register> &RegDefs,
742 DenseSet<Register> &RegUses) {
743 for (const auto &Op : MI.operands()) {
744 if (!Op.isReg())
745 continue;
746 if (Op.isDef())
747 RegDefs.insert(Op.getReg());
748 if (Op.readsReg())
749 RegUses.insert(Op.getReg());
750 }
751 }
752
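// Returns true if instruction \p B can be reordered across instruction \p A:
// the two must not alias through memory (when at least one of them stores) and
// must not have conflicting register defs/uses. \p ARegDefs and \p ARegUses
// are A's defs and uses, precomputed by addDefsUsesToList.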
753 bool SILoadStoreOptimizer::canSwapInstructions(
754 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
755 const MachineInstr &A, const MachineInstr &B) const {
756 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
757 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
758 return false;
759 for (const auto &BOp : B.operands()) {
760 if (!BOp.isReg())
761 continue;
762 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
763 return false;
764 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
765 return false;
766 }
767 return true;
768 }
769
770 // Given that \p CI and \p Paired are adjacent memory operations produce a new
771 // MMO for the combined operation with a new access size.
772 MachineMemOperand *
773 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
774 const CombineInfo &Paired) {
775 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
776 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
777
778 unsigned Size = MMOa->getSize() + MMOb->getSize();
779
780 // A base pointer for the combined operation is the same as the leading
781 // operation's pointer.
782 if (Paired < CI)
783 std::swap(MMOa, MMOb);
784
785 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
786 // If merging FLAT and GLOBAL set address space to FLAT.
787 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
788 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
789
790 MachineFunction *MF = CI.I->getMF();
791 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
792 }
793
794 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
795 const SIInstrInfo &TII,
796 const CombineInfo &Paired) {
797 assert(CI.InstClass == MIMG);
798
799 // Ignore instructions with tfe/lwe set.
800 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
801 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
802
803 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
804 return false;
805
806 // Check other optional immediate operands for equality.
807 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
808 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
809 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
810
811 for (auto op : OperandsToMatch) {
812 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
813 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
814 return false;
815 if (Idx != -1 &&
816 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
817 return false;
818 }
819
820 // Check DMask for overlaps.
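// For example (illustrative): dmasks 0b0011 and 0b1100 select disjoint,
// ordered component ranges and can be merged, whereas 0b0101 and 0b1010
// interleave and are rejected by the check below.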
821 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
822 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
823
824 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
825 if ((1u << AllowedBitsForMin) <= MinMask)
826 return false;
827
828 return true;
829 }
830
831 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
832 unsigned ComponentCount,
833 const GCNSubtarget &STI) {
834 if (ComponentCount > 4)
835 return 0;
836
837 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
838 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
839 if (!OldFormatInfo)
840 return 0;
841
842 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
843 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
844 ComponentCount,
845 OldFormatInfo->NumFormat, STI);
846
847 if (!NewFormatInfo)
848 return 0;
849
850 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
851 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
852
853 return NewFormatInfo->Format;
854 }
855
856 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
857 // highest power of two. Note that the result is well defined for all inputs
858 // including corner cases like:
859 // - if Lo == Hi, return that value
860 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
861 // - if Lo > Hi, return 0 (as if the range wrapped around)
862 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
863 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
864 }
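// A worked example of the bit trick above (illustrative only): for Lo = 0x65
// and Hi = 0x7F, (Lo - 1) ^ Hi = 0x64 ^ 0x7F = 0x1B, countLeadingZeros = 27,
// so the mask keeps the top 28 bits and Hi & mask = 0x70, the only value in
// [0x65, 0x7F] aligned to 16.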
865
866 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
867 const GCNSubtarget &STI,
868 CombineInfo &Paired,
869 bool Modify) {
870 assert(CI.InstClass != MIMG);
871
872 // XXX - Would the same offset be OK? Is there any reason this would happen or
873 // be useful?
874 if (CI.Offset == Paired.Offset)
875 return false;
876
877 // This won't be valid if the offset isn't aligned.
878 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
879 return false;
880
881 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
882
883 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
884 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
885 if (!Info0)
886 return false;
887 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
888 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
889 if (!Info1)
890 return false;
891
892 if (Info0->BitsPerComp != Info1->BitsPerComp ||
893 Info0->NumFormat != Info1->NumFormat)
894 return false;
895
896 // TODO: Should be possible to support more formats, but if format loads
897 // are not dword-aligned, the merged load might not be valid.
898 if (Info0->BitsPerComp != 32)
899 return false;
900
901 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
902 return false;
903 }
904
905 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
906 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
907 CI.UseST64 = false;
908 CI.BaseOff = 0;
909
910 // Handle all non-DS instructions.
911 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
912 return (EltOffset0 + CI.Width == EltOffset1 ||
913 EltOffset1 + Paired.Width == EltOffset0) &&
914 CI.CPol == Paired.CPol;
915 }
916
917 // If the offset in elements doesn't fit in 8 bits, we might be able to use
918 // the stride 64 versions.
919 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
920 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
921 if (Modify) {
922 CI.Offset = EltOffset0 / 64;
923 Paired.Offset = EltOffset1 / 64;
924 CI.UseST64 = true;
925 }
926 return true;
927 }
928
929 // Check if the new offsets fit in the reduced 8-bit range.
930 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
931 if (Modify) {
932 CI.Offset = EltOffset0;
933 Paired.Offset = EltOffset1;
934 }
935 return true;
936 }
937
938 // Try to shift base address to decrease offsets.
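// Illustrative example: two ds_read_b32 at byte offsets 4096 and 4100 have
// element offsets 1024 and 1025; rebasing by BaseOff = 1024 elements (4096
// bytes) leaves offsets 0 and 1, which fit in the 8-bit fields.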
939 uint32_t Min = std::min(EltOffset0, EltOffset1);
940 uint32_t Max = std::max(EltOffset0, EltOffset1);
941
942 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
943 if (((Max - Min) & ~Mask) == 0) {
944 if (Modify) {
945 // From the range of values we could use for BaseOff, choose the one that
946 // is aligned to the highest power of two, to maximise the chance that
947 // the same offset can be reused for other load/store pairs.
948 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
949 // Copy the low bits of the offsets, so that when we adjust them by
950 // subtracting BaseOff they will be multiples of 64.
951 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
952 CI.BaseOff = BaseOff * CI.EltSize;
953 CI.Offset = (EltOffset0 - BaseOff) / 64;
954 Paired.Offset = (EltOffset1 - BaseOff) / 64;
955 CI.UseST64 = true;
956 }
957 return true;
958 }
959
960 if (isUInt<8>(Max - Min)) {
961 if (Modify) {
962 // From the range of values we could use for BaseOff, choose the one that
963 // is aligned to the highest power of two, to maximise the chance that
964 // the same offset can be reused for other load/store pairs.
965 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
966 CI.BaseOff = BaseOff * CI.EltSize;
967 CI.Offset = EltOffset0 - BaseOff;
968 Paired.Offset = EltOffset1 - BaseOff;
969 }
970 return true;
971 }
972
973 return false;
974 }
975
976 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
977 const CombineInfo &CI,
978 const CombineInfo &Paired) {
979 const unsigned Width = (CI.Width + Paired.Width);
980 switch (CI.InstClass) {
981 default:
982 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
983 case S_BUFFER_LOAD_IMM:
984 switch (Width) {
985 default:
986 return false;
987 case 2:
988 case 4:
989 case 8:
990 return true;
991 }
992 }
993 }
994
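/// Returns the register class of \p MI's data operand (vdst, vdata, data0,
/// sdst or sdata), or nullptr if none is present.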
995 const TargetRegisterClass *
996 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
997 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
998 return TRI->getRegClassForReg(*MRI, Dst->getReg());
999 }
1000 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1001 return TRI->getRegClassForReg(*MRI, Src->getReg());
1002 }
1003 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1004 return TRI->getRegClassForReg(*MRI, Src->getReg());
1005 }
1006 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1007 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1008 }
1009 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1010 return TRI->getRegClassForReg(*MRI, Src->getReg());
1011 }
1012 return nullptr;
1013 }
1014
1015 /// This function assumes that CI comes before Paired in a basic block. Return
1016 /// an insertion point for the merged instruction or nullptr on failure.
1017 SILoadStoreOptimizer::CombineInfo *
1018 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1019 CombineInfo &Paired) {
1020 // If another instruction has already been merged into CI, it may now be a
1021 // type that we can't do any further merging into.
1022 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1023 return nullptr;
1024 assert(CI.InstClass == Paired.InstClass);
1025
1026 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1027 getInstSubclass(Paired.I->getOpcode(), *TII))
1028 return nullptr;
1029
1030 // Check both offsets (or masks for MIMG) can be combined and fit in the
1031 // reduced range.
1032 if (CI.InstClass == MIMG) {
1033 if (!dmasksCanBeCombined(CI, *TII, Paired))
1034 return nullptr;
1035 } else {
1036 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1037 return nullptr;
1038 }
1039
1040 DenseSet<Register> RegDefs;
1041 DenseSet<Register> RegUses;
1042 CombineInfo *Where;
1043 if (CI.I->mayLoad()) {
1044 // Try to hoist Paired up to CI.
1045 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1046 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1047 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1048 return nullptr;
1049 }
1050 Where = &CI;
1051 } else {
1052 // Try to sink CI down to Paired.
1053 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1054 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1055 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1056 return nullptr;
1057 }
1058 Where = &Paired;
1059 }
1060
1061 // Call offsetsCanBeCombined with modify = true so that the offsets are
1062 // correct for the new instruction. This should return true, because
1063 // this function should only be called on CombineInfo objects that
1064 // have already been confirmed to be mergeable.
1065 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1066 offsetsCanBeCombined(CI, *STM, Paired, true);
1067 return Where;
1068 }
1069
1070 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1071 if (STM->ldsRequiresM0Init())
1072 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1073 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1074 }
1075
1076 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1077 if (STM->ldsRequiresM0Init())
1078 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1079
1080 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1081 : AMDGPU::DS_READ2ST64_B64_gfx9;
1082 }
1083
1084 MachineBasicBlock::iterator
1085 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1086 MachineBasicBlock::iterator InsertBefore) {
1087 MachineBasicBlock *MBB = CI.I->getParent();
1088
1089 // Be careful, since the addresses could be subregisters themselves in weird
1090 // cases, like vectors of pointers.
1091 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1092
1093 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1094 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1095
1096 unsigned NewOffset0 = CI.Offset;
1097 unsigned NewOffset1 = Paired.Offset;
1098 unsigned Opc =
1099 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1100
1101 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1102 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1103
1104 if (NewOffset0 > NewOffset1) {
1105 // Canonicalize the merged instruction so the smaller offset comes first.
1106 std::swap(NewOffset0, NewOffset1);
1107 std::swap(SubRegIdx0, SubRegIdx1);
1108 }
1109
1110 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1111 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1112
1113 const MCInstrDesc &Read2Desc = TII->get(Opc);
1114
1115 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1116 Register DestReg = MRI->createVirtualRegister(SuperRC);
1117
1118 DebugLoc DL = CI.I->getDebugLoc();
1119
1120 Register BaseReg = AddrReg->getReg();
1121 unsigned BaseSubReg = AddrReg->getSubReg();
1122 unsigned BaseRegFlags = 0;
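// If offsetsCanBeCombined chose a nonzero BaseOff, materialize addr + BaseOff
// as the new base so that the rebased 8-bit offsets computed earlier are in
// range.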
1123 if (CI.BaseOff) {
1124 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1125 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1126 .addImm(CI.BaseOff);
1127
1128 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1129 BaseRegFlags = RegState::Kill;
1130
1131 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1132 .addReg(ImmReg)
1133 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1134 .addImm(0); // clamp bit
1135 BaseSubReg = 0;
1136 }
1137
1138 MachineInstrBuilder Read2 =
1139 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1140 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1141 .addImm(NewOffset0) // offset0
1142 .addImm(NewOffset1) // offset1
1143 .addImm(0) // gds
1144 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1145
1146 (void)Read2;
1147
1148 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1149
1150 // Copy to the old destination registers.
1151 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1152 .add(*Dest0) // Copy to same destination including flags and sub reg.
1153 .addReg(DestReg, 0, SubRegIdx0);
1154 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1155 .add(*Dest1)
1156 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1157
1158 CI.I->eraseFromParent();
1159 Paired.I->eraseFromParent();
1160
1161 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1162 return Read2;
1163 }
1164
1165 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1166 if (STM->ldsRequiresM0Init())
1167 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1168 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1169 : AMDGPU::DS_WRITE2_B64_gfx9;
1170 }
1171
1172 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1173 if (STM->ldsRequiresM0Init())
1174 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1175 : AMDGPU::DS_WRITE2ST64_B64;
1176
1177 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1178 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1179 }
1180
1181 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1182 CombineInfo &CI, CombineInfo &Paired,
1183 MachineBasicBlock::iterator InsertBefore) {
1184 MachineBasicBlock *MBB = CI.I->getParent();
1185
1186 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1187 // sure we preserve the subregister index and any register flags set on them.
1188 const MachineOperand *AddrReg =
1189 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1190 const MachineOperand *Data0 =
1191 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1192 const MachineOperand *Data1 =
1193 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1194
1195 unsigned NewOffset0 = CI.Offset;
1196 unsigned NewOffset1 = Paired.Offset;
1197 unsigned Opc =
1198 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1199
1200 if (NewOffset0 > NewOffset1) {
1201 // Canonicalize the merged instruction so the smaller offset comes first.
1202 std::swap(NewOffset0, NewOffset1);
1203 std::swap(Data0, Data1);
1204 }
1205
1206 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1207 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1208
1209 const MCInstrDesc &Write2Desc = TII->get(Opc);
1210 DebugLoc DL = CI.I->getDebugLoc();
1211
1212 Register BaseReg = AddrReg->getReg();
1213 unsigned BaseSubReg = AddrReg->getSubReg();
1214 unsigned BaseRegFlags = 0;
1215 if (CI.BaseOff) {
1216 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1217 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1218 .addImm(CI.BaseOff);
1219
1220 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1221 BaseRegFlags = RegState::Kill;
1222
1223 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1224 .addReg(ImmReg)
1225 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1226 .addImm(0); // clamp bit
1227 BaseSubReg = 0;
1228 }
1229
1230 MachineInstrBuilder Write2 =
1231 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1232 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1233 .add(*Data0) // data0
1234 .add(*Data1) // data1
1235 .addImm(NewOffset0) // offset0
1236 .addImm(NewOffset1) // offset1
1237 .addImm(0) // gds
1238 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1239
1240 CI.I->eraseFromParent();
1241 Paired.I->eraseFromParent();
1242
1243 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1244 return Write2;
1245 }
1246
1247 MachineBasicBlock::iterator
1248 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1249 MachineBasicBlock::iterator InsertBefore) {
1250 MachineBasicBlock *MBB = CI.I->getParent();
1251 DebugLoc DL = CI.I->getDebugLoc();
1252 const unsigned Opcode = getNewOpcode(CI, Paired);
1253
1254 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1255
1256 Register DestReg = MRI->createVirtualRegister(SuperRC);
1257 unsigned MergedDMask = CI.DMask | Paired.DMask;
1258 unsigned DMaskIdx =
1259 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1260
1261 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1262 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1263 if (I == DMaskIdx)
1264 MIB.addImm(MergedDMask);
1265 else
1266 MIB.add((*CI.I).getOperand(I));
1267 }
1268
1269 // It shouldn't be possible to get this far if the two instructions
1270 // don't have a single memoperand, because MachineInstr::mayAlias()
1271 // will return true if this is the case.
1272 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1273
1274 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1275
1276 unsigned SubRegIdx0, SubRegIdx1;
1277 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1278
1279 // Copy to the old destination registers.
1280 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1281 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1282 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1283
1284 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1285 .add(*Dest0) // Copy to same destination including flags and sub reg.
1286 .addReg(DestReg, 0, SubRegIdx0);
1287 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1288 .add(*Dest1)
1289 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1290
1291 CI.I->eraseFromParent();
1292 Paired.I->eraseFromParent();
1293 return New;
1294 }
1295
1296 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1297 CombineInfo &CI, CombineInfo &Paired,
1298 MachineBasicBlock::iterator InsertBefore) {
1299 MachineBasicBlock *MBB = CI.I->getParent();
1300 DebugLoc DL = CI.I->getDebugLoc();
1301 const unsigned Opcode = getNewOpcode(CI, Paired);
1302
1303 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1304
1305 Register DestReg = MRI->createVirtualRegister(SuperRC);
1306 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1307
1308 // It shouldn't be possible to get this far if the two instructions
1309 // don't have a single memoperand, because MachineInstr::mayAlias()
1310 // will return true if this is the case.
1311 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1312
1313 MachineInstr *New =
1314 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1315 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1316 .addImm(MergedOffset) // offset
1317 .addImm(CI.CPol) // cpol
1318 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1319
1320 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1321 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1322 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1323
1324 // Copy to the old destination registers.
1325 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1326 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1327 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1328
1329 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1330 .add(*Dest0) // Copy to same destination including flags and sub reg.
1331 .addReg(DestReg, 0, SubRegIdx0);
1332 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1333 .add(*Dest1)
1334 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1335
1336 CI.I->eraseFromParent();
1337 Paired.I->eraseFromParent();
1338 return New;
1339 }
1340
1341 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1342 CombineInfo &CI, CombineInfo &Paired,
1343 MachineBasicBlock::iterator InsertBefore) {
1344 MachineBasicBlock *MBB = CI.I->getParent();
1345 DebugLoc DL = CI.I->getDebugLoc();
1346
1347 const unsigned Opcode = getNewOpcode(CI, Paired);
1348
1349 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1350
1351 // Copy to the new source register.
1352 Register DestReg = MRI->createVirtualRegister(SuperRC);
1353 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1354
1355 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1356
1357 AddressRegs Regs = getRegs(Opcode, *TII);
1358
1359 if (Regs.VAddr)
1360 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1361
1362 // It shouldn't be possible to get this far if the two instructions
1363 // don't have a single memoperand, because MachineInstr::mayAlias()
1364 // will return true if this is the case.
1365 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1366
1367 MachineInstr *New =
1368 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1369 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1370 .addImm(MergedOffset) // offset
1371 .addImm(CI.CPol) // cpol
1372 .addImm(0) // tfe
1373 .addImm(0) // swz
1374 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1375
1376 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1377 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1378 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1379
1380 // Copy to the old destination registers.
1381 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1382 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1383 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1384
1385 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1386 .add(*Dest0) // Copy to same destination including flags and sub reg.
1387 .addReg(DestReg, 0, SubRegIdx0);
1388 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389 .add(*Dest1)
1390 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1391
1392 CI.I->eraseFromParent();
1393 Paired.I->eraseFromParent();
1394 return New;
1395 }
1396
1397 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1398 CombineInfo &CI, CombineInfo &Paired,
1399 MachineBasicBlock::iterator InsertBefore) {
1400 MachineBasicBlock *MBB = CI.I->getParent();
1401 DebugLoc DL = CI.I->getDebugLoc();
1402
1403 const unsigned Opcode = getNewOpcode(CI, Paired);
1404
1405 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1406
1407 // Copy to the new source register.
1408 Register DestReg = MRI->createVirtualRegister(SuperRC);
1409 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1410
1411 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1412
1413 AddressRegs Regs = getRegs(Opcode, *TII);
1414
1415 if (Regs.VAddr)
1416 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1417
1418 unsigned JoinedFormat =
1419 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1420
1421 // It shouldn't be possible to get this far if the two instructions
1422 // don't have a single memoperand, because MachineInstr::mayAlias()
1423 // will return true if this is the case.
1424 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1425
1426 MachineInstr *New =
1427 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1428 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1429 .addImm(MergedOffset) // offset
1430 .addImm(JoinedFormat) // format
1431 .addImm(CI.CPol) // cpol
1432 .addImm(0) // tfe
1433 .addImm(0) // swz
1434 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1435
1436 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1437 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1438 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1439
1440 // Copy to the old destination registers.
1441 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1442 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1443 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1444
1445 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1446 .add(*Dest0) // Copy to same destination including flags and sub reg.
1447 .addReg(DestReg, 0, SubRegIdx0);
1448 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1449 .add(*Dest1)
1450 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1451
1452 CI.I->eraseFromParent();
1453 Paired.I->eraseFromParent();
1454 return New;
1455 }
1456
1457 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1458 CombineInfo &CI, CombineInfo &Paired,
1459 MachineBasicBlock::iterator InsertBefore) {
1460 MachineBasicBlock *MBB = CI.I->getParent();
1461 DebugLoc DL = CI.I->getDebugLoc();
1462
1463 const unsigned Opcode = getNewOpcode(CI, Paired);
1464
1465 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1466 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1467 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1468
1469 // Copy to the new source register.
1470 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1471 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1472
1473 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1474 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1475
1476 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1477 .add(*Src0)
1478 .addImm(SubRegIdx0)
1479 .add(*Src1)
1480 .addImm(SubRegIdx1);
1481
1482 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1483 .addReg(SrcReg, RegState::Kill);
1484
1485 AddressRegs Regs = getRegs(Opcode, *TII);
1486
1487 if (Regs.VAddr)
1488 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1489
1490 unsigned JoinedFormat =
1491 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1492
1493 // It shouldn't be possible to get this far if the two instructions
1494 // don't have a single memoperand, because MachineInstr::mayAlias()
1495 // will return true if this is the case.
1496 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1497
1498 MachineInstr *New =
1499 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1500 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1501 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1502 .addImm(JoinedFormat) // format
1503 .addImm(CI.CPol) // cpol
1504 .addImm(0) // tfe
1505 .addImm(0) // swz
1506 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1507
1508 CI.I->eraseFromParent();
1509 Paired.I->eraseFromParent();
1510 return New;
1511 }
1512
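// Merge two FLAT/GLOBAL loads into one: emit a single wide load into a fresh
// super-register, then COPY the appropriate sub-registers back into the
// original destination registers.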
1513 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1514 CombineInfo &CI, CombineInfo &Paired,
1515 MachineBasicBlock::iterator InsertBefore) {
1516 MachineBasicBlock *MBB = CI.I->getParent();
1517 DebugLoc DL = CI.I->getDebugLoc();
1518
1519 const unsigned Opcode = getNewOpcode(CI, Paired);
1520
1521 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1522 Register DestReg = MRI->createVirtualRegister(SuperRC);
1523
1524 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1525
1526 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1527 MIB.add(*SAddr);
1528
1529 MachineInstr *New =
1530 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1531 .addImm(std::min(CI.Offset, Paired.Offset))
1532 .addImm(CI.CPol)
1533 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1534
1535 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1536 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1537 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1538
1539 // Copy to the old destination registers.
1540 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1541 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1542 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1543
1544 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1545 .add(*Dest0) // Copy to same destination including flags and sub reg.
1546 .addReg(DestReg, 0, SubRegIdx0);
1547 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1548 .add(*Dest1)
1549 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1550
1551 CI.I->eraseFromParent();
1552 Paired.I->eraseFromParent();
1553 return New;
1554 }
1555
1556 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1557 CombineInfo &CI, CombineInfo &Paired,
1558 MachineBasicBlock::iterator InsertBefore) {
1559 MachineBasicBlock *MBB = CI.I->getParent();
1560 DebugLoc DL = CI.I->getDebugLoc();
1561
1562 const unsigned Opcode = getNewOpcode(CI, Paired);
1563
1564 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1565 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1566 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1567
1568 // Copy to the new source register.
1569 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1570 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1571
1572 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1573 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1574
1575 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1576 .add(*Src0)
1577 .addImm(SubRegIdx0)
1578 .add(*Src1)
1579 .addImm(SubRegIdx1);
1580
1581 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1582 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1583 .addReg(SrcReg, RegState::Kill);
1584
1585 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1586 MIB.add(*SAddr);
1587
1588 MachineInstr *New =
1589 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1590 .addImm(CI.CPol)
1591 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1592
1593 CI.I->eraseFromParent();
1594 Paired.I->eraseFromParent();
1595 return New;
1596 }
1597
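// Select the opcode for the merged access. MUBUF/MTBUF opcodes are looked up
// from their base opcode by the combined width; the remaining classes map the
// combined dword count directly to the matching _DWORDXn form, or return 0
// when no such form exists.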
1598 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1599 const CombineInfo &Paired) {
1600 const unsigned Width = CI.Width + Paired.Width;
1601
1602 switch (getCommonInstClass(CI, Paired)) {
1603 default:
1604 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1605 // FIXME: Handle d16 correctly
1606 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1607 Width);
1608 case TBUFFER_LOAD:
1609 case TBUFFER_STORE:
1610 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1611 Width);
1612
1613 case UNKNOWN:
1614 llvm_unreachable("Unknown instruction class");
1615 case S_BUFFER_LOAD_IMM:
1616 switch (Width) {
1617 default:
1618 return 0;
1619 case 2:
1620 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1621 case 4:
1622 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1623 case 8:
1624 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1625 }
1626 case GLOBAL_LOAD:
1627 switch (Width) {
1628 default:
1629 return 0;
1630 case 2:
1631 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1632 case 3:
1633 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1634 case 4:
1635 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1636 }
1637 case GLOBAL_LOAD_SADDR:
1638 switch (Width) {
1639 default:
1640 return 0;
1641 case 2:
1642 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1643 case 3:
1644 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1645 case 4:
1646 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1647 }
1648 case GLOBAL_STORE:
1649 switch (Width) {
1650 default:
1651 return 0;
1652 case 2:
1653 return AMDGPU::GLOBAL_STORE_DWORDX2;
1654 case 3:
1655 return AMDGPU::GLOBAL_STORE_DWORDX3;
1656 case 4:
1657 return AMDGPU::GLOBAL_STORE_DWORDX4;
1658 }
1659 case GLOBAL_STORE_SADDR:
1660 switch (Width) {
1661 default:
1662 return 0;
1663 case 2:
1664 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1665 case 3:
1666 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1667 case 4:
1668 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1669 }
1670 case FLAT_LOAD:
1671 switch (Width) {
1672 default:
1673 return 0;
1674 case 2:
1675 return AMDGPU::FLAT_LOAD_DWORDX2;
1676 case 3:
1677 return AMDGPU::FLAT_LOAD_DWORDX3;
1678 case 4:
1679 return AMDGPU::FLAT_LOAD_DWORDX4;
1680 }
1681 case FLAT_STORE:
1682 switch (Width) {
1683 default:
1684 return 0;
1685 case 2:
1686 return AMDGPU::FLAT_STORE_DWORDX2;
1687 case 3:
1688 return AMDGPU::FLAT_STORE_DWORDX3;
1689 case 4:
1690 return AMDGPU::FLAT_STORE_DWORDX4;
1691 }
1692 case MIMG:
1693 assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1694 "No overlaps");
1695 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1696 }
1697 }
1698
1699 std::pair<unsigned, unsigned>
1700 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1701 const CombineInfo &Paired) {
1702 assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1703 CI.Width + Paired.Width)) &&
1704 "No overlaps");
1705
1706 unsigned Idx0;
1707 unsigned Idx1;
1708
1709 static const unsigned Idxs[5][4] = {
1710 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1711 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1712 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1713 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1714 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1715 };
1716
1717 assert(CI.Width >= 1 && CI.Width <= 4);
1718 assert(Paired.Width >= 1 && Paired.Width <= 4);
1719
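  // For example, if CI comes first in program order with CI.Width == 2 and
  // Paired.Width == 1, this resolves to (sub0_sub1, sub2): CI's data occupies
  // the low two dwords of the merged register and Paired's the third.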
1720 if (Paired < CI) {
1721 Idx1 = Idxs[0][Paired.Width - 1];
1722 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1723 } else {
1724 Idx0 = Idxs[0][CI.Width - 1];
1725 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1726 }
1727
1728 return std::make_pair(Idx0, Idx1);
1729 }
1730
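// Register class for the merged value: SGPR tuples sized by the combined
// dword count for S_BUFFER loads; otherwise a VGPR class (or an AGPR class,
// when the original data register was an AGPR) wide enough to hold
// 32 * (CI.Width + Paired.Width) bits.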
1731 const TargetRegisterClass *
1732 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1733 const CombineInfo &Paired) {
1734 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1735 switch (CI.Width + Paired.Width) {
1736 default:
1737 return nullptr;
1738 case 2:
1739 return &AMDGPU::SReg_64_XEXECRegClass;
1740 case 4:
1741 return &AMDGPU::SGPR_128RegClass;
1742 case 8:
1743 return &AMDGPU::SGPR_256RegClass;
1744 case 16:
1745 return &AMDGPU::SGPR_512RegClass;
1746 }
1747 }
1748
1749 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1750 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1751 ? TRI->getAGPRClassForBitWidth(BitWidth)
1752 : TRI->getVGPRClassForBitWidth(BitWidth);
1753 }
1754
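// Like mergeTBufferStorePair above, but MUBUF stores carry no format operand.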
1755 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1756 CombineInfo &CI, CombineInfo &Paired,
1757 MachineBasicBlock::iterator InsertBefore) {
1758 MachineBasicBlock *MBB = CI.I->getParent();
1759 DebugLoc DL = CI.I->getDebugLoc();
1760
1761 const unsigned Opcode = getNewOpcode(CI, Paired);
1762
1763 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1764 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1765 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1766
1767 // Copy to the new source register.
1768 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1769 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1770
1771 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1772 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1773
1774 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1775 .add(*Src0)
1776 .addImm(SubRegIdx0)
1777 .add(*Src1)
1778 .addImm(SubRegIdx1);
1779
1780 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1781 .addReg(SrcReg, RegState::Kill);
1782
1783 AddressRegs Regs = getRegs(Opcode, *TII);
1784
1785 if (Regs.VAddr)
1786 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1787
1788
1789 // It shouldn't be possible to get this far if the two instructions
1790 // don't have a single memoperand, because MachineInstr::mayAlias()
1791 // will return true if this is the case.
1792 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1793
1794 MachineInstr *New =
1795 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1796 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1797 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1798 .addImm(CI.CPol) // cpol
1799 .addImm(0) // tfe
1800 .addImm(0) // swz
1801 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1802
1803 CI.I->eraseFromParent();
1804 Paired.I->eraseFromParent();
1805 return New;
1806 }
1807
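// Return Val as an immediate operand when it is encodable as an inline
// constant; otherwise materialize it with an S_MOV_B32 into a fresh SGPR and
// return that register operand. (Whether a value qualifies as an inline
// constant is delegated to SIInstrInfo::isInlineConstant.)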
1808 MachineOperand
1809 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1810 APInt V(32, Val, true);
1811 if (TII->isInlineConstant(V))
1812 return MachineOperand::CreateImm(Val);
1813
1814 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1815 MachineInstr *Mov =
1816 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1817 TII->get(AMDGPU::S_MOV_B32), Reg)
1818 .addImm(Val);
1819 (void)Mov;
1820 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1821 return MachineOperand::CreateReg(Reg, false);
1822 }
1823
1824 // Compute base address using Addr and return the final register.
1825 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1826 const MemAddress &Addr) const {
1827 MachineBasicBlock *MBB = MI.getParent();
1828 MachineBasicBlock::iterator MBBI = MI.getIterator();
1829 DebugLoc DL = MI.getDebugLoc();
1830
1831 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1832 Addr.Base.LoSubReg) &&
1833 "Expected 32-bit Base-Register-Low!!");
1834
1835 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1836 Addr.Base.HiSubReg) &&
1837 "Expected 32-bit Base-Register-Hi!!");
1838
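  // Split Addr.Offset into its low and high 32-bit halves: the low add
  // produces a carry that the high add consumes, and the two 32-bit results
  // are packed back into a 64-bit VGPR pair with a REG_SEQUENCE.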
1839 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1840 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1841 MachineOperand OffsetHi =
1842 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1843
1844 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1845 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1846 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1847
1848 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1849 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1850 MachineInstr *LoHalf =
1851 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1852 .addReg(CarryReg, RegState::Define)
1853 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1854 .add(OffsetLo)
1855 .addImm(0); // clamp bit
1856 (void)LoHalf;
1857 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1858
1859 MachineInstr *HiHalf =
1860 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1861 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1862 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1863 .add(OffsetHi)
1864 .addReg(CarryReg, RegState::Kill)
1865 .addImm(0); // clamp bit
1866 (void)HiHalf;
1867 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1868
1869 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1870 MachineInstr *FullBase =
1871 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1872 .addReg(DestSub0)
1873 .addImm(AMDGPU::sub0)
1874 .addReg(DestSub1)
1875 .addImm(AMDGPU::sub1);
1876 (void)FullBase;
1877 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1878
1879 return FullDestReg;
1880 }
1881
1882 // Update base and offset with the NewBase and NewOffset in MI.
1883 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1884 Register NewBase,
1885 int32_t NewOffset) const {
1886 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1887 Base->setReg(NewBase);
1888 Base->setIsKill(false);
1889 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1890 }
1891
1892 Optional<int32_t>
1893 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1894 if (Op.isImm())
1895 return Op.getImm();
1896
1897 if (!Op.isReg())
1898 return None;
1899
1900 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1901 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1902 !Def->getOperand(1).isImm())
1903 return None;
1904
1905 return Def->getOperand(1).getImm();
1906 }
1907
1908 // Analyze Base and extract:
1909 // - 32bit base registers, subregisters
1910 // - 64bit constant offset
1911 // Expecting base computation as:
1912 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1913 // %LO:vgpr_32, %c:sreg_64_xexec =
1914 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1915 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1916 // %Base:vreg_64 =
1917 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
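// For example (with hypothetical registers), if BaseLo is defined by
// V_ADD_CO_U32_e64 %base_lo, 8000 and BaseHi by V_ADDC_U32_e64 %base_hi, 0,
// this extracts Addr.Offset == 8000 with {LoReg, HiReg} == {%base_lo, %base_hi}.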
1918 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1919 MemAddress &Addr) const {
1920 if (!Base.isReg())
1921 return;
1922
1923 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1924 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1925 || Def->getNumOperands() != 5)
1926 return;
1927
1928 MachineOperand BaseLo = Def->getOperand(1);
1929 MachineOperand BaseHi = Def->getOperand(3);
1930 if (!BaseLo.isReg() || !BaseHi.isReg())
1931 return;
1932
1933 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1934 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1935
1936 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1937 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1938 return;
1939
1940 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1941 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1942
1943 auto Offset0P = extractConstOffset(*Src0);
1944 if (Offset0P)
1945 BaseLo = *Src1;
1946 else {
1947 if (!(Offset0P = extractConstOffset(*Src1)))
1948 return;
1949 BaseLo = *Src0;
1950 }
1951
1952 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1953 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1954
1955 if (Src0->isImm())
1956 std::swap(Src0, Src1);
1957
1958 if (!Src1->isImm())
1959 return;
1960
1961 uint64_t Offset1 = Src1->getImm();
1962 BaseHi = *Src0;
1963
1964 Addr.Base.LoReg = BaseLo.getReg();
1965 Addr.Base.HiReg = BaseHi.getReg();
1966 Addr.Base.LoSubReg = BaseLo.getSubReg();
1967 Addr.Base.HiSubReg = BaseHi.getSubReg();
1968 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1969 }
1970
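// Try to fold a large constant piece of a 64-bit address computation into the
// instruction's immediate offset field by re-anchoring the access on another
// instruction in the same block that uses the same base registers.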
1971 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1972 MachineInstr &MI,
1973 MemInfoMap &Visited,
1974 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1975
1976 if (!(MI.mayLoad() ^ MI.mayStore()))
1977 return false;
1978
1979 // TODO: Support flat and scratch.
1980 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1981 return false;
1982
1983 if (MI.mayLoad() &&
1984 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1985 return false;
1986
1987 if (AnchorList.count(&MI))
1988 return false;
1989
1990 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1991
1992 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1993 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1994 return false;
1995 }
1996
1997 // Step1: Find the base-registers and a 64bit constant offset.
1998 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1999 MemAddress MAddr;
2000 if (Visited.find(&MI) == Visited.end()) {
2001 processBaseWithConstOffset(Base, MAddr);
2002 Visited[&MI] = MAddr;
2003 } else
2004 MAddr = Visited[&MI];
2005
2006 if (MAddr.Offset == 0) {
2007 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2008 " constant offsets that can be promoted.\n";);
2009 return false;
2010 }
2011
2012 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2013 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2014
2015 // Step2: Traverse through MI's basic block and find an anchor (one that has
2016 // the same base registers) with the highest 13-bit distance from MI's offset.
2017 // E.g. (64bit loads)
2018 // bb:
2019 // addr1 = &a + 4096; load1 = load(addr1, 0)
2020 // addr2 = &a + 6144; load2 = load(addr2, 0)
2021 // addr3 = &a + 8192; load3 = load(addr3, 0)
2022 // addr4 = &a + 10240; load4 = load(addr4, 0)
2023 // addr5 = &a + 12288; load5 = load(addr5, 0)
2024 //
2025 // Starting from the first load, the optimization will try to find a new base
2026 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
2027 // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
2028 // as the new base (anchor) because the maximum distance can presumably
2029 // accommodate more intermediate bases.
2030 //
2031 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2032 // (&a + 8192) for load1, load2, load4.
2033 // addr = &a + 8192
2034 // load1 = load(addr, -4096)
2035 // load2 = load(addr, -2048)
2036 // load3 = load(addr, 0)
2037 // load4 = load(addr, 2048)
2038 // addr5 = &a + 12288; load5 = load(addr5, 0)
2039 //
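// Note: the 13-bit figure above reflects the signed immediate offset range of
// the global instructions this was tuned for; the actual legality check is
// delegated to isLegalGlobalAddressingMode(), so the exact width is
// subtarget-dependent.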
2040 MachineInstr *AnchorInst = nullptr;
2041 MemAddress AnchorAddr;
2042 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2043 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2044
2045 MachineBasicBlock *MBB = MI.getParent();
2046 MachineBasicBlock::iterator E = MBB->end();
2047 MachineBasicBlock::iterator MBBI = MI.getIterator();
2048 ++MBBI;
2049 const SITargetLowering *TLI =
2050 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2051
2052 for ( ; MBBI != E; ++MBBI) {
2053 MachineInstr &MINext = *MBBI;
2054 // TODO: Support finding an anchor (with the same base) from store addresses or
2055 // any other load addresses where the opcodes are different.
2056 if (MINext.getOpcode() != MI.getOpcode() ||
2057 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2058 continue;
2059
2060 const MachineOperand &BaseNext =
2061 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2062 MemAddress MAddrNext;
2063 if (Visited.find(&MINext) == Visited.end()) {
2064 processBaseWithConstOffset(BaseNext, MAddrNext);
2065 Visited[&MINext] = MAddrNext;
2066 } else
2067 MAddrNext = Visited[&MINext];
2068
2069 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2070 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2071 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2072 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2073 continue;
2074
2075 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
2076
2077 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2078 TargetLoweringBase::AddrMode AM;
2079 AM.HasBaseReg = true;
2080 AM.BaseOffs = Dist;
2081 if (TLI->isLegalGlobalAddressingMode(AM) &&
2082 (uint32_t)std::abs(Dist) > MaxDist) {
2083 MaxDist = std::abs(Dist);
2084
2085 AnchorAddr = MAddrNext;
2086 AnchorInst = &MINext;
2087 }
2088 }
2089
2090 if (AnchorInst) {
2091 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2092 AnchorInst->dump());
2093 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2094 << AnchorAddr.Offset << "\n\n");
2095
2096 // Instead of moving up, just re-compute anchor-instruction's base address.
2097 Register Base = computeBase(MI, AnchorAddr);
2098
2099 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2100 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2101
2102 for (auto P : InstsWCommonBase) {
2103 TargetLoweringBase::AddrMode AM;
2104 AM.HasBaseReg = true;
2105 AM.BaseOffs = P.second - AnchorAddr.Offset;
2106
2107 if (TLI->isLegalGlobalAddressingMode(AM)) {
2108 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2109 dbgs() << ")"; P.first->dump());
2110 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2111 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2112 }
2113 }
2114 AnchorList.insert(AnchorInst);
2115 return true;
2116 }
2117
2118 return false;
2119 }
2120
2121 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2122 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2123 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2124 if (AddrList.front().InstClass == CI.InstClass &&
2125 AddrList.front().IsAGPR == CI.IsAGPR &&
2126 AddrList.front().hasSameBaseAddress(*CI.I)) {
2127 AddrList.emplace_back(CI);
2128 return;
2129 }
2130 }
2131
2132 // Base address not found, so add a new list.
2133 MergeableInsts.emplace_back(1, CI);
2134 }
2135
2136 std::pair<MachineBasicBlock::iterator, bool>
2137 SILoadStoreOptimizer::collectMergeableInsts(
2138 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2139 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2140 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2141 bool Modified = false;
2142
2143 // Sort potentially mergeable instructions into lists, one list per base address.
2144 unsigned Order = 0;
2145 MachineBasicBlock::iterator BlockI = Begin;
2146 for (; BlockI != End; ++BlockI) {
2147 MachineInstr &MI = *BlockI;
2148
2149 // We run this before checking if an address is mergeable, because it can produce
2150 // better code even if the instructions aren't mergeable.
2151 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2152 Modified = true;
2153
2154 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2155 // barriers. We can look for separate merges after this barrier.
2156 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2157 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2158
2159 // Search will resume after this instruction in a separate merge list.
2160 ++BlockI;
2161 break;
2162 }
2163
2164 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2165 if (InstClass == UNKNOWN)
2166 continue;
2167
2168 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2169 int Swizzled =
2170 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2171 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2172 continue;
2173
2174 CombineInfo CI;
2175 CI.setMI(MI, *this);
2176 CI.Order = Order++;
2177
2178 if (!CI.hasMergeableAddress(*MRI))
2179 continue;
2180
2181 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2182 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2183 // operands. However, we report that ds_write2 shall have
2184 // only VGPR data so that machine copy propagation does not
2185 // create an illegal instruction with mixed VGPR and AGPR sources.
2186 // Consequently, if we create such an instruction the verifier
2187 // will complain.
2188 continue;
2189 }
2190
2191 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2192
2193 addInstToMergeableList(CI, MergeableInsts);
2194 }
2195
2196 // At this point we have lists of Mergeable instructions.
2197 //
2198 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2199 // list try to find an instruction that can be merged with I. If an instruction
2200 // is found, it is stored in the Paired field. If no instructions are found, then
2201 // the CombineInfo object is deleted from the list.
2202
2203 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2204 E = MergeableInsts.end(); I != E;) {
2205
2206 std::list<CombineInfo> &MergeList = *I;
2207 if (MergeList.size() <= 1) {
2208 // This means we have found only one instruction with a given address
2209 // that can be merged, and we need at least 2 instructions to do a merge,
2210 // so this list can be discarded.
2211 I = MergeableInsts.erase(I);
2212 continue;
2213 }
2214
2215 // Sort the lists by offsets, this way mergeable instructions will be
2216 // adjacent to each other in the list, which will make it easier to find
2217 // matches.
2218 MergeList.sort(
2219 [] (const CombineInfo &A, const CombineInfo &B) {
2220 return A.Offset < B.Offset;
2221 });
2222 ++I;
2223 }
2224
2225 return std::make_pair(BlockI, Modified);
2226 }
2227
2228 // Scan through looking for adjacent LDS operations with constant offsets from
2229 // the same base register. We rely on the scheduler to do the hard work of
2230 // clustering nearby loads, and assume these are all adjacent.
2231 bool SILoadStoreOptimizer::optimizeBlock(
2232 std::list<std::list<CombineInfo> > &MergeableInsts) {
2233 bool Modified = false;
2234
2235 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2236 E = MergeableInsts.end(); I != E;) {
2237 std::list<CombineInfo> &MergeList = *I;
2238
2239 bool OptimizeListAgain = false;
2240 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2241 // We weren't able to make any changes, so delete the list so we don't
2242 // process the same instructions the next time we try to optimize this
2243 // block.
2244 I = MergeableInsts.erase(I);
2245 continue;
2246 }
2247
2248 Modified = true;
2249
2250 // We made changes, but also determined that there were no more optimization
2251 // opportunities, so we don't need to reprocess the list.
2252 if (!OptimizeListAgain) {
2253 I = MergeableInsts.erase(I);
2254 continue;
2255 }
2256 OptimizeAgain = true;
2257 }
2258 return Modified;
2259 }
2260
2261 bool
2262 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2263 std::list<CombineInfo> &MergeList,
2264 bool &OptimizeListAgain) {
2265 if (MergeList.empty())
2266 return false;
2267
2268 bool Modified = false;
2269
2270 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2271 Next = std::next(I)) {
2272
2273 auto First = I;
2274 auto Second = Next;
2275
2276 if ((*First).Order > (*Second).Order)
2277 std::swap(First, Second);
2278 CombineInfo &CI = *First;
2279 CombineInfo &Paired = *Second;
2280
2281 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2282 if (!Where) {
2283 ++I;
2284 continue;
2285 }
2286
2287 Modified = true;
2288
2289 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2290
2291 MachineBasicBlock::iterator NewMI;
2292 switch (CI.InstClass) {
2293 default:
2294 llvm_unreachable("unknown InstClass");
2295 break;
2296 case DS_READ:
2297 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2298 break;
2299 case DS_WRITE:
2300 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2301 break;
2302 case S_BUFFER_LOAD_IMM:
2303 NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2304 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2305 break;
2306 case BUFFER_LOAD:
2307 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2308 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2309 break;
2310 case BUFFER_STORE:
2311 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2312 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2313 break;
2314 case MIMG:
2315 NewMI = mergeImagePair(CI, Paired, Where->I);
2316 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2317 break;
2318 case TBUFFER_LOAD:
2319 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2320 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2321 break;
2322 case TBUFFER_STORE:
2323 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2324 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2325 break;
2326 case FLAT_LOAD:
2327 case GLOBAL_LOAD:
2328 case GLOBAL_LOAD_SADDR:
2329 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2330 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2331 break;
2332 case FLAT_STORE:
2333 case GLOBAL_STORE:
2334 case GLOBAL_STORE_SADDR:
2335 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2336 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2337 break;
2338 }
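    // The merged instruction replaces CI in the list and may itself be paired
    // again on a later pass if its combined width is still below the maximum
    // for its class (8 dwords for SMEM, 4 for the others); that is what
    // OptimizeListAgain requests.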
2339 CI.setMI(NewMI, *this);
2340 CI.Order = Where->Order;
2341 if (I == Second)
2342 I = Next;
2343
2344 MergeList.erase(Second);
2345 }
2346
2347 return Modified;
2348 }
2349
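// Entry point: for each basic block, collect mergeable instructions up to the
// next barrier, then repeatedly merge pairs within each list until no further
// opportunities remain (re-running while OptimizeAgain is set).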
2350 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2351 if (skipFunction(MF.getFunction()))
2352 return false;
2353
2354 STM = &MF.getSubtarget<GCNSubtarget>();
2355 if (!STM->loadStoreOptEnabled())
2356 return false;
2357
2358 TII = STM->getInstrInfo();
2359 TRI = &TII->getRegisterInfo();
2360
2361 MRI = &MF.getRegInfo();
2362 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2363
2364 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2365
2366 bool Modified = false;
2367
2368 // Contains the list of instructions for which constant offsets are being
2369 // promoted to the IMM. This is tracked for an entire block at a time.
2370 SmallPtrSet<MachineInstr *, 4> AnchorList;
2371 MemInfoMap Visited;
2372
2373 for (MachineBasicBlock &MBB : MF) {
2374 MachineBasicBlock::iterator SectionEnd;
2375 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2376 I = SectionEnd) {
2377 bool CollectModified;
2378 std::list<std::list<CombineInfo>> MergeableInsts;
2379
2380 // First pass: Collect list of all instructions we know how to merge in a
2381 // subset of the block.
2382 std::tie(SectionEnd, CollectModified) =
2383 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2384
2385 Modified |= CollectModified;
2386
2387 do {
2388 OptimizeAgain = false;
2389 Modified |= optimizeBlock(MergeableInsts);
2390 } while (OptimizeAgain);
2391 }
2392
2393 Visited.clear();
2394 AnchorList.clear();
2395 }
2396
2397 return Modified;
2398 }
2399