//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implementation of the TargetInstrInfo class that is common to all
/// AMD GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

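// Pull in the TableGen-generated searchable tables for the buffer rsrc and
// D16 image-dimension intrinsics.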
namespace llvm {
namespace AMDGPU {
#define GET_RSRCINTRINSIC_IMPL
#include "AMDGPUGenSearchableTables.inc"

#define GET_D16IMAGEDIMINTRINSIC_IMPL
#include "AMDGPUGenSearchableTables.inc"
} // namespace AMDGPU
} // namespace llvm

// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}

AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    ST(ST),
    AMDGPUASI(ST.getAMDGPUAS()) {}

// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split them into two batches of
// 16 stores each.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to decide here whether loads from
// different address spaces should be clustered, and whether loads that might
// cause bank conflicts should be.
//
// This might be deprecated, so it might not be worth much effort to fix.
bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                              int64_t Offset0, int64_t Offset1,
                                              unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have 16 or fewer loads in a row and the offsets are within 64 bytes,
  // then schedule them together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td.
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5
};

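// Map the subtarget's generation onto the encoding family used for MC opcode
// lookup in the TableGen-generated tables.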
static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
  switch (ST.getGeneration()) {
  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
  case AMDGPUSubtarget::SEA_ISLANDS:
    return SIEncodingFamily::SI;
  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
  case AMDGPUSubtarget::GFX9:
    return SIEncodingFamily::VI;

  // FIXME: This should never be called for r600 GPUs.
  case AMDGPUSubtarget::R600:
  case AMDGPUSubtarget::R700:
  case AMDGPUSubtarget::EVERGREEN:
  case AMDGPUSubtarget::NORTHERN_ISLANDS:
    return SIEncodingFamily::SI;
  }

  llvm_unreachable("Unknown subtarget generation!");
}

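// Map a pseudo instruction opcode to the MC opcode for the current subtarget's
// encoding family. Returns the opcode unchanged if it is already a native
// instruction, or -1 if the pseudo has no encoding on this subtarget.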
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
  SIEncodingFamily Gen = subtargetEncodingFamily(ST);

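  // Opcodes renamed in GFX9 use the dedicated GFX9 encoding family on GFX9 and
  // later subtargets.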
  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
      ST.getGeneration() >= AMDGPUSubtarget::GFX9)
    Gen = SIEncodingFamily::GFX9;

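  // SDWA instructions use their own encoding families; GFX9 has a distinct
  // SDWA9 encoding.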
  if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
    Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
                                                      : SIEncodingFamily::SDWA;

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has the UnpackedD16VMem feature.
  // TODO: remove this when we discard the GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16) &&
      !(get(Opcode).TSFlags & SIInstrFlags::MIMG))
    Gen = SIEncodingFamily::GFX80;

  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
  // no encoding in the given subtarget generation.
  if (MCOp == (uint16_t)-1)
    return -1;

  return MCOp;
}

// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
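// Return true if the memory referenced by the given memory operand is known to
// be uniform across a wavefront (e.g. kernel arguments, constant pointers, or
// accesses marked with amdgpu.uniform metadata).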
bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
  const Value *Ptr = MMO->getValue();
  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  if (const Argument *Arg = dyn_cast<Argument>(Ptr))
    return AMDGPU::isArgPassedInSGPR(Arg);

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}