; Codegen test driven by the two llc invocations below: bonaire output is
; checked under the CI prefix, gfx900 output under the GFX9 prefix, and checks
; shared by both targets use the GCN prefix. Both runs enable +load-store-opt.
1; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
3
; addrspace(3) arrays that the f32 and f64 tests below store into.
4@lds = addrspace(3) global [512 x float] undef, align 4
5@lds.f64 = addrspace(3) global [512 x double] undef, align 8
6
; One loaded float is stored to lds[x] and lds[x + 8]. The checks expect the
; two stores to be emitted as a single ds_write2_b32 with offset1:8 that uses
; the same data register for both halves.
7; GCN-LABEL: {{^}}simple_write2_one_val_f32:
8; CI-DAG: s_mov_b32 m0
9; GFX9-NOT: m0
10
11; GCN-DAG: {{buffer|flat|global}}_load_dword [[VAL:v[0-9]+]]
12; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
13; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
14; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
15; GCN: s_endpgm
16define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
17  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
18  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
19  %val = load float, float addrspace(1)* %in.gep, align 4
20  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
21  store float %val, float addrspace(3)* %arrayidx0, align 4
22  %add.x = add nsw i32 %x.i, 8
23  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
24  store float %val, float addrspace(3)* %arrayidx1, align 4
25  ret void
26}
27
; Two different floats, loaded from in[x] and in[x + 1], are stored to lds[x]
; and lds[x + 8]. The checks expect two separate dword loads (the second at
; offset:4) feeding one ds_write2_b32 with offset1:8.
28; GCN-LABEL: {{^}}simple_write2_two_val_f32:
29; CI-DAG: s_mov_b32 m0
30; GFX9-NOT: m0
31
32; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
33; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
34
35; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
36; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}}
37
38; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
39; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
40; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
41; GCN: s_endpgm
42define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
43  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
44  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
45  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  ; Only the global loads are volatile; the LDS stores below are not.
  ; NOTE(review): presumably this keeps the adjacent loads from being combined.
46  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
47  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
48  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
49  store float %val0, float addrspace(3)* %arrayidx0, align 4
50  %add.x = add nsw i32 %x.i, 8
51  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
52  store float %val1, float addrspace(3)* %arrayidx1, align 4
53  ret void
54}
55
; Negative test: the first of the two LDS stores is volatile. The checks
; forbid ds_write2_b32 and require two plain ds_write_b32, the second one at
; byte offset 32 (8 floats).
56; GCN-LABEL: @simple_write2_two_val_f32_volatile_0
57; CI-DAG: s_mov_b32 m0
58; GFX9-NOT: m0
59
60; GCN-NOT: ds_write2_b32
61; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
62; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
63; GCN: s_endpgm
64define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
65  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
66  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
67  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
68  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
69  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
70  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  ; The volatile store under test.
71  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
72  %add.x = add nsw i32 %x.i, 8
73  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
74  store float %val1, float addrspace(3)* %arrayidx1, align 4
75  ret void
76}
77
; Negative test: same shape as the volatile_0 case, but here the second LDS
; store is the volatile one. The checks again forbid ds_write2_b32 and require
; two plain ds_write_b32, the second at byte offset 32.
78; GCN-LABEL: @simple_write2_two_val_f32_volatile_1
79; CI-DAG: s_mov_b32 m0
80; GFX9-NOT: m0
81
82; GCN-NOT: ds_write2_b32
83; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
84; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
85; GCN: s_endpgm
86define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
87  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
88  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
89  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
90  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
91  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
92  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
93  store float %val0, float addrspace(3)* %arrayidx0, align 4
94  %add.x = add nsw i32 %x.i, 8
95  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  ; The volatile store under test.
96  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
97  ret void
98}
99
; Stores element 0 of one <2 x float> load and element 1 of a second
; <2 x float> load to lds[x] and lds[x + 8]. The checks bind VAL0 to the low
; register of the first dwordx2 result and VAL1 to the high register of the
; second, and expect both to feed one ds_write2_b32 with offset1:8.
100; 2 data subregisters from different super registers.
101; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
102; GFX9-NOT: m0
103
104; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
105; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
106; CI-DAG: s_mov_b32 m0
107
108; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
109; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
110;
111; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
112;       early legalization of the constant bus constraint on the v_lshl_add_u32,
113;       and then SIFoldOperands folds in an unlucky order.
114; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
115; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
116
117; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
118; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
119
120; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
121; GCN: s_endpgm
122define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
123  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
124  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
125  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
126  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
127  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  ; Low lane of the first vector, high lane of the second.
128  %val0.0 = extractelement <2 x float> %val0, i32 0
129  %val1.1 = extractelement <2 x float> %val1, i32 1
130  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
131  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
132  %add.x = add nsw i32 %x.i, 8
133  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
134  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
135  ret void
136}
137
; Both stored floats are lanes of a single <2 x float> load. The checks bind
; VAL0 and VAL1 to the low and high registers of one dwordx2 result and expect
; them as the two data operands of one ds_write2_b32 with offset1:8.
138; GCN-LABEL: @simple_write2_two_val_subreg2_f32
139; CI-DAG: s_mov_b32 m0
140; GFX9-NOT: m0
141
142; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
143
144; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
145; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
146; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
147; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
148
149; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
150; GCN: s_endpgm
151define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
152  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
153  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
154  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
155  %val0 = extractelement <2 x float> %val, i32 0
156  %val1 = extractelement <2 x float> %val, i32 1
157  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
158  store float %val0, float addrspace(3)* %arrayidx0, align 4
159  %add.x = add nsw i32 %x.i, 8
160  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
161  store float %val1, float addrspace(3)* %arrayidx1, align 4
162  ret void
163}
164
; Stores lanes 0 and 3 of a single <4 x float> load. The checks bind VAL0 and
; VAL1 to the first and last registers of the dwordx4 result range and expect
; them as the data operands of one ds_write2_b32 with offset1:8.
165; GCN-LABEL: @simple_write2_two_val_subreg4_f32
166; CI-DAG: s_mov_b32 m0
167; GFX9-NOT: m0
168
169; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
170
171; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
172; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
173; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
174; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
175
176; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
177; GCN: s_endpgm
178define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
179  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
180  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
181  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  ; First and last lanes of the loaded vector.
182  %val0 = extractelement <4 x float> %val, i32 0
183  %val1 = extractelement <4 x float> %val, i32 3
184  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
185  store float %val0, float addrspace(3)* %arrayidx0, align 4
186  %add.x = add nsw i32 %x.i, 8
187  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
188  store float %val1, float addrspace(3)* %arrayidx1, align 4
189  ret void
190}
191
; Stores two floats 255 elements apart (lds[x] and lds[x + 255]). The checks
; still expect a single ds_write2_b32, with offset1:255.
; NOTE(review): going by the test name, 255 is the largest element distance
; expected to merge; confirm against the DS offset encoding.
192; GCN-LABEL: @simple_write2_two_val_max_offset_f32
193; CI-DAG: s_mov_b32 m0
194; GFX9-NOT: m0
195
196; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
197; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
198
199; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
200; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}}
201
202; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
203; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
204
205; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
206; GCN: s_endpgm
207define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
208  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
209  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
210  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
211  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
212  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
213  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
214  store float %val0, float addrspace(3)* %arrayidx0, align 4
215  %add.x = add nsw i32 %x.i, 255
216  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
217  store float %val1, float addrspace(3)* %arrayidx1, align 4
218  ret void
219}
220
; Negative test: the two stores are 257 floats (1028 bytes) apart. The checks
; expect them to stay as two ds_write_b32, the second carrying offset:1028.
221; GCN-LABEL: @simple_write2_two_val_too_far_f32
222; CI-DAG: s_mov_b32 m0
223; GFX9-NOT: m0
224
225; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
226; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
227; GCN: s_endpgm
228define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
229  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
230  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
231  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
232  %val0 = load float, float addrspace(1)* %in0.gep, align 4
233  %val1 = load float, float addrspace(1)* %in1.gep, align 4
234  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
235  store float %val0, float addrspace(3)* %arrayidx0, align 4
236  %add.x = add nsw i32 %x.i, 257
237  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
238  store float %val1, float addrspace(3)* %arrayidx1, align 4
239  ret void
240}
241
; Four stores of two loaded values at element offsets 0, 8, 11 and 27 from
; lds + tid.x. The checks expect them to pair up into two ds_write2_b32.
; Every address shares the same base, so the second write must use the very
; address register the first one captured, not merely any VGPR. The label is
; anchored to the symbol definition because this name is also a prefix of the
; next test's name.
242; GCN-LABEL: {{^}}simple_write2_two_val_f32_x2:
243; CI-DAG: s_mov_b32 m0
244; GFX9-NOT: m0
245
246; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
247; GCN: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
248; GCN: s_endpgm
249define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
250  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
251  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
252  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
253  %val0 = load float, float addrspace(1)* %in0.gep, align 4
254  %val1 = load float, float addrspace(1)* %in1.gep, align 4
255
  ; First pair: %val0 at +0 and %val1 at +8.
256  %idx.0 = add nsw i32 %tid.x, 0
257  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
258  store float %val0, float addrspace(3)* %arrayidx0, align 4
259
260  %idx.1 = add nsw i32 %tid.x, 8
261  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
262  store float %val1, float addrspace(3)* %arrayidx1, align 4
263
  ; Second pair: %val0 at +11 and %val1 at +27.
264  %idx.2 = add nsw i32 %tid.x, 11
265  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
266  store float %val0, float addrspace(3)* %arrayidx2, align 4
267
268  %idx.3 = add nsw i32 %tid.x, 27
269  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
270  store float %val1, float addrspace(3)* %arrayidx3, align 4
271
272  ret void
273}
274
; Same shape as the x2 test, but the smallest element offset is 3 rather than
; 0 (offsets 3, 8, 11 and 27 from lds + tid.x). The checks expect two
; ds_write2_b32 with offset0:3 offset1:8 and offset0:11 offset1:27. Both
; writes share one base, so the second must use the address register the
; first one captured.
275; GCN-LABEL: {{^}}simple_write2_two_val_f32_x2_nonzero_base:
276; CI-DAG: s_mov_b32 m0
277; GFX9-NOT: m0
278
279; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
280; GCN: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
281; GCN: s_endpgm
282define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
283  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
284  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
285  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
286  %val0 = load float, float addrspace(1)* %in0.gep, align 4
287  %val1 = load float, float addrspace(1)* %in1.gep, align 4
288
  ; First pair: %val0 at +3 and %val1 at +8.
289  %idx.0 = add nsw i32 %tid.x, 3
290  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
291  store float %val0, float addrspace(3)* %arrayidx0, align 4
292
293  %idx.1 = add nsw i32 %tid.x, 8
294  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
295  store float %val1, float addrspace(3)* %arrayidx1, align 4
296
  ; Second pair: %val0 at +11 and %val1 at +27.
297  %idx.2 = add nsw i32 %tid.x, 11
298  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
299  store float %val0, float addrspace(3)* %arrayidx2, align 4
300
301  %idx.3 = add nsw i32 %tid.x, 27
302  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
303  store float %val1, float addrspace(3)* %arrayidx3, align 4
304
305  ret void
306}
307
; Negative test: the two LDS store addresses come from different lanes of a
; <2 x float addrspace(3)*> kernel argument, indexed by a vector GEP. The
; checks forbid ds_write2_b32 and require two separate ds_write_b32.
308; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32
309; CI-DAG: s_mov_b32 m0
310; GFX9-NOT: m0
311
312; GCN-NOT: ds_write2_b32
313; GCN: ds_write_b32
314; GCN: ds_write_b32
315; GCN: s_endpgm
316define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
317  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
318  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
319  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
320  %val0 = load float, float addrspace(1)* %in0.gep, align 4
321  %val1 = load float, float addrspace(1)* %in1.gep, align 4
322
  ; NOTE(review): both insertelements write lane 0, so lane 0 ends up as the
  ; constant 8 and lane 1 stays undef. Confirm this is intentional rather
  ; than a typo for lane 1.
323  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
324  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
325  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
326  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
327  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
328
329  ; Apply an additional offset after the vector that will be more obviously folded.
330  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
331  store float %val0, float addrspace(3)* %gep.0, align 4
332
  ; NOTE(review): %add.x has no users in this function.
333  %add.x = add nsw i32 %x.i, 8
334  store float %val1, float addrspace(3)* %gep.1.offset, align 4
335  ret void
336}
337
; One loaded double is stored to lds.f64[x] and lds.f64[x + 8]. The checks
; expect a single ds_write2_b64 with offset1:8 that uses the same data
; register pair twice. The write address is lds.f64 plus the scaled thread id,
; so the address register is taken from the add of lds.f64@abs32@lo and not
; from the shift alone; this mirrors the two-value f64 test in this file.
338; GCN-LABEL: @simple_write2_one_val_f64
339; CI-DAG: s_mov_b32 m0
340; GFX9-NOT: m0
341
342; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
343; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 3, v{{[0-9]+}}
; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VBASE]]
344; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
345; GCN: s_endpgm
346define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
347  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
348  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
349  %val = load double, double addrspace(1)* %in.gep, align 8
350  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
351  store double %val, double addrspace(3)* %arrayidx0, align 8
352  %add.x = add nsw i32 %x.i, 8
353  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
354  store double %val, double addrspace(3)* %arrayidx1, align 8
355  ret void
356}
357
; One loaded double is stored with only 4-byte alignment to %lds[x] and
; %lds[x + 7]. The checks expect each 64-bit store to be emitted as a
; ds_write2_b32 of the two halves: dwords 0/1 for the first, and dwords 14/15
; (7 doubles = 56 bytes) for the second.
; NOTE(review): VPTR is bound to the shift result although the address also
; includes the %lds argument; the check relies on the final address landing
; in that same register. Confirm against current llc output.
358; GCN-LABEL: @misaligned_simple_write2_one_val_f64
359; CI-DAG: s_mov_b32 m0
360; GFX9-NOT: m0
361
362; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
363; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
364; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
365; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
366; GCN: s_endpgm
367define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
368  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
369  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
370  %val = load double, double addrspace(1)* %in.gep, align 8
371  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  ; align 4 on a double store is the "misaligned" part of this test.
372  store double %val, double addrspace(3)* %arrayidx0, align 4
373  %add.x = add nsw i32 %x.i, 7
374  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
375  store double %val, double addrspace(3)* %arrayidx1, align 4
376  ret void
377}
378
; Two different doubles, loaded from in[x] and in[x + 1], are stored to
; lds.f64[x] and lds.f64[x + 8]. The checks expect two dwordx2 loads (the
; second at offset:8) feeding one ds_write2_b64 with offset1:8.
379; GCN-LABEL: @simple_write2_two_val_f64
380; CI-DAG: s_mov_b32 m0
381; GFX9-NOT: m0
382
383; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
384; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
385
386; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
387; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
388
389; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 3, v{{[0-9]+}}
390; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VBASE]]
391; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
392; GCN: s_endpgm
393define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
394  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
395  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
396  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
397  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
398  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
399  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
400  store double %val0, double addrspace(3)* %arrayidx0, align 8
401  %add.x = add nsw i32 %x.i, 8
402  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
403  store double %val1, double addrspace(3)* %arrayidx1, align 8
404  ret void
405}
406
; Four-element i32 array in addrspace(3), written at constant indices by the
; two store_constant_* tests.
407@foo = addrspace(3) global [4 x i32] undef, align 4
408
; Stores the constant 123 to foo[0] and foo[1]. The checks expect the address
; to be materialized once from foo@abs32@lo and the two stores to be emitted
; as one ds_write2_b32 with offset1:1.
409; GCN-LABEL: @store_constant_adjacent_offsets
410; CI-DAG: s_mov_b32 m0
411; GFX9-NOT: m0
412
413; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
414; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
415define amdgpu_kernel void @store_constant_adjacent_offsets() {
416  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
417  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
418  ret void
419}
420
; Stores the constant 123 (0x7b) to foo[0] and foo[2], leaving foo[1]
; untouched. The checks expect one ds_write2_b32 with offset1:2 whose two data
; operands are the same register holding 0x7b.
421; GCN-LABEL: @store_constant_disjoint_offsets
422; CI-DAG: s_mov_b32 m0
423; GFX9-NOT: m0
424
425; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
426; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
427; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
428define amdgpu_kernel void @store_constant_disjoint_offsets() {
429  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
430  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
431  ret void
432}
433
; Four-element i64 array in addrspace(3), declared with only 4-byte alignment.
434@bar = addrspace(3) global [4 x i64] undef, align 4
435
; Stores the i64 constant 123 to bar[0] and bar[1] with align 4. The expected
; output differs per target: two ds_write2_b32 (dwords 0/1 and 2/3) under the
; CI prefix, and a single ds_write_b128 under the GFX9 prefix.
436; GCN-LABEL: @store_misaligned64_constant_offsets
437; CI-DAG: s_mov_b32 m0
438; GFX9-NOT: m0
439
440; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
441; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
442; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
443; GFX9-DAG: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
444
445; GCN: s_endpgm
446define amdgpu_kernel void @store_misaligned64_constant_offsets() {
447  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
448  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
449  ret void
450}
451
; 4096-element i64 array in addrspace(3), declared with only 4-byte alignment.
452@bar.large = addrspace(3) global [4096 x i64] undef, align 4
453
; Stores the i64 constant 123 to bar.large[2048] and bar.large[4095] with
; align 4 (byte offsets 0x4000 and 0x7ff8). The checks expect each address to
; be computed on the scalar side from bar.large@abs32@lo, copied to a VGPR,
; and each 64-bit store emitted as its own ds_write2_b32 with offset1:1.
454; GCN-LABEL: @store_misaligned64_constant_large_offsets
455; CI-DAG: s_mov_b32 m0
456; GFX9-NOT: m0
457
458; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
459; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
460; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
461; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
462; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
463; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
464; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
465; GCN: s_endpgm
466define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
467  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
468  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
469  ret void
470}
471
; Two internal addrspace(3) float arrays written by write2_sgemm_sequence.
472@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
473@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
474
; Stores one loaded float to ten LDS locations: sgemm.lA at x + {0, 1, 16, 17}
; and sgemm.lB at y + {0, 1, 32, 33, 64, 65}. No FileCheck directives are
; attached to this function in the visible file, so it only has to compile
; cleanly under the llc invocations at the top.
475define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
  ; Note that %x.i is a workgroup id while %y.i is a workitem id.
476  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
477  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
478  %val = load float, float addrspace(1)* %in
  ; Stores into sgemm.lA, indexed from %x.i.
479  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
480  store float %val, float addrspace(3)* %arrayidx44, align 4
481  %add47 = add nsw i32 %x.i, 1
482  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
483  store float %val, float addrspace(3)* %arrayidx48, align 4
484  %add51 = add nsw i32 %x.i, 16
485  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
486  store float %val, float addrspace(3)* %arrayidx52, align 4
487  %add55 = add nsw i32 %x.i, 17
488  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
489  store float %val, float addrspace(3)* %arrayidx56, align 4
  ; Stores into sgemm.lB, indexed from %y.i.
490  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
491  store float %val, float addrspace(3)* %arrayidx60, align 4
492  %add63 = add nsw i32 %y.i, 1
493  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
494  store float %val, float addrspace(3)* %arrayidx64, align 4
495  %add67 = add nsw i32 %y.i, 32
496  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
497  store float %val, float addrspace(3)* %arrayidx68, align 4
498  %add71 = add nsw i32 %y.i, 33
499  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
500  store float %val, float addrspace(3)* %arrayidx72, align 4
501  %add75 = add nsw i32 %y.i, 64
502  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
503  store float %val, float addrspace(3)* %arrayidx76, align 4
504  %add79 = add nsw i32 %y.i, 65
505  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
506  store float %val, float addrspace(3)* %arrayidx80, align 4
507  ret void
508}
509
; Stores a whole <4 x float> to an LDS pointer argument with only 4-byte
; alignment. The expected output differs per target: two ds_write2_b32
; (dwords 0/1 and 2/3) under the CI prefix, and one ds_write_b128 under the
; GFX9 prefix.
510; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
511; CI: s_mov_b32 m0
512; GFX9-NOT: m0
513
514; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
515; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
516; GFX9: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
517define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
518  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; GEP with no indices: the load reads directly from %in, not from in[x].
519  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
520  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
521  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
522  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
523  ret void
524}
525
; Intrinsic declarations used by the kernels above.
; NOTE(review): workgroup.id.y has no caller in the visible part of this file.
526declare i32 @llvm.amdgcn.workgroup.id.x() #1
527declare i32 @llvm.amdgcn.workgroup.id.y() #1
528declare i32 @llvm.amdgcn.workitem.id.x() #1
529declare i32 @llvm.amdgcn.workitem.id.y() #1
530
; Attribute groups: #0 is attached to most kernels, #1 to the intrinsic
; declarations and their call sites.
; NOTE(review): #2 is not referenced anywhere in the visible part of this file.
531attributes #0 = { nounwind }
532attributes #1 = { nounwind readnone speculatable }
533attributes #2 = { convergent nounwind }
534