1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
5
; addrspace(3) arrays that the kernels in this file store into:
; 512 floats (align 4) and 512 doubles (align 8). Contents are left undef.
6@lds = addrspace(3) global [512 x float] undef, align 4
7@lds.f64 = addrspace(3) global [512 x double] undef, align 8
8
; Loads one float from %in at the workitem id and stores that same value to
; @lds at element indices id and id+8. The CI and GFX9 assertions both expect
; the two stores to be emitted as a single ds_write2_b32 with offset1:8 and the
; same data register used twice. %C is not referenced by the body.
9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
10; CI-LABEL: simple_write2_one_val_f32:
11; CI:       ; %bb.0:
12; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
13; CI-NEXT:    s_mov_b32 s3, 0xf000
14; CI-NEXT:    s_mov_b32 s2, 0
15; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
16; CI-NEXT:    v_mov_b32_e32 v1, 0
17; CI-NEXT:    s_waitcnt lgkmcnt(0)
18; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19; CI-NEXT:    s_mov_b32 m0, -1
20; CI-NEXT:    s_waitcnt vmcnt(0)
21; CI-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
22; CI-NEXT:    s_endpgm
23;
24; GFX9-LABEL: simple_write2_one_val_f32:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
27; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
28; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
30; GFX9-NEXT:    s_waitcnt vmcnt(0)
31; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
32; GFX9-NEXT:    s_endpgm
33  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
35  %val = load float, float addrspace(1)* %in.gep, align 4
36  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
37  store float %val, float addrspace(3)* %arrayidx0, align 4
38  %add.x = add nsw i32 %x.i, 8
39  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
40  store float %val, float addrspace(3)* %arrayidx1, align 4
41  ret void
42}
43
; Performs two volatile float loads from consecutive elements of %in (at the
; workitem id and the element after it) and stores the first value to @lds at
; index id and the second at index id+8. Both targets are expected to emit one
; ds_write2_b32 with offset1:8 carrying the two distinct data registers.
44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
45; CI-LABEL: simple_write2_two_val_f32:
46; CI:       ; %bb.0:
47; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
48; CI-NEXT:    s_mov_b32 s3, 0xf000
49; CI-NEXT:    s_mov_b32 s2, 0
50; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
51; CI-NEXT:    v_mov_b32_e32 v1, 0
52; CI-NEXT:    s_waitcnt lgkmcnt(0)
53; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54; CI-NEXT:    s_waitcnt vmcnt(0)
55; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56; CI-NEXT:    s_waitcnt vmcnt(0)
57; CI-NEXT:    s_mov_b32 m0, -1
58; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
59; CI-NEXT:    s_endpgm
60;
61; GFX9-LABEL: simple_write2_two_val_f32:
62; GFX9:       ; %bb.0:
63; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
64; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
67; GFX9-NEXT:    s_waitcnt vmcnt(0)
68; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
71; GFX9-NEXT:    s_endpgm
72  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
74  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
75  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
76  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
77  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
78  store float %val0, float addrspace(3)* %arrayidx0, align 4
79  %add.x = add nsw i32 %x.i, 8
80  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
81  store float %val1, float addrspace(3)* %arrayidx1, align 4
82  ret void
83}
84
; Same store pattern as the two-value case (indices id and id+8 of @lds), with
; the values loaded from two separate pointers %in0 and %in1, but here the
; FIRST store is volatile. The assertions expect the stores to stay as two
; separate ds_write_b32 instructions (offset 0 and offset:32) on both targets.
85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
86; CI-LABEL: simple_write2_two_val_f32_volatile_0:
87; CI:       ; %bb.0:
88; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
89; CI-NEXT:    s_mov_b32 s3, 0xf000
90; CI-NEXT:    s_mov_b32 s2, 0
91; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
92; CI-NEXT:    v_mov_b32_e32 v1, 0
93; CI-NEXT:    s_waitcnt lgkmcnt(0)
94; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
95; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
96; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
97; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
98; CI-NEXT:    s_waitcnt vmcnt(0)
99; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
100; CI-NEXT:    s_waitcnt vmcnt(0)
101; CI-NEXT:    s_mov_b32 m0, -1
102; CI-NEXT:    ds_write_b32 v0, v2
103; CI-NEXT:    ds_write_b32 v0, v1 offset:32
104; CI-NEXT:    s_endpgm
105;
106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
107; GFX9:       ; %bb.0:
108; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
109; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
112; GFX9-NEXT:    s_waitcnt vmcnt(0)
113; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:    ds_write_b32 v0, v1
116; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
117; GFX9-NEXT:    s_endpgm
118  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
119  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
120  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
121  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
122  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
123  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
124  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
125  %add.x = add nsw i32 %x.i, 8
126  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
127  store float %val1, float addrspace(3)* %arrayidx1, align 4
128  ret void
129}
130
; Counterpart of the volatile_0 kernel: identical loads and store addresses
; (indices id and id+8 of @lds), but here the SECOND store is the volatile one.
; The assertions again expect two separate ds_write_b32 instructions (offset 0
; and offset:32) rather than a combined ds_write2_b32.
131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
132; CI-LABEL: simple_write2_two_val_f32_volatile_1:
133; CI:       ; %bb.0:
134; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
135; CI-NEXT:    s_mov_b32 s3, 0xf000
136; CI-NEXT:    s_mov_b32 s2, 0
137; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
138; CI-NEXT:    v_mov_b32_e32 v1, 0
139; CI-NEXT:    s_waitcnt lgkmcnt(0)
140; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
141; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
142; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
143; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
144; CI-NEXT:    s_waitcnt vmcnt(0)
145; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
146; CI-NEXT:    s_waitcnt vmcnt(0)
147; CI-NEXT:    s_mov_b32 m0, -1
148; CI-NEXT:    ds_write_b32 v0, v2
149; CI-NEXT:    ds_write_b32 v0, v1 offset:32
150; CI-NEXT:    s_endpgm
151;
152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
155; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
158; GFX9-NEXT:    s_waitcnt vmcnt(0)
159; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
160; GFX9-NEXT:    s_waitcnt vmcnt(0)
161; GFX9-NEXT:    ds_write_b32 v0, v1
162; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
163; GFX9-NEXT:    s_endpgm
164  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
166  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
167  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
168  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
169  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
170  store float %val0, float addrspace(3)* %arrayidx0, align 4
171  %add.x = add nsw i32 %x.i, 8
172  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
173  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
174  ret void
175}
176
177; 2 data subregisters from different super registers.
178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
179;       This should be an s_mov_b32. The v_mov_b32 gets introduced by an
180;       early legalization of the constant bus constraint on the v_lshl_add_u32,
181;       and then SIFoldOperands folds in an unlucky order.
; NOTE(review): the GFX9 assertions currently recorded for the function below
; contain no v_mov_b32 of lds@abs32@lo, so this TODO may be stale - confirm.
; Performs two volatile <2 x float> loads from consecutive vector elements of
; %in, then stores element 0 of the first vector to @lds at index id and
; element 1 of the second vector at index id+8. The assertions expect a single
; ds_write2_b32 with offset1:8 whose two data operands come from different
; loaded register pairs (v3/v2 on CI, v1/v4 on GFX9).
182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
184; CI:       ; %bb.0:
185; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
186; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
187; CI-NEXT:    s_mov_b32 s3, 0xf000
188; CI-NEXT:    s_mov_b32 s2, 0
189; CI-NEXT:    v_mov_b32_e32 v2, 0
190; CI-NEXT:    s_waitcnt lgkmcnt(0)
191; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192; CI-NEXT:    s_waitcnt vmcnt(0)
193; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194; CI-NEXT:    s_waitcnt vmcnt(0)
195; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
196; CI-NEXT:    s_mov_b32 m0, -1
197; CI-NEXT:    ds_write2_b32 v0, v3, v2 offset1:8
198; CI-NEXT:    s_endpgm
199;
200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
201; GFX9:       ; %bb.0:
202; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
203; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
204; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX9-NEXT:    global_load_dwordx2 v[1:2], v5, s[0:1] glc
207; GFX9-NEXT:    s_waitcnt vmcnt(0)
208; GFX9-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 glc
209; GFX9-NEXT:    s_waitcnt vmcnt(0)
210; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
211; GFX9-NEXT:    s_endpgm
212  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
213  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
214  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
215  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
216  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
217  %val0.0 = extractelement <2 x float> %val0, i32 0
218  %val1.1 = extractelement <2 x float> %val1, i32 1
219  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
220  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
221  %add.x = add nsw i32 %x.i, 8
222  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
223  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
224  ret void
225}
226
; Loads a single <2 x float> from %in at the workitem id and stores its element
; 0 to @lds at index id and its element 1 at index id+8. The assertions expect
; one ds_write2_b32 with offset1:8 whose data operands are the two halves (v1
; and v2) of the one loaded register pair.
227define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
228; CI-LABEL: simple_write2_two_val_subreg2_f32:
229; CI:       ; %bb.0:
230; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
231; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
232; CI-NEXT:    s_mov_b32 s3, 0xf000
233; CI-NEXT:    s_mov_b32 s2, 0
234; CI-NEXT:    v_mov_b32_e32 v2, 0
235; CI-NEXT:    s_waitcnt lgkmcnt(0)
236; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
237; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
238; CI-NEXT:    s_mov_b32 m0, -1
239; CI-NEXT:    s_waitcnt vmcnt(0)
240; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
241; CI-NEXT:    s_endpgm
242;
243; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
244; GFX9:       ; %bb.0:
245; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
246; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
247; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX9-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
250; GFX9-NEXT:    s_waitcnt vmcnt(0)
251; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
252; GFX9-NEXT:    s_endpgm
253  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
254  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
255  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
256  %val0 = extractelement <2 x float> %val, i32 0
257  %val1 = extractelement <2 x float> %val, i32 1
258  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
259  store float %val0, float addrspace(3)* %arrayidx0, align 4
260  %add.x = add nsw i32 %x.i, 8
261  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
262  store float %val1, float addrspace(3)* %arrayidx1, align 4
263  ret void
264}
265
; Loads a single <4 x float> from %in at the workitem id and stores its element
; 0 to @lds at index id and its element 3 at index id+8. The assertions expect
; one ds_write2_b32 with offset1:8 using the first and last registers (v1 and
; v4) of the loaded v[1:4] tuple.
266define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
267; CI-LABEL: simple_write2_two_val_subreg4_f32:
268; CI:       ; %bb.0:
269; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
270; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
271; CI-NEXT:    s_mov_b32 s3, 0xf000
272; CI-NEXT:    s_mov_b32 s2, 0
273; CI-NEXT:    v_mov_b32_e32 v2, 0
274; CI-NEXT:    s_waitcnt lgkmcnt(0)
275; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
276; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
277; CI-NEXT:    s_mov_b32 m0, -1
278; CI-NEXT:    s_waitcnt vmcnt(0)
279; CI-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
280; CI-NEXT:    s_endpgm
281;
282; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
283; GFX9:       ; %bb.0:
284; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
285; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
286; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
287; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX9-NEXT:    global_load_dwordx4 v[1:4], v1, s[0:1]
289; GFX9-NEXT:    s_waitcnt vmcnt(0)
290; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
291; GFX9-NEXT:    s_endpgm
292  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
293  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
294  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
295  %val0 = extractelement <4 x float> %val, i32 0
296  %val1 = extractelement <4 x float> %val, i32 3
297  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
298  store float %val0, float addrspace(3)* %arrayidx0, align 4
299  %add.x = add nsw i32 %x.i, 8
300  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
301  store float %val1, float addrspace(3)* %arrayidx1, align 4
302  ret void
303}
304
; Two volatile float loads from consecutive elements of %in; the values are
; stored to @lds at indices id and id+255. The assertions expect the pair to be
; emitted as one ds_write2_b32 with offset1:255 on both targets.
305define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
306; CI-LABEL: simple_write2_two_val_max_offset_f32:
307; CI:       ; %bb.0:
308; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
309; CI-NEXT:    s_mov_b32 s3, 0xf000
310; CI-NEXT:    s_mov_b32 s2, 0
311; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
312; CI-NEXT:    v_mov_b32_e32 v1, 0
313; CI-NEXT:    s_waitcnt lgkmcnt(0)
314; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
315; CI-NEXT:    s_waitcnt vmcnt(0)
316; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
317; CI-NEXT:    s_waitcnt vmcnt(0)
318; CI-NEXT:    s_mov_b32 m0, -1
319; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:255
320; CI-NEXT:    s_endpgm
321;
322; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
323; GFX9:       ; %bb.0:
324; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
325; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
328; GFX9-NEXT:    s_waitcnt vmcnt(0)
329; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
332; GFX9-NEXT:    s_endpgm
333  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
334  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
335  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
336  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
337  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
338  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
339  store float %val0, float addrspace(3)* %arrayidx0, align 4
340  %add.x = add nsw i32 %x.i, 255
341  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
342  store float %val1, float addrspace(3)* %arrayidx1, align 4
343  ret void
344}
345
; Loads one float each from %in0 and %in1 at the workitem id and stores them to
; @lds at indices id and id+257. The assertions expect the stores to remain two
; separate ds_write_b32 instructions, the second with offset:1028 (257 floats
; * 4 bytes), rather than a combined ds_write2_b32.
346define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
347; CI-LABEL: simple_write2_two_val_too_far_f32:
348; CI:       ; %bb.0:
349; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
350; CI-NEXT:    s_mov_b32 s3, 0xf000
351; CI-NEXT:    s_mov_b32 s2, 0
352; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
353; CI-NEXT:    v_mov_b32_e32 v1, 0
354; CI-NEXT:    s_waitcnt lgkmcnt(0)
355; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
356; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
357; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
358; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
359; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
360; CI-NEXT:    s_mov_b32 m0, -1
361; CI-NEXT:    s_waitcnt vmcnt(1)
362; CI-NEXT:    ds_write_b32 v0, v2
363; CI-NEXT:    s_waitcnt vmcnt(0)
364; CI-NEXT:    ds_write_b32 v0, v1 offset:1028
365; CI-NEXT:    s_endpgm
366;
367; GFX9-LABEL: simple_write2_two_val_too_far_f32:
368; GFX9:       ; %bb.0:
369; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
370; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
371; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
373; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
374; GFX9-NEXT:    s_waitcnt vmcnt(1)
375; GFX9-NEXT:    ds_write_b32 v0, v1
376; GFX9-NEXT:    s_waitcnt vmcnt(0)
377; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
378; GFX9-NEXT:    s_endpgm
379  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
380  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
381  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
382  %val0 = load float, float addrspace(1)* %in0.gep, align 4
383  %val1 = load float, float addrspace(1)* %in1.gep, align 4
384  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
385  store float %val0, float addrspace(3)* %arrayidx0, align 4
386  %add.x = add nsw i32 %x.i, 257
387  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
388  store float %val1, float addrspace(3)* %arrayidx1, align 4
389  ret void
390}
391
; Loads one float each from %in0 and %in1 and issues four stores to @lds at
; indices id+0, id+8, id+11 and id+27, alternating the two loaded values. The
; assertions expect two ds_write2_b32 instructions: one with offset1:8 and one
; with offset0:11 offset1:27.
392define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
393; CI-LABEL: simple_write2_two_val_f32_x2:
394; CI:       ; %bb.0:
395; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
396; CI-NEXT:    s_mov_b32 s3, 0xf000
397; CI-NEXT:    s_mov_b32 s2, 0
398; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
399; CI-NEXT:    v_mov_b32_e32 v1, 0
400; CI-NEXT:    s_waitcnt lgkmcnt(0)
401; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
402; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
403; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
404; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
405; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
406; CI-NEXT:    s_mov_b32 m0, -1
407; CI-NEXT:    s_waitcnt vmcnt(0)
408; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
409; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
410; CI-NEXT:    s_endpgm
411;
412; GFX9-LABEL: simple_write2_two_val_f32_x2:
413; GFX9:       ; %bb.0:
414; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
415; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
418; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
419; GFX9-NEXT:    s_waitcnt vmcnt(0)
420; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
421; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
422; GFX9-NEXT:    s_endpgm
423  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
424  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
425  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
426  %val0 = load float, float addrspace(1)* %in0.gep, align 4
427  %val1 = load float, float addrspace(1)* %in1.gep, align 4
428
429  %idx.0 = add nsw i32 %tid.x, 0
430  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
431  store float %val0, float addrspace(3)* %arrayidx0, align 4
432
433  %idx.1 = add nsw i32 %tid.x, 8
434  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
435  store float %val1, float addrspace(3)* %arrayidx1, align 4
436
437  %idx.2 = add nsw i32 %tid.x, 11
438  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
439  store float %val0, float addrspace(3)* %arrayidx2, align 4
440
441  %idx.3 = add nsw i32 %tid.x, 27
442  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
443  store float %val1, float addrspace(3)* %arrayidx3, align 4
444
445  ret void
446}
447
; Variant of the x2 kernel in which the first store index is id+3 instead of
; id+0; the remaining stores go to id+8, id+11 and id+27. The assertions expect
; two ds_write2_b32 instructions: offset0:3 offset1:8 and offset0:11
; offset1:27.
448define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
449; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
450; CI:       ; %bb.0:
451; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
452; CI-NEXT:    s_mov_b32 s3, 0xf000
453; CI-NEXT:    s_mov_b32 s2, 0
454; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
455; CI-NEXT:    v_mov_b32_e32 v1, 0
456; CI-NEXT:    s_waitcnt lgkmcnt(0)
457; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
458; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
459; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
460; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
461; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
462; CI-NEXT:    s_mov_b32 m0, -1
463; CI-NEXT:    s_waitcnt vmcnt(0)
464; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
465; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
466; CI-NEXT:    s_endpgm
467;
468; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
469; GFX9:       ; %bb.0:
470; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
471; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
472; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
474; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
475; GFX9-NEXT:    s_waitcnt vmcnt(0)
476; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
477; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
478; GFX9-NEXT:    s_endpgm
479  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
480  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
481  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
482  %val0 = load float, float addrspace(1)* %in0.gep, align 4
483  %val1 = load float, float addrspace(1)* %in1.gep, align 4
484
485  %idx.0 = add nsw i32 %tid.x, 3
486  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
487  store float %val0, float addrspace(3)* %arrayidx0, align 4
488
489  %idx.1 = add nsw i32 %tid.x, 8
490  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
491  store float %val1, float addrspace(3)* %arrayidx1, align 4
492
493  %idx.2 = add nsw i32 %tid.x, 11
494  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
495  store float %val0, float addrspace(3)* %arrayidx2, align 4
496
497  %idx.3 = add nsw i32 %tid.x, 27
498  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
499  store float %val1, float addrspace(3)* %arrayidx3, align 4
500
501  ret void
502}
503
; Stores two loaded floats through the two lanes of a <2 x float
; addrspace(3)*> kernel argument. The store addresses come from a vector
; getelementptr on %lds.ptr; lane 1 gets a further scalar offset of 8 floats.
; The assertions expect two separate ds_write_b32 instructions, each with
; offset:32, using two different address registers.
;
; The dead "%add.x = add nsw i32 %x.i, 8" that used to sit between the two
; stores had no users and has been dropped; it did not contribute to codegen.
504define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
505; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
506; CI:       ; %bb.0:
507; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
508; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
509; CI-NEXT:    s_mov_b32 s3, 0xf000
510; CI-NEXT:    s_mov_b32 s2, 0
511; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
512; CI-NEXT:    s_waitcnt lgkmcnt(0)
513; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
514; CI-NEXT:    v_mov_b32_e32 v1, 0
515; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
516; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
517; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
518; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
519; CI-NEXT:    v_mov_b32_e32 v1, s8
520; CI-NEXT:    s_mov_b32 m0, -1
521; CI-NEXT:    v_mov_b32_e32 v3, s9
522; CI-NEXT:    s_waitcnt vmcnt(1)
523; CI-NEXT:    ds_write_b32 v1, v2 offset:32
524; CI-NEXT:    s_waitcnt vmcnt(0)
525; CI-NEXT:    ds_write_b32 v3, v0 offset:32
526; CI-NEXT:    s_endpgm
527;
528; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
529; GFX9:       ; %bb.0:
530; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
531; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
532; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
535; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
536; GFX9-NEXT:    v_mov_b32_e32 v0, s2
537; GFX9-NEXT:    v_mov_b32_e32 v3, s3
538; GFX9-NEXT:    s_waitcnt vmcnt(1)
539; GFX9-NEXT:    ds_write_b32 v0, v1 offset:32
540; GFX9-NEXT:    s_waitcnt vmcnt(0)
541; GFX9-NEXT:    ds_write_b32 v3, v2 offset:32
542; GFX9-NEXT:    s_endpgm
543  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
544  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
545  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
546  %val0 = load float, float addrspace(1)* %in0.gep, align 4
547  %val1 = load float, float addrspace(1)* %in1.gep, align 4
548
  ; NOTE(review): both insertelement instructions below write lane 0, so the
  ; %x.i inserted first is overwritten by 8 and lane 1 of the index vector
  ; stays undef. Lane 1 may have been intended for the second insert; changing
  ; it alters codegen and needs the assertions regenerated - confirm intent.
549  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
550  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
551  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
552  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
553  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
554
555  ; Apply an additional offset after the vector that will be more obviously folded.
556  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
557  store float %val0, float addrspace(3)* %gep.0, align 4
558
560  store float %val1, float addrspace(3)* %gep.1.offset, align 4
561  ret void
562}
563
; Double-precision counterpart of the one-value float case: loads one double
; from %in at the workitem id and stores it to @lds.f64 at indices id and id+8,
; both stores with align 8. The assertions expect a single ds_write2_b64 with
; offset1:8 and the same 64-bit data register pair used for both operands.
564define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
565; CI-LABEL: simple_write2_one_val_f64:
566; CI:       ; %bb.0:
567; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
568; CI-NEXT:    s_mov_b32 s3, 0xf000
569; CI-NEXT:    s_mov_b32 s2, 0
570; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
571; CI-NEXT:    v_mov_b32_e32 v1, 0
572; CI-NEXT:    s_waitcnt lgkmcnt(0)
573; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
574; CI-NEXT:    s_mov_b32 m0, -1
575; CI-NEXT:    s_waitcnt vmcnt(0)
576; CI-NEXT:    ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
577; CI-NEXT:    s_endpgm
578;
579; GFX9-LABEL: simple_write2_one_val_f64:
580; GFX9:       ; %bb.0:
581; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
582; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
583; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
585; GFX9-NEXT:    s_waitcnt vmcnt(0)
586; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
587; GFX9-NEXT:    s_endpgm
588  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
589  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
590  %val = load double, double addrspace(1)* %in.gep, align 8
591  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
592  store double %val, double addrspace(3)* %arrayidx0, align 8
593  %add.x = add nsw i32 %x.i, 8
594  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
595  store double %val, double addrspace(3)* %arrayidx1, align 8
596  ret void
597}
598
; Loads one double from %in and stores it twice through the %lds pointer
; argument, at double indices id and id+7, with both stores marked align 4
; only. The assertions expect each double store to be emitted as a
; ds_write2_b32 of its two 32-bit halves: offset1:1 for the first store and
; offset0:14 offset1:15 for the second (7 doubles = 14 dwords).
599define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
600; CI-LABEL: misaligned_simple_write2_one_val_f64:
601; CI:       ; %bb.0:
602; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
603; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
604; CI-NEXT:    s_mov_b32 s7, 0xf000
605; CI-NEXT:    s_mov_b32 s6, 0
606; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
607; CI-NEXT:    v_mov_b32_e32 v1, 0
608; CI-NEXT:    s_waitcnt lgkmcnt(0)
609; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
610; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
611; CI-NEXT:    s_mov_b32 m0, -1
612; CI-NEXT:    s_waitcnt vmcnt(0)
613; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
614; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
615; CI-NEXT:    s_endpgm
616;
617; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
618; GFX9:       ; %bb.0:
619; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
620; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
621; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
622; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
624; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
625; GFX9-NEXT:    s_waitcnt vmcnt(0)
626; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
627; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
628; GFX9-NEXT:    s_endpgm
629  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
630  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
631  %val = load double, double addrspace(1)* %in.gep, align 8
632  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
633  store double %val, double addrspace(3)* %arrayidx0, align 4
634  %add.x = add nsw i32 %x.i, 7
635  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
636  store double %val, double addrspace(3)* %arrayidx1, align 4
637  ret void
638}
639
; Stores one loaded double twice into LDS, at byte offsets 5 and 9 from the
; workitem's element of %lds, both marked align 1. The two 8-byte stores
; overlap in bytes 9..12.
; Expected lowering per the checks below:
;   CI and GFX9-ALIGNED   - every store is split into single-byte ds_write_b8
;                           (plus ds_write_b8_d16_hi on GFX9).
;   GFX9-UNALIGNED        - each store becomes one ds_write2_b32 offset1:1 on
;                           its own base address (base+5 and base+9).
640define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
641; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
642; CI:       ; %bb.0:
643; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
644; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
645; CI-NEXT:    s_mov_b32 s7, 0xf000
646; CI-NEXT:    s_mov_b32 s6, 0
647; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
648; CI-NEXT:    v_mov_b32_e32 v1, 0
649; CI-NEXT:    s_waitcnt lgkmcnt(0)
650; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
651; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
652; CI-NEXT:    s_mov_b32 m0, -1
653; CI-NEXT:    s_waitcnt vmcnt(0)
654; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
655; CI-NEXT:    ds_write_b8 v0, v1 offset:5
656; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
657; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
658; CI-NEXT:    ds_write_b8 v0, v2 offset:13
659; CI-NEXT:    ds_write_b8 v0, v1 offset:9
660; CI-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
661; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
662; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
663; CI-NEXT:    ds_write_b8 v0, v3 offset:8
664; CI-NEXT:    ds_write_b8 v0, v4 offset:7
665; CI-NEXT:    ds_write_b8 v0, v5 offset:6
666; CI-NEXT:    ds_write_b8 v0, v1 offset:16
667; CI-NEXT:    ds_write_b8 v0, v6 offset:15
668; CI-NEXT:    ds_write_b8 v0, v2 offset:14
669; CI-NEXT:    ds_write_b8 v0, v3 offset:12
670; CI-NEXT:    ds_write_b8 v0, v4 offset:11
671; CI-NEXT:    ds_write_b8 v0, v5 offset:10
672; CI-NEXT:    s_endpgm
673;
674; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
675; GFX9-ALIGNED:       ; %bb.0:
676; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
677; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x34
678; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
679; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX9-ALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
681; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
682; GFX9-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
683; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
684; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:7
685; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:5
686; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
687; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v1 offset:15
688; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:13
689; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:11
690; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:9
691; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
692; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
693; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:8
694; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:6
695; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:16
696; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:14
697; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:12
698; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:10
699; GFX9-ALIGNED-NEXT:    s_endpgm
700;
701; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
702; GFX9-UNALIGNED:       ; %bb.0:
703; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
704; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x34
705; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
706; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
708; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
709; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
710; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
711; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
712; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
713; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
714; GFX9-UNALIGNED-NEXT:    s_endpgm
715  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
716  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
717  %val = load double, double addrspace(1)* %in.gep, align 8
718  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
719  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
720  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
721  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
722  store double %val, double addrspace(3)* %addr0, align 1
723  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
724  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
725  store double %val, double addrspace(3)* %addr1, align 1
726  ret void
727}
728
; Loads two different doubles (volatile loads of in[x] and in[x+1]) and stores
; them to @lds.f64 at element indices x and x+8, both align 8.
; Expected lowering on CI and GFX9: the two stores merge into a single
; ds_write2_b64 with offset1:8 (offsets are in 8-byte element units here,
; matching the +8 element distance in the IR).
729define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
730; CI-LABEL: simple_write2_two_val_f64:
731; CI:       ; %bb.0:
732; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
733; CI-NEXT:    s_mov_b32 s3, 0xf000
734; CI-NEXT:    s_mov_b32 s2, 0
735; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
736; CI-NEXT:    v_mov_b32_e32 v1, 0
737; CI-NEXT:    s_waitcnt lgkmcnt(0)
738; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
739; CI-NEXT:    s_waitcnt vmcnt(0)
740; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
741; CI-NEXT:    s_waitcnt vmcnt(0)
742; CI-NEXT:    s_mov_b32 m0, -1
743; CI-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
744; CI-NEXT:    s_endpgm
745;
746; GFX9-LABEL: simple_write2_two_val_f64:
747; GFX9:       ; %bb.0:
748; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
749; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
750; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] glc
752; GFX9-NEXT:    s_waitcnt vmcnt(0)
753; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
754; GFX9-NEXT:    s_waitcnt vmcnt(0)
755; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
756; GFX9-NEXT:    s_endpgm
757  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
758  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
759  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
760  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
761  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
762  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
763  store double %val0, double addrspace(3)* %arrayidx0, align 8
764  %add.x = add nsw i32 %x.i, 8
765  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
766  store double %val1, double addrspace(3)* %arrayidx1, align 8
767  ret void
768}
769
; Four-element i32 array in addrspace(3), align 4; target of the
; store_constant_* kernels that write the constant 123 at fixed indices.
770@foo = addrspace(3) global [4 x i32] undef, align 4
771
; Stores the constant 123 to @foo[0] and @foo[1] (adjacent dwords, align 4).
; Expected lowering on CI and GFX9: one ds_write2_b32 with offset1:1.
; CI additionally initializes m0 to -1 before the DS instruction.
772define amdgpu_kernel void @store_constant_adjacent_offsets() {
773; CI-LABEL: store_constant_adjacent_offsets:
774; CI:       ; %bb.0:
775; CI-NEXT:    s_movk_i32 s0, 0x7b
776; CI-NEXT:    v_mov_b32_e32 v0, 0
777; CI-NEXT:    v_mov_b32_e32 v1, s0
778; CI-NEXT:    v_mov_b32_e32 v2, s0
779; CI-NEXT:    s_mov_b32 m0, -1
780; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
781; CI-NEXT:    s_endpgm
782;
783; GFX9-LABEL: store_constant_adjacent_offsets:
784; GFX9:       ; %bb.0:
785; GFX9-NEXT:    s_movk_i32 s0, 0x7b
786; GFX9-NEXT:    v_mov_b32_e32 v0, 0
787; GFX9-NEXT:    v_mov_b32_e32 v1, s0
788; GFX9-NEXT:    v_mov_b32_e32 v2, s0
789; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
790; GFX9-NEXT:    s_endpgm
791  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
792  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
793  ret void
794}
795
; Stores the constant 123 to @foo[0] and @foo[2] (non-adjacent dwords, align 4).
; Expected lowering on CI and GFX9: still a single ds_write2_b32, with
; offset1:2, reusing one register for both data operands.
796define amdgpu_kernel void @store_constant_disjoint_offsets() {
797; CI-LABEL: store_constant_disjoint_offsets:
798; CI:       ; %bb.0:
799; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
800; CI-NEXT:    v_mov_b32_e32 v1, 0
801; CI-NEXT:    s_mov_b32 m0, -1
802; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
803; CI-NEXT:    s_endpgm
804;
805; GFX9-LABEL: store_constant_disjoint_offsets:
806; GFX9:       ; %bb.0:
807; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
808; GFX9-NEXT:    v_mov_b32_e32 v1, 0
809; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
810; GFX9-NEXT:    s_endpgm
811  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
812  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
813  ret void
814}
815
; Four-element i64 array in addrspace(3) that is only 4-byte aligned, i.e.
; under-aligned for its 8-byte element type.
816@bar = addrspace(3) global [4 x i64] undef, align 4
817
; Stores the i64 constant 123 to @bar[0] and @bar[1], both with align 4.
; Expected lowering per the checks below:
;   CI and GFX9-ALIGNED - each i64 is written as a dword pair with
;                         ds_write2_b32 (offsets 0/1 and 2/3).
;   GFX9-UNALIGNED      - both stores are combined into one ds_write_b128.
818define amdgpu_kernel void @store_misaligned64_constant_offsets() {
819; CI-LABEL: store_misaligned64_constant_offsets:
820; CI:       ; %bb.0:
821; CI-NEXT:    v_mov_b32_e32 v0, 0
822; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
823; CI-NEXT:    s_mov_b32 m0, -1
824; CI-NEXT:    ds_write2_b32 v0, v1, v0 offset1:1
825; CI-NEXT:    ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
826; CI-NEXT:    s_endpgm
827;
828; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
829; GFX9-ALIGNED:       ; %bb.0:
830; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0
831; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
832; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v0 offset1:1
833; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
834; GFX9-ALIGNED-NEXT:    s_endpgm
835;
836; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
837; GFX9-UNALIGNED:       ; %bb.0:
838; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
839; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0
840; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, v0
841; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, v1
842; GFX9-UNALIGNED-NEXT:    ds_write_b128 v1, v[0:3]
843; GFX9-UNALIGNED-NEXT:    s_endpgm
844  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
845  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
846  ret void
847}
848
; 4096-element i64 array in addrspace(3), align 4 (32 KiB); large enough that
; element indices used by the test below map to byte offsets 0x4000 and 0x7ff8.
849@bar.large = addrspace(3) global [4096 x i64] undef, align 4
850
; Stores the i64 constant 123 to @bar.large[2048] and @bar.large[4095], both
; with align 4 (byte addresses 0x4000 and 0x7ff8).
; Expected lowering on CI and GFX9: the address is materialized in a VGPR for
; each store instead of being encoded in the instruction offset fields, and
; each i64 is written as a dword pair with ds_write2_b32 offset1:1.
851define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
852; CI-LABEL: store_misaligned64_constant_large_offsets:
853; CI:       ; %bb.0:
854; CI-NEXT:    v_mov_b32_e32 v0, 0x4000
855; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
856; CI-NEXT:    v_mov_b32_e32 v2, 0
857; CI-NEXT:    s_mov_b32 m0, -1
858; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
859; CI-NEXT:    v_mov_b32_e32 v0, 0x7ff8
860; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
861; CI-NEXT:    s_endpgm
862;
863; GFX9-LABEL: store_misaligned64_constant_large_offsets:
864; GFX9:       ; %bb.0:
865; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4000
866; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
867; GFX9-NEXT:    v_mov_b32_e32 v2, 0
868; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
869; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7ff8
870; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
871; GFX9-NEXT:    s_endpgm
872  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
873  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
874  ret void
875}
876
; Two internal float arrays in addrspace(3) (264 and 776 elements, align 4)
; written by @write2_sgemm_sequence.
877@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
878@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
879
; Stores one float loaded from %in to ten LDS locations:
;   @sgemm.lA at index wg + {0, 1, 16, 17}, where wg is the workgroup id x
;             (note that %x.i is the *workgroup* id in this kernel);
;   @sgemm.lB at index y  + {0, 1, 32, 33, 64, 65}, where y is workitem id y.
; Expected lowering on CI and GFX9: every adjacent pair of stores becomes one
; ds_write2_b32, five in total. The @sgemm.lA pairs use separately computed
; scalar bases (+0xc20 and +0xc60); the @sgemm.lB pairs share one base register
; and differ only in offset0/offset1.
880define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
881; CI-LABEL: write2_sgemm_sequence:
882; CI:       ; %bb.0:
883; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
884; CI-NEXT:    s_lshl_b32 s2, s2, 2
885; CI-NEXT:    s_add_i32 s3, s2, 0xc20
886; CI-NEXT:    v_mov_b32_e32 v0, s3
887; CI-NEXT:    s_addk_i32 s2, 0xc60
888; CI-NEXT:    s_waitcnt lgkmcnt(0)
889; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
890; CI-NEXT:    s_mov_b32 m0, -1
891; CI-NEXT:    s_waitcnt lgkmcnt(0)
892; CI-NEXT:    v_mov_b32_e32 v2, s0
893; CI-NEXT:    v_mov_b32_e32 v3, s0
894; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
895; CI-NEXT:    v_mov_b32_e32 v0, s2
896; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
897; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
898; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
899; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
900; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
901; CI-NEXT:    s_endpgm
902;
903; GFX9-LABEL: write2_sgemm_sequence:
904; GFX9:       ; %bb.0:
905; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
906; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
907; GFX9-NEXT:    s_add_i32 s3, s2, 0xc20
908; GFX9-NEXT:    s_addk_i32 s2, 0xc60
909; GFX9-NEXT:    v_mov_b32_e32 v0, s3
910; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
911; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
912; GFX9-NEXT:    v_mov_b32_e32 v2, s2
913; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX9-NEXT:    v_mov_b32_e32 v3, s0
915; GFX9-NEXT:    v_mov_b32_e32 v4, s0
916; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
917; GFX9-NEXT:    ds_write2_b32 v2, v3, v4 offset1:1
918; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
919; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
920; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
921; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
922; GFX9-NEXT:    s_endpgm
923  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
924  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
925  %val = load float, float addrspace(1)* %in
926  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
927  store float %val, float addrspace(3)* %arrayidx44, align 4
928  %add47 = add nsw i32 %x.i, 1
929  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
930  store float %val, float addrspace(3)* %arrayidx48, align 4
931  %add51 = add nsw i32 %x.i, 16
932  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
933  store float %val, float addrspace(3)* %arrayidx52, align 4
934  %add55 = add nsw i32 %x.i, 17
935  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
936  store float %val, float addrspace(3)* %arrayidx56, align 4
937  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
938  store float %val, float addrspace(3)* %arrayidx60, align 4
939  %add63 = add nsw i32 %y.i, 1
940  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
941  store float %val, float addrspace(3)* %arrayidx64, align 4
942  %add67 = add nsw i32 %y.i, 32
943  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
944  store float %val, float addrspace(3)* %arrayidx68, align 4
945  %add71 = add nsw i32 %y.i, 33
946  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
947  store float %val, float addrspace(3)* %arrayidx72, align 4
948  %add75 = add nsw i32 %y.i, 64
949  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
950  store float %val, float addrspace(3)* %arrayidx76, align 4
951  %add79 = add nsw i32 %y.i, 65
952  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
953  store float %val, float addrspace(3)* %arrayidx80, align 4
954  ret void
955}
956
; Loads a <4 x float> from %in and stores it to %out[x] in LDS with align 4
; (under-aligned for a 16-byte vector).
; Expected lowering per the checks below:
;   CI and GFX9-ALIGNED - the vector is written as two ds_write2_b32
;                         (offsets 0/1 and 2/3).
;   GFX9-UNALIGNED      - the vector is written with a single ds_write_b128.
957define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
958; CI-LABEL: simple_write2_v4f32_superreg_align4:
959; CI:       ; %bb.0:
960; CI-NEXT:    s_load_dword s4, s[0:1], 0x9
961; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
962; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
963; CI-NEXT:    s_mov_b32 m0, -1
964; CI-NEXT:    s_waitcnt lgkmcnt(0)
965; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
966; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
967; CI-NEXT:    s_waitcnt lgkmcnt(0)
968; CI-NEXT:    v_mov_b32_e32 v1, s0
969; CI-NEXT:    v_mov_b32_e32 v2, s1
970; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
971; CI-NEXT:    v_mov_b32_e32 v3, s2
972; CI-NEXT:    v_mov_b32_e32 v1, s3
973; CI-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
974; CI-NEXT:    s_endpgm
975;
976; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
977; GFX9-ALIGNED:       ; %bb.0:
978; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
979; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
980; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
981; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
982; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
983; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
985; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
986; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s2
987; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v4, s3
988; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
989; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
990; GFX9-ALIGNED-NEXT:    s_endpgm
991;
992; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
993; GFX9-UNALIGNED:       ; %bb.0:
994; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
995; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
996; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
997; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v4, v0, 4, s4
998; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
999; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
1001; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
1002; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
1003; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
1004; GFX9-UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
1005; GFX9-UNALIGNED-NEXT:    s_endpgm
1006  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
1007  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
1008  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
1009  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
1010  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
1011  ret void
1012}
1013
; Internal array of 100 <2 x i32> in addrspace(3) with only byte alignment
; (align 1); written at an odd byte offset by the test below.
1014@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1015
; Stores the constant vector <i32 123, i32 456> at byte offset 65 (odd) into
; @v2i32_align1 with align 1.
; Expected lowering per the checks below:
;   CI and GFX9-ALIGNED - eight ds_write_b8 covering bytes 65..72; the
;                         non-zero bytes are 0x7b (123) and 0xc8, 0x01 (456).
;   GFX9-UNALIGNED      - one ds_write2_b32 offset1:1 based at address 0x41
;                         with data 0x7b and 0x1c8.
1016define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1017; CI-LABEL: write2_v2i32_align1_odd_offset:
1018; CI:       ; %bb.0: ; %entry
1019; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
1020; CI-NEXT:    v_mov_b32_e32 v1, 0
1021; CI-NEXT:    s_mov_b32 m0, -1
1022; CI-NEXT:    ds_write_b8 v1, v0 offset:65
1023; CI-NEXT:    v_mov_b32_e32 v0, 1
1024; CI-NEXT:    ds_write_b8 v1, v0 offset:70
1025; CI-NEXT:    v_mov_b32_e32 v0, 0xc8
1026; CI-NEXT:    ds_write_b8 v1, v0 offset:69
1027; CI-NEXT:    ds_write_b8 v1, v1 offset:68
1028; CI-NEXT:    ds_write_b8 v1, v1 offset:67
1029; CI-NEXT:    ds_write_b8 v1, v1 offset:66
1030; CI-NEXT:    ds_write_b8 v1, v1 offset:72
1031; CI-NEXT:    ds_write_b8 v1, v1 offset:71
1032; CI-NEXT:    s_endpgm
1033;
1034; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1035; GFX9-ALIGNED:       ; %bb.0: ; %entry
1036; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
1037; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
1038; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:65
1039; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 1
1040; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:70
1041; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0xc8
1042; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:69
1043; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:68
1044; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:67
1045; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:66
1046; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:72
1047; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:71
1048; GFX9-ALIGNED-NEXT:    s_endpgm
1049;
1050; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1051; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1052; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
1053; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
1054; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x1c8
1055; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
1056; GFX9-UNALIGNED-NEXT:    s_endpgm
1057entry:
1058  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
1059  ret void
1060}
1061
; Declarations of the AMDGPU workgroup-id and workitem-id intrinsics (x and y
; components), each returning i32 and carrying attribute group #1.
1062declare i32 @llvm.amdgcn.workgroup.id.x() #1
1063declare i32 @llvm.amdgcn.workgroup.id.y() #1
1064declare i32 @llvm.amdgcn.workitem.id.x() #1
1065declare i32 @llvm.amdgcn.workitem.id.y() #1
1066
; Attribute groups: #0 is attached to kernel definitions, #1 to the intrinsic
; declarations and their call sites.
; NOTE(review): #2 is not referenced in the part of the file reviewed here;
; confirm whether an earlier kernel uses it before removing.
1067attributes #0 = { nounwind }
1068attributes #1 = { nounwind readnone speculatable }
1069attributes #2 = { convergent nounwind }
1070