1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
5
; addrspace(3) arrays that the kernels in this file store into:
; @lds holds 512 floats (align 4), @lds.f64 holds 512 doubles (align 8).
6@lds = addrspace(3) global [512 x float] undef, align 4
7@lds.f64 = addrspace(3) global [512 x double] undef, align 8
8
; Loads one float from %in[tid] and stores that same value to @lds[tid] and
; @lds[tid + 8]. The checks expect both stores to be emitted as a single
; ds_write2_b32 with offset1:8.
9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
10; CI-LABEL: simple_write2_one_val_f32:
11; CI:       ; %bb.0:
12; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
13; CI-NEXT:    s_mov_b32 s3, 0xf000
14; CI-NEXT:    s_mov_b32 s2, 0
15; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
16; CI-NEXT:    v_mov_b32_e32 v1, 0
17; CI-NEXT:    s_waitcnt lgkmcnt(0)
18; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19; CI-NEXT:    s_mov_b32 m0, -1
20; CI-NEXT:    s_waitcnt vmcnt(0)
21; CI-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
22; CI-NEXT:    s_endpgm
23;
24; GFX9-LABEL: simple_write2_one_val_f32:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
27; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
28; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
30; GFX9-NEXT:    s_waitcnt vmcnt(0)
31; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
32; GFX9-NEXT:    s_endpgm
33  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
35  %val = load float, float addrspace(1)* %in.gep, align 4
36  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
37  store float %val, float addrspace(3)* %arrayidx0, align 4
38  %add.x = add nsw i32 %x.i, 8
39  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
40  store float %val, float addrspace(3)* %arrayidx1, align 4
41  ret void
42}
43
; Loads two adjacent floats (%in[tid] and %in[tid + 1]) with volatile loads and
; stores them to @lds[tid] and @lds[tid + 8]. The LDS stores themselves are not
; volatile; the checks expect one ds_write2_b32 with offset1:8.
44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
45; CI-LABEL: simple_write2_two_val_f32:
46; CI:       ; %bb.0:
47; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
48; CI-NEXT:    s_mov_b32 s3, 0xf000
49; CI-NEXT:    s_mov_b32 s2, 0
50; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
51; CI-NEXT:    v_mov_b32_e32 v1, 0
52; CI-NEXT:    s_waitcnt lgkmcnt(0)
53; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
54; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
55; CI-NEXT:    s_mov_b32 m0, -1
56; CI-NEXT:    s_waitcnt vmcnt(0)
57; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
58; CI-NEXT:    s_endpgm
59;
60; GFX9-LABEL: simple_write2_two_val_f32:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
63; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
64; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
65; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
66; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4
67; GFX9-NEXT:    s_waitcnt vmcnt(0)
68; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
69; GFX9-NEXT:    s_endpgm
70  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
71  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
72  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
73  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
74  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
75  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
76  store float %val0, float addrspace(3)* %arrayidx0, align 4
77  %add.x = add nsw i32 %x.i, 8
78  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
79  store float %val1, float addrspace(3)* %arrayidx1, align 4
80  ret void
81}
82
; Same store pattern as the two-value case (@lds[tid] and @lds[tid + 8]), but the
; FIRST LDS store is volatile. The checks expect two separate ds_write_b32
; instructions (the second with offset:32) rather than a ds_write2_b32.
83define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
84; CI-LABEL: simple_write2_two_val_f32_volatile_0:
85; CI:       ; %bb.0:
86; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
87; CI-NEXT:    s_mov_b32 s3, 0xf000
88; CI-NEXT:    s_mov_b32 s2, 0
89; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
90; CI-NEXT:    v_mov_b32_e32 v1, 0
91; CI-NEXT:    s_waitcnt lgkmcnt(0)
92; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
93; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
94; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
95; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
96; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
97; CI-NEXT:    s_mov_b32 m0, -1
98; CI-NEXT:    s_waitcnt vmcnt(1)
99; CI-NEXT:    ds_write_b32 v0, v2
100; CI-NEXT:    s_waitcnt vmcnt(0)
101; CI-NEXT:    ds_write_b32 v0, v1 offset:32
102; CI-NEXT:    s_endpgm
103;
104; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
105; GFX9:       ; %bb.0:
106; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
107; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
110; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
111; GFX9-NEXT:    s_waitcnt vmcnt(1)
112; GFX9-NEXT:    ds_write_b32 v0, v1
113; GFX9-NEXT:    s_waitcnt vmcnt(0)
114; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
115; GFX9-NEXT:    s_endpgm
116  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
117  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
118  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
119  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
120  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
121  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
122  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
123  %add.x = add nsw i32 %x.i, 8
124  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
125  store float %val1, float addrspace(3)* %arrayidx1, align 4
126  ret void
127}
128
; Mirror of the volatile_0 case: here the SECOND LDS store (to @lds[tid + 8]) is
; the volatile one. The checks again expect two separate ds_write_b32
; instructions instead of a ds_write2_b32.
129define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
130; CI-LABEL: simple_write2_two_val_f32_volatile_1:
131; CI:       ; %bb.0:
132; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
133; CI-NEXT:    s_mov_b32 s3, 0xf000
134; CI-NEXT:    s_mov_b32 s2, 0
135; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
136; CI-NEXT:    v_mov_b32_e32 v1, 0
137; CI-NEXT:    s_waitcnt lgkmcnt(0)
138; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
139; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
140; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
141; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
142; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
143; CI-NEXT:    s_mov_b32 m0, -1
144; CI-NEXT:    s_waitcnt vmcnt(1)
145; CI-NEXT:    ds_write_b32 v0, v2
146; CI-NEXT:    s_waitcnt vmcnt(0)
147; CI-NEXT:    ds_write_b32 v0, v1 offset:32
148; CI-NEXT:    s_endpgm
149;
150; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
151; GFX9:       ; %bb.0:
152; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
153; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
154; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
156; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
157; GFX9-NEXT:    s_waitcnt vmcnt(1)
158; GFX9-NEXT:    ds_write_b32 v0, v1
159; GFX9-NEXT:    s_waitcnt vmcnt(0)
160; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
161; GFX9-NEXT:    s_endpgm
162  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
163  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
164  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
165  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
166  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
167  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
168  store float %val0, float addrspace(3)* %arrayidx0, align 4
169  %add.x = add nsw i32 %x.i, 8
170  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
171  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
172  ret void
173}
174
175; 2 data subregisters from different super registers.
176; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
177;       This should be an s_mov_b32. The v_mov_b32 gets introduced by an
178;       early legalization of the constant bus constraint on the v_lshl_add_u32,
179;       and then SIFoldOperands folds in an unlucky order.
; Stores element 0 of one <2 x float> load and element 1 of a second, adjacent
; <2 x float> load to @lds[tid] and @lds[tid + 8]. The two stored values come
; from different vector loads; the checks expect one ds_write2_b32 offset1:8.
; NOTE(review): the TODO preceding this function mentions
; "v_mov_b32_e32 v2, lds@abs32@lo" on GFX9, but the GFX9 checks in this function
; contain no such instruction -- confirm whether that TODO is stale.
180define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
181; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
182; CI:       ; %bb.0:
183; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
184; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
185; CI-NEXT:    s_mov_b32 s3, 0xf000
186; CI-NEXT:    s_mov_b32 s2, 0
187; CI-NEXT:    v_mov_b32_e32 v2, 0
188; CI-NEXT:    s_waitcnt lgkmcnt(0)
189; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
190; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
191; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
192; CI-NEXT:    s_mov_b32 m0, -1
193; CI-NEXT:    s_waitcnt vmcnt(0)
194; CI-NEXT:    ds_write2_b32 v0, v3, v2 offset1:8
195; CI-NEXT:    s_endpgm
196;
197; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
198; GFX9:       ; %bb.0:
199; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
200; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
201; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX9-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
204; GFX9-NEXT:    s_waitcnt vmcnt(0)
205; GFX9-NEXT:    global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
206; GFX9-NEXT:    s_waitcnt vmcnt(0)
207; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:8
208; GFX9-NEXT:    s_endpgm
209  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
210  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
211  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
212  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
213  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
214  %val0.0 = extractelement <2 x float> %val0, i32 0
215  %val1.1 = extractelement <2 x float> %val1, i32 1
216  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
217  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
218  %add.x = add nsw i32 %x.i, 8
219  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
220  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
221  ret void
222}
223
; Stores both elements of a single <2 x float> load to @lds[tid] and
; @lds[tid + 8]. The checks expect the two halves of the loaded register pair
; to feed one ds_write2_b32 with offset1:8.
224define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
225; CI-LABEL: simple_write2_two_val_subreg2_f32:
226; CI:       ; %bb.0:
227; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
228; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
229; CI-NEXT:    s_mov_b32 s3, 0xf000
230; CI-NEXT:    s_mov_b32 s2, 0
231; CI-NEXT:    v_mov_b32_e32 v2, 0
232; CI-NEXT:    s_waitcnt lgkmcnt(0)
233; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
234; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
235; CI-NEXT:    s_mov_b32 m0, -1
236; CI-NEXT:    s_waitcnt vmcnt(0)
237; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
238; CI-NEXT:    s_endpgm
239;
240; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
241; GFX9:       ; %bb.0:
242; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
243; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
244; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX9-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
247; GFX9-NEXT:    s_waitcnt vmcnt(0)
248; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
249; GFX9-NEXT:    s_endpgm
250  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
251  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
252  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
253  %val0 = extractelement <2 x float> %val, i32 0
254  %val1 = extractelement <2 x float> %val, i32 1
255  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
256  store float %val0, float addrspace(3)* %arrayidx0, align 4
257  %add.x = add nsw i32 %x.i, 8
258  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
259  store float %val1, float addrspace(3)* %arrayidx1, align 4
260  ret void
261}
262
; Stores elements 0 and 3 of a single <4 x float> load to @lds[tid] and
; @lds[tid + 8]. The checks expect the first and last registers of the loaded
; v[1:4] tuple to feed one ds_write2_b32 with offset1:8.
263define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
264; CI-LABEL: simple_write2_two_val_subreg4_f32:
265; CI:       ; %bb.0:
266; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
267; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
268; CI-NEXT:    s_mov_b32 s3, 0xf000
269; CI-NEXT:    s_mov_b32 s2, 0
270; CI-NEXT:    v_mov_b32_e32 v2, 0
271; CI-NEXT:    s_waitcnt lgkmcnt(0)
272; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
273; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
274; CI-NEXT:    s_mov_b32 m0, -1
275; CI-NEXT:    s_waitcnt vmcnt(0)
276; CI-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
277; CI-NEXT:    s_endpgm
278;
279; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
280; GFX9:       ; %bb.0:
281; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
282; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
283; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
284; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX9-NEXT:    global_load_dwordx4 v[1:4], v1, s[0:1]
286; GFX9-NEXT:    s_waitcnt vmcnt(0)
287; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
288; GFX9-NEXT:    s_endpgm
289  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
290  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
291  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
292  %val0 = extractelement <4 x float> %val, i32 0
293  %val1 = extractelement <4 x float> %val, i32 3
294  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
295  store float %val0, float addrspace(3)* %arrayidx0, align 4
296  %add.x = add nsw i32 %x.i, 8
297  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
298  store float %val1, float addrspace(3)* %arrayidx1, align 4
299  ret void
300}
301
; Stores two floats to @lds[tid] and @lds[tid + 255]. The checks expect the
; element distance of 255 to still be emitted as one ds_write2_b32, with
; offset1:255.
302define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
303; CI-LABEL: simple_write2_two_val_max_offset_f32:
304; CI:       ; %bb.0:
305; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
306; CI-NEXT:    s_mov_b32 s3, 0xf000
307; CI-NEXT:    s_mov_b32 s2, 0
308; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
309; CI-NEXT:    v_mov_b32_e32 v1, 0
310; CI-NEXT:    s_waitcnt lgkmcnt(0)
311; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
312; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
313; CI-NEXT:    s_mov_b32 m0, -1
314; CI-NEXT:    s_waitcnt vmcnt(0)
315; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:255
316; CI-NEXT:    s_endpgm
317;
318; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
319; GFX9:       ; %bb.0:
320; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
321; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
324; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4
325; GFX9-NEXT:    s_waitcnt vmcnt(0)
326; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
327; GFX9-NEXT:    s_endpgm
328  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
329  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
330  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
331  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
332  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
333  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
334  store float %val0, float addrspace(3)* %arrayidx0, align 4
335  %add.x = add nsw i32 %x.i, 255
336  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
337  store float %val1, float addrspace(3)* %arrayidx1, align 4
338  ret void
339}
340
; Stores two floats to @lds[tid] and @lds[tid + 257]. With this element
; distance the checks expect two separate ds_write_b32 instructions, the second
; using a byte offset of 1028 (257 * 4), rather than a ds_write2_b32.
341define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
342; CI-LABEL: simple_write2_two_val_too_far_f32:
343; CI:       ; %bb.0:
344; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
345; CI-NEXT:    s_mov_b32 s3, 0xf000
346; CI-NEXT:    s_mov_b32 s2, 0
347; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
348; CI-NEXT:    v_mov_b32_e32 v1, 0
349; CI-NEXT:    s_waitcnt lgkmcnt(0)
350; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
351; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
352; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
353; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
354; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
355; CI-NEXT:    s_mov_b32 m0, -1
356; CI-NEXT:    s_waitcnt vmcnt(1)
357; CI-NEXT:    ds_write_b32 v0, v2
358; CI-NEXT:    s_waitcnt vmcnt(0)
359; CI-NEXT:    ds_write_b32 v0, v1 offset:1028
360; CI-NEXT:    s_endpgm
361;
362; GFX9-LABEL: simple_write2_two_val_too_far_f32:
363; GFX9:       ; %bb.0:
364; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
365; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
368; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
369; GFX9-NEXT:    s_waitcnt vmcnt(1)
370; GFX9-NEXT:    ds_write_b32 v0, v1
371; GFX9-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
373; GFX9-NEXT:    s_endpgm
374  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
375  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
376  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
377  %val0 = load float, float addrspace(1)* %in0.gep, align 4
378  %val1 = load float, float addrspace(1)* %in1.gep, align 4
379  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
380  store float %val0, float addrspace(3)* %arrayidx0, align 4
381  %add.x = add nsw i32 %x.i, 257
382  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
383  store float %val1, float addrspace(3)* %arrayidx1, align 4
384  ret void
385}
386
; Issues four LDS stores at element offsets tid + 0, + 8, + 11 and + 27,
; alternating between the two loaded values. The checks expect them to be
; paired into two ds_write2_b32 instructions: (offset1:8) and
; (offset0:11 offset1:27).
387define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
388; CI-LABEL: simple_write2_two_val_f32_x2:
389; CI:       ; %bb.0:
390; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
391; CI-NEXT:    s_mov_b32 s3, 0xf000
392; CI-NEXT:    s_mov_b32 s2, 0
393; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
394; CI-NEXT:    v_mov_b32_e32 v1, 0
395; CI-NEXT:    s_waitcnt lgkmcnt(0)
396; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
397; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
398; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
399; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
400; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
401; CI-NEXT:    s_mov_b32 m0, -1
402; CI-NEXT:    s_waitcnt vmcnt(0)
403; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
404; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
405; CI-NEXT:    s_endpgm
406;
407; GFX9-LABEL: simple_write2_two_val_f32_x2:
408; GFX9:       ; %bb.0:
409; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
410; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
411; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
413; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
414; GFX9-NEXT:    s_waitcnt vmcnt(0)
415; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
416; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
417; GFX9-NEXT:    s_endpgm
418  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
419  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
420  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
421  %val0 = load float, float addrspace(1)* %in0.gep, align 4
422  %val1 = load float, float addrspace(1)* %in1.gep, align 4
423
424  %idx.0 = add nsw i32 %tid.x, 0
425  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
426  store float %val0, float addrspace(3)* %arrayidx0, align 4
427
428  %idx.1 = add nsw i32 %tid.x, 8
429  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
430  store float %val1, float addrspace(3)* %arrayidx1, align 4
431
432  %idx.2 = add nsw i32 %tid.x, 11
433  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
434  store float %val0, float addrspace(3)* %arrayidx2, align 4
435
436  %idx.3 = add nsw i32 %tid.x, 27
437  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
438  store float %val1, float addrspace(3)* %arrayidx3, align 4
439
440  ret void
441}
442
; Same four-store pattern as the x2 case, except the first store is at element
; offset tid + 3 instead of tid + 0. The checks expect two ds_write2_b32
; instructions with (offset0:3 offset1:8) and (offset0:11 offset1:27).
443define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
444; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
445; CI:       ; %bb.0:
446; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
447; CI-NEXT:    s_mov_b32 s3, 0xf000
448; CI-NEXT:    s_mov_b32 s2, 0
449; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
450; CI-NEXT:    v_mov_b32_e32 v1, 0
451; CI-NEXT:    s_waitcnt lgkmcnt(0)
452; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
453; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
454; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
455; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
456; CI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
457; CI-NEXT:    s_mov_b32 m0, -1
458; CI-NEXT:    s_waitcnt vmcnt(0)
459; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
460; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
461; CI-NEXT:    s_endpgm
462;
463; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
464; GFX9:       ; %bb.0:
465; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
466; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
469; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
470; GFX9-NEXT:    s_waitcnt vmcnt(0)
471; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
472; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
473; GFX9-NEXT:    s_endpgm
474  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
475  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
476  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
477  %val0 = load float, float addrspace(1)* %in0.gep, align 4
478  %val1 = load float, float addrspace(1)* %in1.gep, align 4
479
480  %idx.0 = add nsw i32 %tid.x, 3
481  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
482  store float %val0, float addrspace(3)* %arrayidx0, align 4
483
484  %idx.1 = add nsw i32 %tid.x, 8
485  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
486  store float %val1, float addrspace(3)* %arrayidx1, align 4
487
488  %idx.2 = add nsw i32 %tid.x, 11
489  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
490  store float %val0, float addrspace(3)* %arrayidx2, align 4
491
492  %idx.3 = add nsw i32 %tid.x, 27
493  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
494  store float %val1, float addrspace(3)* %arrayidx3, align 4
495
496  ret void
497}
498
; Store addresses come from the two lanes of a <2 x float addrspace(3)*> kernel
; argument, indexed through a vector getelementptr. The checks expect two
; separate ds_write_b32 instructions, each with offset:32 and each using its
; own base register, rather than a ds_write2_b32.
; NOTE(review): both insertelement instructions below write lane 0, so the
; %x.i inserted first is overwritten by the constant 8 and lane 1 of %index.1
; stays undef. The checks match the IR as written (neither ds_write uses the
; workitem id in its address) -- confirm this is the intended input.
; NOTE(review): %add.x is computed but has no users in this function.
499define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
500; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
501; CI:       ; %bb.0:
502; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
503; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
504; CI-NEXT:    s_mov_b32 s3, 0xf000
505; CI-NEXT:    s_mov_b32 s2, 0
506; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
507; CI-NEXT:    s_waitcnt lgkmcnt(0)
508; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
509; CI-NEXT:    v_mov_b32_e32 v1, 0
510; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
511; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
512; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
513; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
514; CI-NEXT:    v_mov_b32_e32 v1, s8
515; CI-NEXT:    s_mov_b32 m0, -1
516; CI-NEXT:    v_mov_b32_e32 v3, s9
517; CI-NEXT:    s_waitcnt vmcnt(1)
518; CI-NEXT:    ds_write_b32 v1, v2 offset:32
519; CI-NEXT:    s_waitcnt vmcnt(0)
520; CI-NEXT:    ds_write_b32 v3, v0 offset:32
521; CI-NEXT:    s_endpgm
522;
523; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
524; GFX9:       ; %bb.0:
525; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
526; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
527; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
528; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
530; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
531; GFX9-NEXT:    v_mov_b32_e32 v2, s0
532; GFX9-NEXT:    v_mov_b32_e32 v3, s1
533; GFX9-NEXT:    s_waitcnt vmcnt(1)
534; GFX9-NEXT:    ds_write_b32 v2, v1 offset:32
535; GFX9-NEXT:    s_waitcnt vmcnt(0)
536; GFX9-NEXT:    ds_write_b32 v3, v0 offset:32
537; GFX9-NEXT:    s_endpgm
538  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
539  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
540  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
541  %val0 = load float, float addrspace(1)* %in0.gep, align 4
542  %val1 = load float, float addrspace(1)* %in1.gep, align 4
543
544  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
545  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
546  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
547  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
548  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
549
550  ; Apply an additional offset after the vector that will be more obviously folded.
551  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
552  store float %val0, float addrspace(3)* %gep.0, align 4
553
554  %add.x = add nsw i32 %x.i, 8
555  store float %val1, float addrspace(3)* %gep.1.offset, align 4
556  ret void
557}
558
; 64-bit variant of the one-value case: loads one double from %in[tid] and
; stores it (align 8) to @lds.f64[tid] and @lds.f64[tid + 8]. The checks expect
; a single ds_write2_b64 with offset1:8.
559define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
560; CI-LABEL: simple_write2_one_val_f64:
561; CI:       ; %bb.0:
562; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
563; CI-NEXT:    s_mov_b32 s3, 0xf000
564; CI-NEXT:    s_mov_b32 s2, 0
565; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
566; CI-NEXT:    v_mov_b32_e32 v1, 0
567; CI-NEXT:    s_waitcnt lgkmcnt(0)
568; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
569; CI-NEXT:    s_mov_b32 m0, -1
570; CI-NEXT:    s_waitcnt vmcnt(0)
571; CI-NEXT:    ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
572; CI-NEXT:    s_endpgm
573;
574; GFX9-LABEL: simple_write2_one_val_f64:
575; GFX9:       ; %bb.0:
576; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
577; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
578; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
580; GFX9-NEXT:    s_waitcnt vmcnt(0)
581; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
582; GFX9-NEXT:    s_endpgm
583  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
584  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
585  %val = load double, double addrspace(1)* %in.gep, align 8
586  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
587  store double %val, double addrspace(3)* %arrayidx0, align 8
588  %add.x = add nsw i32 %x.i, 8
589  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
590  store double %val, double addrspace(3)* %arrayidx1, align 8
591  ret void
592}
593
; Stores one double to %lds[tid] and %lds[tid + 7] through a pointer argument,
; with the stores marked align 4 rather than align 8. The checks expect each
; double store to be emitted as a ds_write2_b32 of its two dwords:
; (offset1:1) for the first and (offset0:14 offset1:15) for the second, i.e.
; 7 doubles = 14 dwords further on.
594define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
595; CI-LABEL: misaligned_simple_write2_one_val_f64:
596; CI:       ; %bb.0:
597; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
598; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
599; CI-NEXT:    s_mov_b32 s7, 0xf000
600; CI-NEXT:    s_mov_b32 s6, 0
601; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
602; CI-NEXT:    v_mov_b32_e32 v1, 0
603; CI-NEXT:    s_waitcnt lgkmcnt(0)
604; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
605; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
606; CI-NEXT:    s_mov_b32 m0, -1
607; CI-NEXT:    s_waitcnt vmcnt(0)
608; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
609; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
610; CI-NEXT:    s_endpgm
611;
612; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
613; GFX9:       ; %bb.0:
614; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
615; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
616; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
617; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
618; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
619; GFX9-NEXT:    v_add_u32_e32 v2, s0, v2
620; GFX9-NEXT:    s_waitcnt vmcnt(0)
621; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
622; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
623; GFX9-NEXT:    s_endpgm
624  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
625  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
626  %val = load double, double addrspace(1)* %in.gep, align 8
627  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
628  store double %val, double addrspace(3)* %arrayidx0, align 4
629  %add.x = add nsw i32 %x.i, 7
630  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
631  store double %val, double addrspace(3)* %arrayidx1, align 4
632  ret void
633}
634
; Overlapping, byte-aligned f64 stores: one loaded double is stored twice with
; `align 1`, at byte offsets 5 and 9 from the workitem's element in %lds, so the
; two 8-byte stores share bytes 9..12.
; Expected output below: the bonaire llc invocation and the gfx900 invocation
; with -unaligned-access-mode split the stores into twelve ds_write_b8 covering
; byte offsets 5..16; the gfx900 invocation with +unaligned-access-mode emits
; two ds_write2_b32 on bases offset by 5 and 9.
635define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
636; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
637; CI:       ; %bb.0:
638; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
639; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
640; CI-NEXT:    s_mov_b32 s7, 0xf000
641; CI-NEXT:    s_mov_b32 s6, 0
642; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
643; CI-NEXT:    v_mov_b32_e32 v1, 0
644; CI-NEXT:    s_waitcnt lgkmcnt(0)
645; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
646; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
647; CI-NEXT:    s_mov_b32 m0, -1
648; CI-NEXT:    s_waitcnt vmcnt(0)
649; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
650; CI-NEXT:    ds_write_b8 v0, v1 offset:5
651; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
652; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
653; CI-NEXT:    ds_write_b8 v0, v2 offset:13
654; CI-NEXT:    ds_write_b8 v0, v1 offset:9
655; CI-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
656; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
657; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
658; CI-NEXT:    ds_write_b8 v0, v3 offset:8
659; CI-NEXT:    ds_write_b8 v0, v4 offset:7
660; CI-NEXT:    ds_write_b8 v0, v5 offset:6
661; CI-NEXT:    ds_write_b8 v0, v1 offset:16
662; CI-NEXT:    ds_write_b8 v0, v6 offset:15
663; CI-NEXT:    ds_write_b8 v0, v2 offset:14
664; CI-NEXT:    ds_write_b8 v0, v3 offset:12
665; CI-NEXT:    ds_write_b8 v0, v4 offset:11
666; CI-NEXT:    ds_write_b8 v0, v5 offset:10
667; CI-NEXT:    s_endpgm
668;
669; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
670; GFX9-ALIGNED:       ; %bb.0:
671; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
672; GFX9-ALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x34
673; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
674; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX9-ALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
676; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v2, s0, v2
677; GFX9-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
678; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
679; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:7
680; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:5
681; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
682; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v1 offset:15
683; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:13
684; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:11
685; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:9
686; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
687; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
688; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:8
689; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:6
690; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:16
691; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:14
692; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:12
693; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:10
694; GFX9-ALIGNED-NEXT:    s_endpgm
695;
696; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
697; GFX9-UNALIGNED:       ; %bb.0:
698; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
699; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x34
700; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
701; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
703; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s0, v2
704; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
705; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
706; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
707; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
708; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
709; GFX9-UNALIGNED-NEXT:    s_endpgm
  ; Per-workitem input element, loaded with its natural 8-byte alignment.
710  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
711  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
712  %val = load double, double addrspace(1)* %in.gep, align 8
  ; Element address in %lds, reinterpreted as i8* so it can be advanced by
  ; individual bytes.
713  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
714  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
  ; First store: 8 bytes starting at element base + 5, alignment 1.
715  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
716  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
717  store double %val, double addrspace(3)* %addr0, align 1
  ; Second store: same value at element base + 9, overlapping the upper four
  ; bytes written by the first store.
718  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
719  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
720  store double %val, double addrspace(3)* %addr1, align 1
721  ret void
722}
723
; Two different f64 values, naturally aligned: loads two adjacent doubles from
; %in and stores them to @lds.f64 at element indices x and x+8 (align 8).
; Expected output below: every configuration combines both stores into one
; ds_write2_b64 with offset1:8.
724define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
725; CI-LABEL: simple_write2_two_val_f64:
726; CI:       ; %bb.0:
727; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
728; CI-NEXT:    s_mov_b32 s3, 0xf000
729; CI-NEXT:    s_mov_b32 s2, 0
730; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
731; CI-NEXT:    v_mov_b32_e32 v1, 0
732; CI-NEXT:    s_waitcnt lgkmcnt(0)
733; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
734; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
735; CI-NEXT:    s_mov_b32 m0, -1
736; CI-NEXT:    s_waitcnt vmcnt(0)
737; CI-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
738; CI-NEXT:    s_endpgm
739;
740; GFX9-LABEL: simple_write2_two_val_f64:
741; GFX9:       ; %bb.0:
742; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
743; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
744; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
746; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
747; GFX9-NEXT:    s_waitcnt vmcnt(0)
748; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
749; GFX9-NEXT:    s_endpgm
750  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Source elements in[x] and in[x+1].
751  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
752  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  ; Both loads are volatile; the expected output keeps them as two separate
  ; dwordx2 loads (presumably the reason for `volatile` - TODO confirm).
753  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
754  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  ; Destination elements lds.f64[x] and lds.f64[x+8].
755  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
756  store double %val0, double addrspace(3)* %arrayidx0, align 8
757  %add.x = add nsw i32 %x.i, 8
758  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
759  store double %val1, double addrspace(3)* %arrayidx1, align 8
760  ret void
761}
762
; Four-element i32 array in addrspace(3), written through constant-expression
; addresses by the store_constant_* tests.
763@foo = addrspace(3) global [4 x i32] undef, align 4
764
; Constant addresses, adjacent elements: stores 123 (0x7b) to @foo[0] and
; @foo[1]. Expected output below: one ds_write2_b32 with offset1:1.
765define amdgpu_kernel void @store_constant_adjacent_offsets() {
766; CI-LABEL: store_constant_adjacent_offsets:
767; CI:       ; %bb.0:
768; CI-NEXT:    s_movk_i32 s0, 0x7b
769; CI-NEXT:    v_mov_b32_e32 v0, 0
770; CI-NEXT:    v_mov_b32_e32 v1, s0
771; CI-NEXT:    v_mov_b32_e32 v2, s0
772; CI-NEXT:    s_mov_b32 m0, -1
773; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
774; CI-NEXT:    s_endpgm
775;
776; GFX9-LABEL: store_constant_adjacent_offsets:
777; GFX9:       ; %bb.0:
778; GFX9-NEXT:    s_movk_i32 s0, 0x7b
779; GFX9-NEXT:    v_mov_b32_e32 v0, 0
780; GFX9-NEXT:    v_mov_b32_e32 v1, s0
781; GFX9-NEXT:    v_mov_b32_e32 v2, s0
782; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
783; GFX9-NEXT:    s_endpgm
784  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
785  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
786  ret void
787}
788
; Constant addresses, non-adjacent elements: stores 123 (0x7b) to @foo[0] and
; @foo[2], leaving @foo[1] untouched. Expected output below: one ds_write2_b32
; with offset1:2, using a single data register for both operands.
789define amdgpu_kernel void @store_constant_disjoint_offsets() {
790; CI-LABEL: store_constant_disjoint_offsets:
791; CI:       ; %bb.0:
792; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
793; CI-NEXT:    v_mov_b32_e32 v1, 0
794; CI-NEXT:    s_mov_b32 m0, -1
795; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
796; CI-NEXT:    s_endpgm
797;
798; GFX9-LABEL: store_constant_disjoint_offsets:
799; GFX9:       ; %bb.0:
800; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
801; GFX9-NEXT:    v_mov_b32_e32 v1, 0
802; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
803; GFX9-NEXT:    s_endpgm
804  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
805  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
806  ret void
807}
808
; Four-element i64 array in addrspace(3) that is only 4-byte aligned.
809@bar = addrspace(3) global [4 x i64] undef, align 4
810
; Under-aligned i64 stores at constant addresses: stores i64 123 to @bar[0] and
; @bar[1], each with `align 4`.
; Expected output below: the bonaire llc invocation and the gfx900 invocation
; with -unaligned-access-mode write each i64 as a dword pair (two ds_write2_b32,
; dword offsets 0/1 and 2/3); the gfx900 invocation with +unaligned-access-mode
; writes all 16 bytes with one ds_write_b128.
811define amdgpu_kernel void @store_misaligned64_constant_offsets() {
812; CI-LABEL: store_misaligned64_constant_offsets:
813; CI:       ; %bb.0:
814; CI-NEXT:    v_mov_b32_e32 v0, 0
815; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
816; CI-NEXT:    s_mov_b32 m0, -1
817; CI-NEXT:    ds_write2_b32 v0, v1, v0 offset1:1
818; CI-NEXT:    ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
819; CI-NEXT:    s_endpgm
820;
821; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
822; GFX9-ALIGNED:       ; %bb.0:
823; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0
824; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
825; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v0 offset1:1
826; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
827; GFX9-ALIGNED-NEXT:    s_endpgm
828;
829; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
830; GFX9-UNALIGNED:       ; %bb.0:
831; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
832; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0
833; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, v0
834; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, v1
835; GFX9-UNALIGNED-NEXT:    ds_write_b128 v1, v[0:3]
836; GFX9-UNALIGNED-NEXT:    s_endpgm
837  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
838  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
839  ret void
840}
841
; 4096-element i64 array in addrspace(3) that is only 4-byte aligned.
842@bar.large = addrspace(3) global [4096 x i64] undef, align 4
843
; Under-aligned i64 stores at large constant addresses: stores i64 123 to
; @bar.large[2048] and @bar.large[4095], each with `align 4`.
; Expected output below: the byte addresses 0x4000 (2048*8) and 0x7ff8 (4095*8)
; are materialized in the address register, and each i64 is written by its own
; ds_write2_b32 with offset1:1. The two stores are not combined into one
; instruction (presumably because the distance does not fit the write2 offset
; fields - TODO confirm against the ISA documentation).
844define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
845; CI-LABEL: store_misaligned64_constant_large_offsets:
846; CI:       ; %bb.0:
847; CI-NEXT:    v_mov_b32_e32 v0, 0x4000
848; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
849; CI-NEXT:    v_mov_b32_e32 v2, 0
850; CI-NEXT:    s_mov_b32 m0, -1
851; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
852; CI-NEXT:    v_mov_b32_e32 v0, 0x7ff8
853; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
854; CI-NEXT:    s_endpgm
855;
856; GFX9-LABEL: store_misaligned64_constant_large_offsets:
857; GFX9:       ; %bb.0:
858; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4000
859; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
860; GFX9-NEXT:    v_mov_b32_e32 v2, 0
861; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
862; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7ff8
863; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
864; GFX9-NEXT:    s_endpgm
865  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
866  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
867  ret void
868}
869
; Two float arrays in addrspace(3) used as store targets by
; write2_sgemm_sequence.
870@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
871@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
872
; Longer store sequence: one float loaded from %in is stored ten times, to
; @sgemm.lA at indices x+{0,1,16,17} and to @sgemm.lB at indices
; y+{0,1,32,33,64,65}. Here x is the workgroup id (not a workitem id) and y is
; the workitem id in the y dimension.
; Expected output below: five ds_write2_b32, each covering one pair of
; consecutive indices. The two lA pairs use separate base addresses (0xc20 and
; 0xc60 added to the scaled id, i.e. 16 floats apart); the three lB pairs share
; one base register and differ only in offset0/offset1.
873define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
874; CI-LABEL: write2_sgemm_sequence:
875; CI:       ; %bb.0:
876; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
877; CI-NEXT:    s_lshl_b32 s2, s2, 2
878; CI-NEXT:    s_add_i32 s3, s2, 0xc20
879; CI-NEXT:    v_mov_b32_e32 v0, s3
880; CI-NEXT:    s_addk_i32 s2, 0xc60
881; CI-NEXT:    s_waitcnt lgkmcnt(0)
882; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
883; CI-NEXT:    s_mov_b32 m0, -1
884; CI-NEXT:    s_waitcnt lgkmcnt(0)
885; CI-NEXT:    v_mov_b32_e32 v2, s0
886; CI-NEXT:    v_mov_b32_e32 v3, s0
887; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
888; CI-NEXT:    v_mov_b32_e32 v0, s2
889; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
890; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
891; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
892; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
893; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
894; CI-NEXT:    s_endpgm
895;
896; GFX9-LABEL: write2_sgemm_sequence:
897; GFX9:       ; %bb.0:
898; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
899; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
900; GFX9-NEXT:    s_add_i32 s3, s2, 0xc20
901; GFX9-NEXT:    s_addk_i32 s2, 0xc60
902; GFX9-NEXT:    v_mov_b32_e32 v0, s3
903; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
905; GFX9-NEXT:    v_mov_b32_e32 v2, s2
906; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX9-NEXT:    v_mov_b32_e32 v3, s0
908; GFX9-NEXT:    v_mov_b32_e32 v4, s0
909; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
910; GFX9-NEXT:    ds_write2_b32 v2, v3, v4 offset1:1
911; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
912; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
913; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
914; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
915; GFX9-NEXT:    s_endpgm
  ; Index sources: workgroup id x for lA, workitem id y for lB.
916  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
917  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  ; The single value written by every store below; loaded directly from %in
  ; with no per-thread index.
918  %val = load float, float addrspace(1)* %in
  ; lA pair 1: indices x and x+1.
919  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
920  store float %val, float addrspace(3)* %arrayidx44, align 4
921  %add47 = add nsw i32 %x.i, 1
922  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
923  store float %val, float addrspace(3)* %arrayidx48, align 4
  ; lA pair 2: indices x+16 and x+17.
924  %add51 = add nsw i32 %x.i, 16
925  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
926  store float %val, float addrspace(3)* %arrayidx52, align 4
927  %add55 = add nsw i32 %x.i, 17
928  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
929  store float %val, float addrspace(3)* %arrayidx56, align 4
  ; lB pair 1: indices y and y+1.
930  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
931  store float %val, float addrspace(3)* %arrayidx60, align 4
932  %add63 = add nsw i32 %y.i, 1
933  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
934  store float %val, float addrspace(3)* %arrayidx64, align 4
  ; lB pair 2: indices y+32 and y+33.
935  %add67 = add nsw i32 %y.i, 32
936  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
937  store float %val, float addrspace(3)* %arrayidx68, align 4
938  %add71 = add nsw i32 %y.i, 33
939  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
940  store float %val, float addrspace(3)* %arrayidx72, align 4
  ; lB pair 3: indices y+64 and y+65.
941  %add75 = add nsw i32 %y.i, 64
942  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
943  store float %val, float addrspace(3)* %arrayidx76, align 4
944  %add79 = add nsw i32 %y.i, 65
945  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
946  store float %val, float addrspace(3)* %arrayidx80, align 4
947  ret void
948}
949
; 16-byte vector store with only 4-byte alignment: loads one <4 x float> from
; %in and stores it to %out[x] with `align 4`.
; Expected output below: the bonaire llc invocation and the gfx900 invocation
; with -unaligned-access-mode split the store into two ds_write2_b32 (dword
; offsets 0/1 and 2/3); the gfx900 invocation with +unaligned-access-mode keeps
; it as one ds_write_b128.
950define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
951; CI-LABEL: simple_write2_v4f32_superreg_align4:
952; CI:       ; %bb.0:
953; CI-NEXT:    s_load_dword s4, s[0:1], 0x9
954; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
955; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
956; CI-NEXT:    s_mov_b32 m0, -1
957; CI-NEXT:    s_waitcnt lgkmcnt(0)
958; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
959; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
960; CI-NEXT:    s_waitcnt lgkmcnt(0)
961; CI-NEXT:    v_mov_b32_e32 v1, s0
962; CI-NEXT:    v_mov_b32_e32 v2, s1
963; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
964; CI-NEXT:    v_mov_b32_e32 v3, s2
965; CI-NEXT:    v_mov_b32_e32 v1, s3
966; CI-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
967; CI-NEXT:    s_endpgm
968;
969; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
970; GFX9-ALIGNED:       ; %bb.0:
971; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
972; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
973; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
975; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
976; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
978; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
979; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s2
980; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v4, s3
981; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
982; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
983; GFX9-ALIGNED-NEXT:    s_endpgm
984;
985; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
986; GFX9-UNALIGNED:       ; %bb.0:
987; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
988; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
989; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v4, v0, 4, s4
991; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
992; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
994; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
995; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
996; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
997; GFX9-UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
998; GFX9-UNALIGNED-NEXT:    s_endpgm
999  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; This getelementptr has no indices, so every workitem reads the same vector
  ; at %in; the expected output loads it with a scalar s_load_dwordx4.
1000  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
1001  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  ; Only the destination is indexed by the workitem id.
1002  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
1003  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
1004  ret void
1005}
1006
; Array of <2 x i32> in addrspace(3) declared with alignment 1.
1007@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1008
; Byte-aligned vector store at an odd address: stores <123, 456> to byte offset
; 65 inside @v2i32_align1 with `align 1`.
; Expected output below: the bonaire llc invocation and the gfx900 invocation
; with -unaligned-access-mode emit eight ds_write_b8 for byte offsets 65..72
; (0x7b at 65, 0xc8 and 0x01 at 69 and 70, zero elsewhere); the gfx900
; invocation with +unaligned-access-mode emits one ds_write2_b32 on base 0x41
; (65) with data 0x7b (123) and 0x1c8 (456).
1009define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1010; CI-LABEL: write2_v2i32_align1_odd_offset:
1011; CI:       ; %bb.0: ; %entry
1012; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
1013; CI-NEXT:    v_mov_b32_e32 v1, 0
1014; CI-NEXT:    s_mov_b32 m0, -1
1015; CI-NEXT:    ds_write_b8 v1, v0 offset:65
1016; CI-NEXT:    v_mov_b32_e32 v0, 1
1017; CI-NEXT:    ds_write_b8 v1, v0 offset:70
1018; CI-NEXT:    v_mov_b32_e32 v0, 0xc8
1019; CI-NEXT:    ds_write_b8 v1, v0 offset:69
1020; CI-NEXT:    ds_write_b8 v1, v1 offset:68
1021; CI-NEXT:    ds_write_b8 v1, v1 offset:67
1022; CI-NEXT:    ds_write_b8 v1, v1 offset:66
1023; CI-NEXT:    ds_write_b8 v1, v1 offset:72
1024; CI-NEXT:    ds_write_b8 v1, v1 offset:71
1025; CI-NEXT:    s_endpgm
1026;
1027; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1028; GFX9-ALIGNED:       ; %bb.0: ; %entry
1029; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
1030; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
1031; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:65
1032; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 1
1033; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:70
1034; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0xc8
1035; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:69
1036; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:68
1037; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:67
1038; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:66
1039; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:72
1040; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:71
1041; GFX9-ALIGNED-NEXT:    s_endpgm
1042;
1043; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1044; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1045; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
1046; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
1047; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x1c8
1048; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
1049; GFX9-UNALIGNED-NEXT:    s_endpgm
1050entry:
  ; The destination is a constant expression: @v2i32_align1 viewed as i8*,
  ; advanced by 65 bytes, then cast back to a <2 x i32> pointer.
1051  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
1052  ret void
1053}
1054
; AMDGPU id intrinsics. The kernels reviewed here call workgroup.id.x,
; workitem.id.x and workitem.id.y; workgroup.id.y is declared as well.
1055declare i32 @llvm.amdgcn.workgroup.id.x() #1
1056declare i32 @llvm.amdgcn.workgroup.id.y() #1
1057declare i32 @llvm.amdgcn.workitem.id.x() #1
1058declare i32 @llvm.amdgcn.workitem.id.y() #1
1059
; Attribute groups: #0 is attached to the kernels that take arguments, #1 to
; the intrinsic declarations and their call sites. #2 is not referenced in the
; part of the file reviewed here.
1060attributes #0 = { nounwind }
1061attributes #1 = { nounwind readnone speculatable }
1062attributes #2 = { convergent nounwind }
1063