1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
5
; LDS (addrspace(3)) arrays used as store targets by the kernels in this file:
; 512 floats with 4-byte alignment and 512 doubles with 8-byte alignment.
6@lds = addrspace(3) global [512 x float] undef, align 4
7@lds.f64 = addrspace(3) global [512 x double] undef, align 8
8
; Loads one float from %in[tid] and stores that same value to @lds[tid] and
; @lds[tid + 8]. The assertions expect the two LDS stores to be emitted as one
; ds_write2_b32 with offset1:8 that uses the same data register twice.
9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
10; CI-LABEL: simple_write2_one_val_f32:
11; CI:       ; %bb.0:
12; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
13; CI-NEXT:    s_mov_b32 s3, 0xf000
14; CI-NEXT:    s_mov_b32 s2, 0
15; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
16; CI-NEXT:    v_mov_b32_e32 v1, 0
17; CI-NEXT:    s_waitcnt lgkmcnt(0)
18; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19; CI-NEXT:    s_mov_b32 m0, -1
20; CI-NEXT:    s_waitcnt vmcnt(0)
21; CI-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
22; CI-NEXT:    s_endpgm
23;
24; GFX9-LABEL: simple_write2_one_val_f32:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
27; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
28; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
30; GFX9-NEXT:    s_waitcnt vmcnt(0)
31; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
32; GFX9-NEXT:    s_endpgm
33  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
35  %val = load float, float addrspace(1)* %in.gep, align 4
36  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
37  store float %val, float addrspace(3)* %arrayidx0, align 4
38  %add.x = add nsw i32 %x.i, 8
39  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
40  store float %val, float addrspace(3)* %arrayidx1, align 4
41  ret void
42}
43
; Loads two adjacent floats (%in[tid] and %in[tid + 1]) with volatile loads and
; stores them to @lds[tid] and @lds[tid + 8]. The assertions expect a single
; ds_write2_b32 with offset1:8 carrying two different data registers.
44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
45; CI-LABEL: simple_write2_two_val_f32:
46; CI:       ; %bb.0:
47; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
48; CI-NEXT:    s_mov_b32 s3, 0xf000
49; CI-NEXT:    s_mov_b32 s2, 0
50; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
51; CI-NEXT:    v_mov_b32_e32 v1, 0
52; CI-NEXT:    s_waitcnt lgkmcnt(0)
53; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54; CI-NEXT:    s_waitcnt vmcnt(0)
55; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56; CI-NEXT:    s_waitcnt vmcnt(0)
57; CI-NEXT:    s_mov_b32 m0, -1
58; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
59; CI-NEXT:    s_endpgm
60;
61; GFX9-LABEL: simple_write2_two_val_f32:
62; GFX9:       ; %bb.0:
63; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
64; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
67; GFX9-NEXT:    s_waitcnt vmcnt(0)
68; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
71; GFX9-NEXT:    s_endpgm
72  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
74  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
75  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
76  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
77  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
78  store float %val0, float addrspace(3)* %arrayidx0, align 4
79  %add.x = add nsw i32 %x.i, 8
80  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
81  store float %val1, float addrspace(3)* %arrayidx1, align 4
82  ret void
83}
84
; Same store pattern as the two-value case (@lds[tid] and @lds[tid + 8]), but
; the FIRST LDS store is volatile. The assertions expect the stores to stay as
; two separate ds_write_b32 instructions (offset 0 and offset:32), i.e. no
; ds_write2_b32 is formed.
85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
86; CI-LABEL: simple_write2_two_val_f32_volatile_0:
87; CI:       ; %bb.0:
88; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
89; CI-NEXT:    s_mov_b32 s7, 0xf000
90; CI-NEXT:    s_mov_b32 s6, 0
91; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
92; CI-NEXT:    v_mov_b32_e32 v1, 0
93; CI-NEXT:    s_waitcnt lgkmcnt(0)
94; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
95; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
96; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
97; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
98; CI-NEXT:    s_waitcnt vmcnt(0)
99; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
100; CI-NEXT:    s_waitcnt vmcnt(0)
101; CI-NEXT:    s_mov_b32 m0, -1
102; CI-NEXT:    ds_write_b32 v0, v2
103; CI-NEXT:    ds_write_b32 v0, v1 offset:32
104; CI-NEXT:    s_endpgm
105;
106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
107; GFX9:       ; %bb.0:
108; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
109; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
112; GFX9-NEXT:    s_waitcnt vmcnt(0)
113; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:    ds_write_b32 v0, v1
116; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
117; GFX9-NEXT:    s_endpgm
118  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
119  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
120  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
121  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
122  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
123  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  ; The volatile store is the first of the pair in this variant.
124  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
125  %add.x = add nsw i32 %x.i, 8
126  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
127  store float %val1, float addrspace(3)* %arrayidx1, align 4
128  ret void
129}
130
; Counterpart of the volatile_0 variant: here the SECOND LDS store (to
; @lds[tid + 8]) is the volatile one. The assertions again expect two separate
; ds_write_b32 instructions rather than a merged ds_write2_b32.
131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
132; CI-LABEL: simple_write2_two_val_f32_volatile_1:
133; CI:       ; %bb.0:
134; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
135; CI-NEXT:    s_mov_b32 s7, 0xf000
136; CI-NEXT:    s_mov_b32 s6, 0
137; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
138; CI-NEXT:    v_mov_b32_e32 v1, 0
139; CI-NEXT:    s_waitcnt lgkmcnt(0)
140; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
141; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
142; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
143; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
144; CI-NEXT:    s_waitcnt vmcnt(0)
145; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
146; CI-NEXT:    s_waitcnt vmcnt(0)
147; CI-NEXT:    s_mov_b32 m0, -1
148; CI-NEXT:    ds_write_b32 v0, v2
149; CI-NEXT:    ds_write_b32 v0, v1 offset:32
150; CI-NEXT:    s_endpgm
151;
152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
155; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
158; GFX9-NEXT:    s_waitcnt vmcnt(0)
159; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
160; GFX9-NEXT:    s_waitcnt vmcnt(0)
161; GFX9-NEXT:    ds_write_b32 v0, v1
162; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
163; GFX9-NEXT:    s_endpgm
164  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
166  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
167  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
168  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
169  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
170  store float %val0, float addrspace(3)* %arrayidx0, align 4
171  %add.x = add nsw i32 %x.i, 8
172  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  ; The volatile store is the second of the pair in this variant.
173  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
174  ret void
175}
176
177; 2 data subregisters from different super registers.
178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
179;       This should be an s_mov_b32. The v_mov_b32 gets introduced by an
180;       early legalization of the constant bus constraint on the v_lshl_add_u32,
181;       and then SIFoldOperands folds in an unlucky order.
; Performs two volatile <2 x float> loads and stores element 0 of the first and
; element 1 of the second to @lds[tid] and @lds[tid + 8]. The assertions expect
; one ds_write2_b32 whose two data operands come from different 64-bit load
; results.
; NOTE(review): the TODO preceding this function describes a v_mov_b32_e32 of
; lds@abs32@lo on GFX9, but no such instruction appears in the GFX9 assertions
; below - confirm whether that TODO still applies to this configuration.
182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
184; CI:       ; %bb.0:
185; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
186; CI-NEXT:    s_mov_b32 s3, 0xf000
187; CI-NEXT:    s_mov_b32 s2, 0
188; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
189; CI-NEXT:    v_mov_b32_e32 v2, 0
190; CI-NEXT:    s_waitcnt lgkmcnt(0)
191; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192; CI-NEXT:    s_waitcnt vmcnt(0)
193; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194; CI-NEXT:    s_waitcnt vmcnt(0)
195; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
196; CI-NEXT:    s_mov_b32 m0, -1
197; CI-NEXT:    ds_write2_b32 v0, v3, v2 offset1:8
198; CI-NEXT:    s_endpgm
199;
200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
201; GFX9:       ; %bb.0:
202; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
203; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
204; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX9-NEXT:    ; kill: killed $vgpr4
206; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX9-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1] glc
209; GFX9-NEXT:    s_waitcnt vmcnt(0)
210; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:8
213; GFX9-NEXT:    s_endpgm
214  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
215  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
216  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
217  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
218  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  ; Pick the low element of the first vector and the high element of the second.
219  %val0.0 = extractelement <2 x float> %val0, i32 0
220  %val1.1 = extractelement <2 x float> %val1, i32 1
221  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
222  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
223  %add.x = add nsw i32 %x.i, 8
224  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
225  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
226  ret void
227}
228
; Performs a single <2 x float> load and stores its two elements to @lds[tid]
; and @lds[tid + 8]. The assertions expect one ds_write2_b32 whose data
; operands are the two halves of the same 64-bit load result.
229define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
230; CI-LABEL: simple_write2_two_val_subreg2_f32:
231; CI:       ; %bb.0:
232; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
233; CI-NEXT:    s_mov_b32 s3, 0xf000
234; CI-NEXT:    s_mov_b32 s2, 0
235; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
236; CI-NEXT:    v_mov_b32_e32 v2, 0
237; CI-NEXT:    s_waitcnt lgkmcnt(0)
238; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
239; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
240; CI-NEXT:    s_mov_b32 m0, -1
241; CI-NEXT:    s_waitcnt vmcnt(0)
242; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
243; CI-NEXT:    s_endpgm
244;
245; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
246; GFX9:       ; %bb.0:
247; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
248; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
249; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX9-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
252; GFX9-NEXT:    s_waitcnt vmcnt(0)
253; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
254; GFX9-NEXT:    s_endpgm
255  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
256  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
257  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
258  %val0 = extractelement <2 x float> %val, i32 0
259  %val1 = extractelement <2 x float> %val, i32 1
260  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
261  store float %val0, float addrspace(3)* %arrayidx0, align 4
262  %add.x = add nsw i32 %x.i, 8
263  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
264  store float %val1, float addrspace(3)* %arrayidx1, align 4
265  ret void
266}
267
; Performs a single <4 x float> load and stores elements 0 and 3 to @lds[tid]
; and @lds[tid + 8]. The assertions expect one ds_write2_b32 whose data
; operands are the first and last registers of the 128-bit load result.
268define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
269; CI-LABEL: simple_write2_two_val_subreg4_f32:
270; CI:       ; %bb.0:
271; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
272; CI-NEXT:    s_mov_b32 s3, 0xf000
273; CI-NEXT:    s_mov_b32 s2, 0
274; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
275; CI-NEXT:    v_mov_b32_e32 v2, 0
276; CI-NEXT:    s_waitcnt lgkmcnt(0)
277; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
278; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
279; CI-NEXT:    s_mov_b32 m0, -1
280; CI-NEXT:    s_waitcnt vmcnt(0)
281; CI-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
282; CI-NEXT:    s_endpgm
283;
284; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
285; GFX9:       ; %bb.0:
286; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
287; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
288; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
289; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX9-NEXT:    global_load_dwordx4 v[1:4], v1, s[0:1]
291; GFX9-NEXT:    s_waitcnt vmcnt(0)
292; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
293; GFX9-NEXT:    s_endpgm
294  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
295  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
296  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
297  %val0 = extractelement <4 x float> %val, i32 0
298  %val1 = extractelement <4 x float> %val, i32 3
299  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
300  store float %val0, float addrspace(3)* %arrayidx0, align 4
301  %add.x = add nsw i32 %x.i, 8
302  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
303  store float %val1, float addrspace(3)* %arrayidx1, align 4
304  ret void
305}
306
; Stores two loaded floats to @lds[tid] and @lds[tid + 255]. The assertions
; expect the pair to still be merged into one ds_write2_b32, with offset1:255.
; Compare with simple_write2_two_val_too_far_f32, where a distance of 257
; elements is expected to stay unmerged.
307define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
308; CI-LABEL: simple_write2_two_val_max_offset_f32:
309; CI:       ; %bb.0:
310; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
311; CI-NEXT:    s_mov_b32 s3, 0xf000
312; CI-NEXT:    s_mov_b32 s2, 0
313; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
314; CI-NEXT:    v_mov_b32_e32 v1, 0
315; CI-NEXT:    s_waitcnt lgkmcnt(0)
316; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
317; CI-NEXT:    s_waitcnt vmcnt(0)
318; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
319; CI-NEXT:    s_waitcnt vmcnt(0)
320; CI-NEXT:    s_mov_b32 m0, -1
321; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:255
322; CI-NEXT:    s_endpgm
323;
324; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
325; GFX9:       ; %bb.0:
326; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
327; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
332; GFX9-NEXT:    s_waitcnt vmcnt(0)
333; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
334; GFX9-NEXT:    s_endpgm
335  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
336  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
337  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
338  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
339  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
340  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
341  store float %val0, float addrspace(3)* %arrayidx0, align 4
342  %add.x = add nsw i32 %x.i, 255
343  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
344  store float %val1, float addrspace(3)* %arrayidx1, align 4
345  ret void
346}
347
; Stores two loaded floats to @lds[tid] and @lds[tid + 257]. The assertions
; expect NO merge here: two separate ds_write_b32 instructions, the second one
; with byte offset:1028 (257 floats * 4 bytes).
348define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
349; CI-LABEL: simple_write2_two_val_too_far_f32:
350; CI:       ; %bb.0:
351; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
352; CI-NEXT:    s_mov_b32 s7, 0xf000
353; CI-NEXT:    s_mov_b32 s6, 0
354; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
355; CI-NEXT:    v_mov_b32_e32 v1, 0
356; CI-NEXT:    s_waitcnt lgkmcnt(0)
357; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
358; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
359; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
360; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
361; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
362; CI-NEXT:    s_mov_b32 m0, -1
363; CI-NEXT:    s_waitcnt vmcnt(1)
364; CI-NEXT:    ds_write_b32 v0, v2
365; CI-NEXT:    s_waitcnt vmcnt(0)
366; CI-NEXT:    ds_write_b32 v0, v1 offset:1028
367; CI-NEXT:    s_endpgm
368;
369; GFX9-LABEL: simple_write2_two_val_too_far_f32:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
372; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
375; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
376; GFX9-NEXT:    s_waitcnt vmcnt(1)
377; GFX9-NEXT:    ds_write_b32 v0, v1
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
380; GFX9-NEXT:    s_endpgm
381  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
382  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
383  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
384  %val0 = load float, float addrspace(1)* %in0.gep, align 4
385  %val1 = load float, float addrspace(1)* %in1.gep, align 4
386  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
387  store float %val0, float addrspace(3)* %arrayidx0, align 4
388  %add.x = add nsw i32 %x.i, 257
389  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
390  store float %val1, float addrspace(3)* %arrayidx1, align 4
391  ret void
392}
393
; Issues four LDS stores of two loaded values, at element indices tid + 0,
; tid + 8, tid + 11 and tid + 27. The assertions expect them to be paired in
; program order into two ds_write2_b32 instructions: (offset1:8) and
; (offset0:11 offset1:27).
394define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
395; CI-LABEL: simple_write2_two_val_f32_x2:
396; CI:       ; %bb.0:
397; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
398; CI-NEXT:    s_mov_b32 s7, 0xf000
399; CI-NEXT:    s_mov_b32 s6, 0
400; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
401; CI-NEXT:    v_mov_b32_e32 v1, 0
402; CI-NEXT:    s_waitcnt lgkmcnt(0)
403; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
404; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
405; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
406; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
407; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
408; CI-NEXT:    s_mov_b32 m0, -1
409; CI-NEXT:    s_waitcnt vmcnt(0)
410; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
411; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
412; CI-NEXT:    s_endpgm
413;
414; GFX9-LABEL: simple_write2_two_val_f32_x2:
415; GFX9:       ; %bb.0:
416; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
417; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
420; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
421; GFX9-NEXT:    s_waitcnt vmcnt(0)
422; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
423; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
424; GFX9-NEXT:    s_endpgm
425  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
426  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
427  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
428  %val0 = load float, float addrspace(1)* %in0.gep, align 4
429  %val1 = load float, float addrspace(1)* %in1.gep, align 4
430
431  %idx.0 = add nsw i32 %tid.x, 0
432  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
433  store float %val0, float addrspace(3)* %arrayidx0, align 4
434
435  %idx.1 = add nsw i32 %tid.x, 8
436  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
437  store float %val1, float addrspace(3)* %arrayidx1, align 4
438
439  %idx.2 = add nsw i32 %tid.x, 11
440  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
441  store float %val0, float addrspace(3)* %arrayidx2, align 4
442
443  %idx.3 = add nsw i32 %tid.x, 27
444  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
445  store float %val1, float addrspace(3)* %arrayidx3, align 4
446
447  ret void
448}
449
; Same four-store pattern as simple_write2_two_val_f32_x2, except the first
; store goes to tid + 3 instead of tid + 0, so no store of the first pair sits
; at offset zero. The assertions expect two ds_write2_b32 instructions:
; (offset0:3 offset1:8) and (offset0:11 offset1:27).
450define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
451; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
452; CI:       ; %bb.0:
453; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
454; CI-NEXT:    s_mov_b32 s7, 0xf000
455; CI-NEXT:    s_mov_b32 s6, 0
456; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
457; CI-NEXT:    v_mov_b32_e32 v1, 0
458; CI-NEXT:    s_waitcnt lgkmcnt(0)
459; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
460; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
461; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
462; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
463; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
464; CI-NEXT:    s_mov_b32 m0, -1
465; CI-NEXT:    s_waitcnt vmcnt(0)
466; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
467; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
468; CI-NEXT:    s_endpgm
469;
470; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
471; GFX9:       ; %bb.0:
472; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
473; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
474; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
476; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
477; GFX9-NEXT:    s_waitcnt vmcnt(0)
478; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
479; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
480; GFX9-NEXT:    s_endpgm
481  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
482  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
483  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
484  %val0 = load float, float addrspace(1)* %in0.gep, align 4
485  %val1 = load float, float addrspace(1)* %in1.gep, align 4
486
487  %idx.0 = add nsw i32 %tid.x, 3
488  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
489  store float %val0, float addrspace(3)* %arrayidx0, align 4
490
491  %idx.1 = add nsw i32 %tid.x, 8
492  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
493  store float %val1, float addrspace(3)* %arrayidx1, align 4
494
495  %idx.2 = add nsw i32 %tid.x, 11
496  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
497  store float %val0, float addrspace(3)* %arrayidx2, align 4
498
499  %idx.3 = add nsw i32 %tid.x, 27
500  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
501  store float %val1, float addrspace(3)* %arrayidx3, align 4
502
503  ret void
504}
505
; Takes the LDS destinations from a <2 x float addrspace(3)*> kernel argument,
; so the two store addresses come from different elements of one vector
; argument rather than from a common base. The assertions expect two separate
; ds_write_b32 instructions, each with its own address register and offset:32.
506define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
507; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
508; CI:       ; %bb.0:
509; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
510; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
511; CI-NEXT:    s_mov_b32 s3, 0xf000
512; CI-NEXT:    s_mov_b32 s2, 0
513; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
514; CI-NEXT:    s_waitcnt lgkmcnt(0)
515; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
516; CI-NEXT:    v_mov_b32_e32 v1, 0
517; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
518; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
519; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
520; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
521; CI-NEXT:    v_mov_b32_e32 v1, s8
522; CI-NEXT:    s_mov_b32 m0, -1
523; CI-NEXT:    v_mov_b32_e32 v3, s9
524; CI-NEXT:    s_waitcnt vmcnt(1)
525; CI-NEXT:    ds_write_b32 v1, v2 offset:32
526; CI-NEXT:    s_waitcnt vmcnt(0)
527; CI-NEXT:    ds_write_b32 v3, v0 offset:32
528; CI-NEXT:    s_endpgm
529;
530; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
531; GFX9:       ; %bb.0:
532; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
533; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
534; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
535; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
537; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
538; GFX9-NEXT:    v_mov_b32_e32 v0, s2
539; GFX9-NEXT:    v_mov_b32_e32 v3, s3
540; GFX9-NEXT:    s_waitcnt vmcnt(1)
541; GFX9-NEXT:    ds_write_b32 v0, v1 offset:32
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    ds_write_b32 v3, v2 offset:32
544; GFX9-NEXT:    s_endpgm
545  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
546  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
547  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
548  %val0 = load float, float addrspace(1)* %in0.gep, align 4
549  %val1 = load float, float addrspace(1)* %in1.gep, align 4
550
  ; NOTE(review): both insertelements below write lane 0, so the workitem id is
  ; overwritten by 8 and lane 1 of the index vector stays undef. Lane 1 may have
  ; been the intended target of the second insert; the assertions above are
  ; generated from the IR as written, so any change needs a regeneration.
551  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
552  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
553  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
554  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
555  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
556
557  ; Apply an additional offset after the vector that will be more obviously folded.
558  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
559  store float %val0, float addrspace(3)* %gep.0, align 4
560
  ; NOTE(review): %add.x is not used by any instruction in this function.
561  %add.x = add nsw i32 %x.i, 8
562  store float %val1, float addrspace(3)* %gep.1.offset, align 4
563  ret void
564}
565
; 64-bit counterpart of simple_write2_one_val_f32: loads one double and stores
; it to @lds.f64[tid] and @lds.f64[tid + 8] with 8-byte alignment. The
; assertions expect a single ds_write2_b64 with offset1:8 that uses the same
; 64-bit data register pair twice.
566define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
567; CI-LABEL: simple_write2_one_val_f64:
568; CI:       ; %bb.0:
569; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
570; CI-NEXT:    s_mov_b32 s3, 0xf000
571; CI-NEXT:    s_mov_b32 s2, 0
572; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
573; CI-NEXT:    v_mov_b32_e32 v1, 0
574; CI-NEXT:    s_waitcnt lgkmcnt(0)
575; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
576; CI-NEXT:    s_mov_b32 m0, -1
577; CI-NEXT:    s_waitcnt vmcnt(0)
578; CI-NEXT:    ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
579; CI-NEXT:    s_endpgm
580;
581; GFX9-LABEL: simple_write2_one_val_f64:
582; GFX9:       ; %bb.0:
583; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
584; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
587; GFX9-NEXT:    s_waitcnt vmcnt(0)
588; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
589; GFX9-NEXT:    s_endpgm
590  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
591  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
592  %val = load double, double addrspace(1)* %in.gep, align 8
593  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
594  store double %val, double addrspace(3)* %arrayidx0, align 8
595  %add.x = add nsw i32 %x.i, 8
596  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
597  store double %val, double addrspace(3)* %arrayidx1, align 8
598  ret void
599}
600
; Stores one loaded double to %lds[tid] and %lds[tid + 7], where %lds is a
; kernel argument and both stores are only 4-byte aligned. The assertions
; expect each 64-bit store to be emitted as a ds_write2_b32 of its two 32-bit
; halves: (offset1:1) for the first and (offset0:14 offset1:15) for the second,
; i.e. 7 doubles = 14 dwords further on.
601define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
602; CI-LABEL: misaligned_simple_write2_one_val_f64:
603; CI:       ; %bb.0:
604; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
605; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
606; CI-NEXT:    s_mov_b32 s7, 0xf000
607; CI-NEXT:    s_mov_b32 s6, 0
608; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
609; CI-NEXT:    v_mov_b32_e32 v1, 0
610; CI-NEXT:    s_waitcnt lgkmcnt(0)
611; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
612; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
613; CI-NEXT:    s_mov_b32 m0, -1
614; CI-NEXT:    s_waitcnt vmcnt(0)
615; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
616; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
617; CI-NEXT:    s_endpgm
618;
619; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
620; GFX9:       ; %bb.0:
621; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
622; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
623; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
626; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
627; GFX9-NEXT:    s_waitcnt vmcnt(0)
628; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
629; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
630; GFX9-NEXT:    s_endpgm
631  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
632  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
633  %val = load double, double addrspace(1)* %in.gep, align 8
634  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  ; align 4 (less than the 8-byte size of a double) is the point of this test.
635  store double %val, double addrspace(3)* %arrayidx0, align 4
636  %add.x = add nsw i32 %x.i, 7
637  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
638  store double %val, double addrspace(3)* %arrayidx1, align 4
639  ret void
640}
641
; Stores one loaded double twice through align-1 pointers located 5 and 9
; bytes past %lds[workitem.id.x]; the two 8-byte stores overlap each other.
; Expected output differs per check prefix: the CI and GFX9-ALIGNED checks
; expect every byte to be written with its own ds_write_b8 / ds_write_b8_d16_hi,
; while the GFX9-UNALIGNED checks expect one ds_write2_b32 per store.
642define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
643; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
644; CI:       ; %bb.0:
645; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
646; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
647; CI-NEXT:    s_mov_b32 s7, 0xf000
648; CI-NEXT:    s_mov_b32 s6, 0
649; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
650; CI-NEXT:    v_mov_b32_e32 v1, 0
651; CI-NEXT:    s_waitcnt lgkmcnt(0)
652; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
653; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
654; CI-NEXT:    s_mov_b32 m0, -1
655; CI-NEXT:    s_waitcnt vmcnt(0)
656; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
657; CI-NEXT:    ds_write_b8 v0, v1 offset:5
658; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
659; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
660; CI-NEXT:    ds_write_b8 v0, v2 offset:13
661; CI-NEXT:    ds_write_b8 v0, v1 offset:9
662; CI-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
663; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
664; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
665; CI-NEXT:    ds_write_b8 v0, v3 offset:8
666; CI-NEXT:    ds_write_b8 v0, v4 offset:7
667; CI-NEXT:    ds_write_b8 v0, v5 offset:6
668; CI-NEXT:    ds_write_b8 v0, v1 offset:16
669; CI-NEXT:    ds_write_b8 v0, v6 offset:15
670; CI-NEXT:    ds_write_b8 v0, v2 offset:14
671; CI-NEXT:    ds_write_b8 v0, v3 offset:12
672; CI-NEXT:    ds_write_b8 v0, v4 offset:11
673; CI-NEXT:    ds_write_b8 v0, v5 offset:10
674; CI-NEXT:    s_endpgm
675;
676; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
677; GFX9-ALIGNED:       ; %bb.0:
678; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
679; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x34
680; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
681; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX9-ALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
683; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
684; GFX9-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
685; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:7
686; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:5
687; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
688; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
689; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v1 offset:15
690; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:13
691; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:11
692; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:9
693; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
694; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
695; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:8
696; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:6
697; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:16
698; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:14
699; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:12
700; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:10
701; GFX9-ALIGNED-NEXT:    s_endpgm
702;
703; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
704; GFX9-UNALIGNED:       ; %bb.0:
705; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
706; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x34
707; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
708; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
710; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
711; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
712; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
713; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
714; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
715; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
716; GFX9-UNALIGNED-NEXT:    s_endpgm
  ; Load one double per work-item from %in.
717  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
718  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
719  %val = load double, double addrspace(1)* %in.gep, align 8
  ; View this work-item's %lds slot as bytes so odd byte offsets can be formed.
720  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
721  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
  ; First store: 8 bytes starting at byte offset 5, declared align 1.
722  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
723  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
724  store double %val, double addrspace(3)* %addr0, align 1
  ; Second store: 8 bytes starting at byte offset 9, overlapping the first.
725  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
726  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
727  store double %val, double addrspace(3)* %addr1, align 1
728  ret void
729}
730
; Stores two different doubles, 8 elements apart, into @lds.f64 with align 8.
; Both check prefixes expect the pair of stores to be emitted as a single
; ds_write2_b64 with offset1:8.
731define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
732; CI-LABEL: simple_write2_two_val_f64:
733; CI:       ; %bb.0:
734; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
735; CI-NEXT:    s_mov_b32 s3, 0xf000
736; CI-NEXT:    s_mov_b32 s2, 0
737; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
738; CI-NEXT:    v_mov_b32_e32 v1, 0
739; CI-NEXT:    s_waitcnt lgkmcnt(0)
740; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
741; CI-NEXT:    s_waitcnt vmcnt(0)
742; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
743; CI-NEXT:    s_waitcnt vmcnt(0)
744; CI-NEXT:    s_mov_b32 m0, -1
745; CI-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
746; CI-NEXT:    s_endpgm
747;
748; GFX9-LABEL: simple_write2_two_val_f64:
749; GFX9:       ; %bb.0:
750; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
751; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
752; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] glc
754; GFX9-NEXT:    s_waitcnt vmcnt(0)
755; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
756; GFX9-NEXT:    s_waitcnt vmcnt(0)
757; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
758; GFX9-NEXT:    s_endpgm
  ; Two volatile loads of adjacent doubles: %in[x] and %in[x + 1].
759  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
760  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
761  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
762  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
763  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
  ; First value goes to @lds.f64[x], second to @lds.f64[x + 8].
764  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
765  store double %val0, double addrspace(3)* %arrayidx0, align 8
766  %add.x = add nsw i32 %x.i, 8
767  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
768  store double %val1, double addrspace(3)* %arrayidx1, align 8
769  ret void
770}
771
; Four-element i32 array in addrspace(3), 4-byte aligned; written by the two
; store_constant_* kernels that follow.
772@foo = addrspace(3) global [4 x i32] undef, align 4
773
; Stores the constant 123 to the adjacent elements @foo[0] and @foo[1].
; Both check prefixes expect one ds_write_b64 whose two dwords are both 0x7b.
774define amdgpu_kernel void @store_constant_adjacent_offsets() {
775; CI-LABEL: store_constant_adjacent_offsets:
776; CI:       ; %bb.0:
777; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
778; CI-NEXT:    v_mov_b32_e32 v1, v0
779; CI-NEXT:    v_mov_b32_e32 v2, 0
780; CI-NEXT:    s_mov_b32 m0, -1
781; CI-NEXT:    ds_write_b64 v2, v[0:1]
782; CI-NEXT:    s_endpgm
783;
784; GFX9-LABEL: store_constant_adjacent_offsets:
785; GFX9:       ; %bb.0:
786; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
787; GFX9-NEXT:    v_mov_b32_e32 v1, v0
788; GFX9-NEXT:    v_mov_b32_e32 v2, 0
789; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
790; GFX9-NEXT:    s_endpgm
791  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
792  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
793  ret void
794}
795
; Stores the constant 123 to @foo[0] and @foo[2], leaving @foo[1] untouched.
; Both check prefixes expect one ds_write2_b32 with offset1:2 that uses the
; same data register for both halves.
796define amdgpu_kernel void @store_constant_disjoint_offsets() {
797; CI-LABEL: store_constant_disjoint_offsets:
798; CI:       ; %bb.0:
799; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
800; CI-NEXT:    v_mov_b32_e32 v1, 0
801; CI-NEXT:    s_mov_b32 m0, -1
802; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
803; CI-NEXT:    s_endpgm
804;
805; GFX9-LABEL: store_constant_disjoint_offsets:
806; GFX9:       ; %bb.0:
807; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
808; GFX9-NEXT:    v_mov_b32_e32 v1, 0
809; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
810; GFX9-NEXT:    s_endpgm
811  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
812  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
813  ret void
814}
815
; Four-element i64 array in addrspace(3) that is only 4-byte aligned.
816@bar = addrspace(3) global [4 x i64] undef, align 4
817
; Stores the i64 constant 123 to @bar[0] and @bar[1], each store declared
; align 4. Both check prefixes expect a single ds_write_b128 of v[0:3], where
; v0/v2 hold 0x7b and v1/v3 hold 0.
818define amdgpu_kernel void @store_misaligned64_constant_offsets() {
819; CI-LABEL: store_misaligned64_constant_offsets:
820; CI:       ; %bb.0:
821; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
822; CI-NEXT:    v_mov_b32_e32 v1, 0
823; CI-NEXT:    v_mov_b32_e32 v2, v0
824; CI-NEXT:    v_mov_b32_e32 v3, v1
825; CI-NEXT:    s_mov_b32 m0, -1
826; CI-NEXT:    ds_write_b128 v1, v[0:3]
827; CI-NEXT:    s_endpgm
828;
829; GFX9-LABEL: store_misaligned64_constant_offsets:
830; GFX9:       ; %bb.0:
831; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
832; GFX9-NEXT:    v_mov_b32_e32 v1, 0
833; GFX9-NEXT:    v_mov_b32_e32 v2, v0
834; GFX9-NEXT:    v_mov_b32_e32 v3, v1
835; GFX9-NEXT:    ds_write_b128 v1, v[0:3]
836; GFX9-NEXT:    s_endpgm
837  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
838  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
839  ret void
840}
841
; 4096-element i64 array in addrspace(3) that is only 4-byte aligned.
842@bar.large = addrspace(3) global [4096 x i64] undef, align 4
843
; Stores the i64 constant 123 to @bar.large[2048] and @bar.large[4095], each
; store declared align 4. Both check prefixes expect the stores to remain two
; separate ds_write_b64 instructions at byte offsets 16384 (2048 * 8) and
; 32760 (4095 * 8).
844define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
845; CI-LABEL: store_misaligned64_constant_large_offsets:
846; CI:       ; %bb.0:
847; CI-NEXT:    s_mov_b64 s[0:1], 0x7b
848; CI-NEXT:    v_mov_b32_e32 v0, s0
849; CI-NEXT:    v_mov_b32_e32 v2, 0
850; CI-NEXT:    v_mov_b32_e32 v1, s1
851; CI-NEXT:    s_mov_b32 m0, -1
852; CI-NEXT:    ds_write_b64 v2, v[0:1] offset:16384
853; CI-NEXT:    ds_write_b64 v2, v[0:1] offset:32760
854; CI-NEXT:    s_endpgm
855;
856; GFX9-LABEL: store_misaligned64_constant_large_offsets:
857; GFX9:       ; %bb.0:
858; GFX9-NEXT:    s_mov_b64 s[0:1], 0x7b
859; GFX9-NEXT:    v_mov_b32_e32 v0, s0
860; GFX9-NEXT:    v_mov_b32_e32 v2, 0
861; GFX9-NEXT:    v_mov_b32_e32 v1, s1
862; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:16384
863; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:32760
864; GFX9-NEXT:    s_endpgm
865  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
866  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
867  ret void
868}
869
; Two internal float arrays in addrspace(3) (264 and 776 elements, align 4)
; written by write2_sgemm_sequence below.
870@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
871@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
872
; Stores one float loaded from %in to ten locations: four in @sgemm.lA indexed
; from the work-group id x, and six in @sgemm.lB indexed from the work-item
; id y. The stores come in adjacent pairs (n, n + 1), and both check prefixes
; expect five ds_write2_b32 instructions, one per pair. The body does not
; reference %C, %lda or %ldb.
873define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
874; CI-LABEL: write2_sgemm_sequence:
875; CI:       ; %bb.0:
876; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
877; CI-NEXT:    s_mov_b32 m0, -1
878; CI-NEXT:    s_waitcnt lgkmcnt(0)
879; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
880; CI-NEXT:    s_lshl_b32 s1, s2, 2
881; CI-NEXT:    s_add_i32 s2, s1, 0xc20
882; CI-NEXT:    s_addk_i32 s1, 0xc60
883; CI-NEXT:    v_mov_b32_e32 v0, s2
884; CI-NEXT:    s_waitcnt lgkmcnt(0)
885; CI-NEXT:    v_mov_b32_e32 v2, s0
886; CI-NEXT:    v_mov_b32_e32 v3, s0
887; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
888; CI-NEXT:    v_mov_b32_e32 v0, s1
889; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
890; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
891; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
892; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
893; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
894; CI-NEXT:    s_endpgm
895;
896; GFX9-LABEL: write2_sgemm_sequence:
897; GFX9:       ; %bb.0:
898; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
899; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
900; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
902; GFX9-NEXT:    s_add_i32 s1, s2, 0xc20
903; GFX9-NEXT:    s_addk_i32 s2, 0xc60
904; GFX9-NEXT:    v_mov_b32_e32 v0, s1
905; GFX9-NEXT:    v_mov_b32_e32 v2, s2
906; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX9-NEXT:    v_mov_b32_e32 v3, s0
908; GFX9-NEXT:    v_mov_b32_e32 v4, s0
909; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
910; GFX9-NEXT:    ds_write2_b32 v2, v3, v4 offset1:1
911; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
912; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
913; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
914; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
915; GFX9-NEXT:    s_endpgm
  ; Note that %x.i is the work-group id here, not the work-item id.
916  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
917  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
  ; Single value, read from the first element of %in, reused by every store.
918  %val = load float, float addrspace(1)* %in
  ; @sgemm.lA is written at %x.i + {0, 1, 16, 17}.
919  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
920  store float %val, float addrspace(3)* %arrayidx44, align 4
921  %add47 = add nsw i32 %x.i, 1
922  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
923  store float %val, float addrspace(3)* %arrayidx48, align 4
924  %add51 = add nsw i32 %x.i, 16
925  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
926  store float %val, float addrspace(3)* %arrayidx52, align 4
927  %add55 = add nsw i32 %x.i, 17
928  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
929  store float %val, float addrspace(3)* %arrayidx56, align 4
  ; @sgemm.lB is written at %y.i + {0, 1, 32, 33, 64, 65}.
930  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
931  store float %val, float addrspace(3)* %arrayidx60, align 4
932  %add63 = add nsw i32 %y.i, 1
933  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
934  store float %val, float addrspace(3)* %arrayidx64, align 4
935  %add67 = add nsw i32 %y.i, 32
936  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
937  store float %val, float addrspace(3)* %arrayidx68, align 4
938  %add71 = add nsw i32 %y.i, 33
939  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
940  store float %val, float addrspace(3)* %arrayidx72, align 4
941  %add75 = add nsw i32 %y.i, 64
942  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
943  store float %val, float addrspace(3)* %arrayidx76, align 4
944  %add79 = add nsw i32 %y.i, 65
945  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
946  store float %val, float addrspace(3)* %arrayidx80, align 4
947  ret void
948}
949
; Stores a <4 x float> loaded from %in to %out[workitem.id.x], with both the
; load and the store declared align 4. Expected output differs per check
; prefix: the CI and GFX9-ALIGNED checks expect two ds_write2_b32 (dwords 0-1,
; then dwords 2-3), while the GFX9-UNALIGNED checks expect a single
; ds_write2_b64 with offset1:1.
950define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
951; CI-LABEL: simple_write2_v4f32_superreg_align4:
952; CI:       ; %bb.0:
953; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
954; CI-NEXT:    s_load_dword s4, s[0:1], 0x9
955; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
956; CI-NEXT:    s_mov_b32 m0, -1
957; CI-NEXT:    s_waitcnt lgkmcnt(0)
958; CI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
959; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
960; CI-NEXT:    s_waitcnt lgkmcnt(0)
961; CI-NEXT:    v_mov_b32_e32 v1, s0
962; CI-NEXT:    v_mov_b32_e32 v2, s1
963; CI-NEXT:    v_mov_b32_e32 v3, s2
964; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
965; CI-NEXT:    v_mov_b32_e32 v1, s3
966; CI-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
967; CI-NEXT:    s_endpgm
968;
969; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
970; GFX9-ALIGNED:       ; %bb.0:
971; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
972; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
973; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
975; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
976; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
978; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
979; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s2
980; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v4, s3
981; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
982; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
983; GFX9-ALIGNED-NEXT:    s_endpgm
984;
985; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
986; GFX9-UNALIGNED:       ; %bb.0:
987; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
988; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x24
989; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v4, v0, 4, s4
991; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
992; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
994; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s2
995; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
996; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s3
997; GFX9-UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
998; GFX9-UNALIGNED-NEXT:    s_endpgm
999  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; This getelementptr has no indices, so %in.gep addresses %in itself; every
  ; work-item loads the same vector.
1000  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
1001  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  ; The 16-byte vector store is declared only 4-byte aligned.
1002  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
1003  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
1004  ret void
1005}
1006
; Array of 100 <2 x i32> values in addrspace(3), declared with 1-byte alignment.
1007@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1008
; Stores the constant vector <123, 456> at byte offset 65 into @v2i32_align1
; with align 1. Expected output differs per check prefix: the CI and
; GFX9-ALIGNED checks expect eight ds_write_b8 covering offsets 65 through 72,
; while the GFX9-UNALIGNED checks expect one ds_write2_b32 from base address
; 0x41 (65) writing 0x7b (123) and 0x1c8 (456).
1009define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1010; CI-LABEL: write2_v2i32_align1_odd_offset:
1011; CI:       ; %bb.0: ; %entry
1012; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
1013; CI-NEXT:    v_mov_b32_e32 v1, 0
1014; CI-NEXT:    s_mov_b32 m0, -1
1015; CI-NEXT:    ds_write_b8 v1, v0 offset:65
1016; CI-NEXT:    v_mov_b32_e32 v0, 1
1017; CI-NEXT:    ds_write_b8 v1, v0 offset:70
1018; CI-NEXT:    v_mov_b32_e32 v0, 0xc8
1019; CI-NEXT:    ds_write_b8 v1, v0 offset:69
1020; CI-NEXT:    ds_write_b8 v1, v1 offset:68
1021; CI-NEXT:    ds_write_b8 v1, v1 offset:67
1022; CI-NEXT:    ds_write_b8 v1, v1 offset:66
1023; CI-NEXT:    ds_write_b8 v1, v1 offset:72
1024; CI-NEXT:    ds_write_b8 v1, v1 offset:71
1025; CI-NEXT:    s_endpgm
1026;
1027; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1028; GFX9-ALIGNED:       ; %bb.0: ; %entry
1029; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
1030; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
1031; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:65
1032; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 1
1033; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:70
1034; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0xc8
1035; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:69
1036; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:68
1037; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:67
1038; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:66
1039; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:72
1040; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:71
1041; GFX9-ALIGNED-NEXT:    s_endpgm
1042;
1043; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1044; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1045; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x41
1046; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x7b
1047; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x1c8
1048; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
1049; GFX9-UNALIGNED-NEXT:    s_endpgm
1050entry:
  ; The destination is formed as a constant expression: the array is viewed as
  ; i8, advanced by 65 bytes, then viewed as a <2 x i32> pointer.
1051  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
1052  ret void
1053}
1054
; Declarations of the AMDGPU work-group and work-item id intrinsics; each takes
; no arguments, returns an i32 and carries attribute group #1.
; NOTE(review): no call to workgroup.id.y is visible in this part of the file;
; confirm whether an earlier kernel uses it.
1055declare i32 @llvm.amdgcn.workgroup.id.x() #1
1056declare i32 @llvm.amdgcn.workgroup.id.y() #1
1057declare i32 @llvm.amdgcn.workitem.id.x() #1
1058declare i32 @llvm.amdgcn.workitem.id.y() #1
1059
; Attribute groups: #0 is attached to the kernels that take arguments, #1 to
; the intrinsic declarations and their call sites.
; NOTE(review): no use of #2 is visible in this part of the file; confirm that
; it is still referenced elsewhere.
1060attributes #0 = { nounwind }
1061attributes #1 = { nounwind readnone speculatable }
1062attributes #2 = { convergent nounwind }
1063