1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
5
; Module-level arrays in address space 3 that the kernels below store into:
; 512 floats (align 4) and 512 doubles (align 8). Both are left undef.
6@lds = addrspace(3) global [512 x float] undef, align 4
7@lds.f64 = addrspace(3) global [512 x double] undef, align 8
8
; Loads one float from %in[tid] and stores that same value to @lds[tid] and
; @lds[tid + 8]. The expected output for both targets is a single
; ds_write2_b32 that uses the loaded register for both data operands, with
; offset1 equal to the +8 element index.
9define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
10; CI-LABEL: simple_write2_one_val_f32:
11; CI:       ; %bb.0:
12; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
13; CI-NEXT:    s_mov_b32 s3, 0xf000
14; CI-NEXT:    s_mov_b32 s2, 0
15; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
16; CI-NEXT:    v_mov_b32_e32 v1, 0
17; CI-NEXT:    s_waitcnt lgkmcnt(0)
18; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
19; CI-NEXT:    s_mov_b32 m0, -1
20; CI-NEXT:    s_waitcnt vmcnt(0)
21; CI-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
22; CI-NEXT:    s_endpgm
23;
24; GFX9-LABEL: simple_write2_one_val_f32:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
27; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
28; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
30; GFX9-NEXT:    s_waitcnt vmcnt(0)
31; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
32; GFX9-NEXT:    s_endpgm
33  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
34  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
35  %val = load float, float addrspace(1)* %in.gep, align 4
36  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
37  store float %val, float addrspace(3)* %arrayidx0, align 4
38  %add.x = add nsw i32 %x.i, 8
39  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
40  store float %val, float addrspace(3)* %arrayidx1, align 4
41  ret void
42}
43
; Loads two adjacent floats (%in[tid] and the element after it) with volatile
; loads and stores them to @lds[tid] and @lds[tid + 8]. Only the global loads
; are volatile; the two LDS stores are plain, and the expected output pairs
; them into one ds_write2_b32 with offset1:8.
44define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
45; CI-LABEL: simple_write2_two_val_f32:
46; CI:       ; %bb.0:
47; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
48; CI-NEXT:    s_mov_b32 s3, 0xf000
49; CI-NEXT:    s_mov_b32 s2, 0
50; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
51; CI-NEXT:    v_mov_b32_e32 v1, 0
52; CI-NEXT:    s_waitcnt lgkmcnt(0)
53; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54; CI-NEXT:    s_waitcnt vmcnt(0)
55; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56; CI-NEXT:    s_waitcnt vmcnt(0)
57; CI-NEXT:    s_mov_b32 m0, -1
58; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
59; CI-NEXT:    s_endpgm
60;
61; GFX9-LABEL: simple_write2_two_val_f32:
62; GFX9:       ; %bb.0:
63; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
64; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
65; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
67; GFX9-NEXT:    s_waitcnt vmcnt(0)
68; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
71; GFX9-NEXT:    s_endpgm
72  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
74  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
75  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
76  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
77  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
78  store float %val0, float addrspace(3)* %arrayidx0, align 4
79  %add.x = add nsw i32 %x.i, 8
80  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
81  store float %val1, float addrspace(3)* %arrayidx1, align 4
82  ret void
83}
84
; Negative case: the FIRST LDS store (to @lds[tid]) is volatile, the second
; (to @lds[tid + 8]) is not. The expected output keeps two separate
; ds_write_b32 instructions instead of one ds_write2_b32; the second carries a
; byte offset of 32, i.e. 8 floats.
85define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
86; CI-LABEL: simple_write2_two_val_f32_volatile_0:
87; CI:       ; %bb.0:
88; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
89; CI-NEXT:    s_mov_b32 s7, 0xf000
90; CI-NEXT:    s_mov_b32 s6, 0
91; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
92; CI-NEXT:    v_mov_b32_e32 v1, 0
93; CI-NEXT:    s_waitcnt lgkmcnt(0)
94; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
95; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
96; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
97; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
98; CI-NEXT:    s_waitcnt vmcnt(0)
99; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
100; CI-NEXT:    s_waitcnt vmcnt(0)
101; CI-NEXT:    s_mov_b32 m0, -1
102; CI-NEXT:    ds_write_b32 v0, v2
103; CI-NEXT:    ds_write_b32 v0, v1 offset:32
104; CI-NEXT:    s_endpgm
105;
106; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
107; GFX9:       ; %bb.0:
108; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x8
109; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
112; GFX9-NEXT:    s_waitcnt vmcnt(0)
113; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:    ds_write_b32 v0, v1
116; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
117; GFX9-NEXT:    s_endpgm
118  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
119  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
120  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
121  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
122  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
123  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
124  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
125  %add.x = add nsw i32 %x.i, 8
126  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
127  store float %val1, float addrspace(3)* %arrayidx1, align 4
128  ret void
129}
130
; Negative case mirroring the volatile_0 variant: here the SECOND LDS store
; (to @lds[tid + 8]) is the volatile one. The expected output is again two
; separate ds_write_b32 instructions rather than a merged ds_write2_b32.
131define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
132; CI-LABEL: simple_write2_two_val_f32_volatile_1:
133; CI:       ; %bb.0:
134; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
135; CI-NEXT:    s_mov_b32 s7, 0xf000
136; CI-NEXT:    s_mov_b32 s6, 0
137; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
138; CI-NEXT:    v_mov_b32_e32 v1, 0
139; CI-NEXT:    s_waitcnt lgkmcnt(0)
140; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
141; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
142; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
143; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
144; CI-NEXT:    s_waitcnt vmcnt(0)
145; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
146; CI-NEXT:    s_waitcnt vmcnt(0)
147; CI-NEXT:    s_mov_b32 m0, -1
148; CI-NEXT:    ds_write_b32 v0, v2
149; CI-NEXT:    ds_write_b32 v0, v1 offset:32
150; CI-NEXT:    s_endpgm
151;
152; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x8
155; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
158; GFX9-NEXT:    s_waitcnt vmcnt(0)
159; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
160; GFX9-NEXT:    s_waitcnt vmcnt(0)
161; GFX9-NEXT:    ds_write_b32 v0, v1
162; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
163; GFX9-NEXT:    s_endpgm
164  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
165  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
166  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
167  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
168  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
169  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
170  store float %val0, float addrspace(3)* %arrayidx0, align 4
171  %add.x = add nsw i32 %x.i, 8
172  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
173  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
174  ret void
175}
176
177; 2 data subregisters from different super registers.
178; TODO: GFX9 has v_mov_b32_e32 v2, lds@abs32@lo
179;       This should be an s_mov_b32. The v_mov_b32 gets introduced by an
180;       early legalization of the constant bus constraint on the v_lshl_add_u32,
181;       and then SIFoldOperands folds in an unlucky order.
; Two volatile <2 x float> loads from adjacent vectors; element 0 of the first
; and element 1 of the second are stored to @lds[tid] and @lds[tid + 8]. The
; expected ds_write2_b32 therefore takes its two data operands from two
; different loaded register pairs.
182define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
183; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
184; CI:       ; %bb.0:
185; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
186; CI-NEXT:    s_mov_b32 s3, 0xf000
187; CI-NEXT:    s_mov_b32 s2, 0
188; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
189; CI-NEXT:    v_mov_b32_e32 v2, 0
190; CI-NEXT:    s_waitcnt lgkmcnt(0)
191; CI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192; CI-NEXT:    s_waitcnt vmcnt(0)
193; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194; CI-NEXT:    s_waitcnt vmcnt(0)
195; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
196; CI-NEXT:    s_mov_b32 m0, -1
197; CI-NEXT:    ds_write2_b32 v0, v3, v2 offset1:8
198; CI-NEXT:    s_endpgm
199;
200; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
201; GFX9:       ; %bb.0:
202; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
203; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
204; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX9-NEXT:    ; kill: killed $vgpr4
206; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX9-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1] glc
209; GFX9-NEXT:    s_waitcnt vmcnt(0)
210; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:8
213; GFX9-NEXT:    s_endpgm
214  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
215  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
216  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
217  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
218  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
219  %val0.0 = extractelement <2 x float> %val0, i32 0
220  %val1.1 = extractelement <2 x float> %val1, i32 1
221  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
222  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
223  %add.x = add nsw i32 %x.i, 8
224  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
225  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
226  ret void
227}
228
; A single (non-volatile) <2 x float> load whose two elements are stored to
; @lds[tid] and @lds[tid + 8]. The expected ds_write2_b32 uses both halves of
; the one loaded register pair as its data operands.
229define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
230; CI-LABEL: simple_write2_two_val_subreg2_f32:
231; CI:       ; %bb.0:
232; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
233; CI-NEXT:    s_mov_b32 s3, 0xf000
234; CI-NEXT:    s_mov_b32 s2, 0
235; CI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
236; CI-NEXT:    v_mov_b32_e32 v2, 0
237; CI-NEXT:    s_waitcnt lgkmcnt(0)
238; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
239; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
240; CI-NEXT:    s_mov_b32 m0, -1
241; CI-NEXT:    s_waitcnt vmcnt(0)
242; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
243; CI-NEXT:    s_endpgm
244;
245; GFX9-LABEL: simple_write2_two_val_subreg2_f32:
246; GFX9:       ; %bb.0:
247; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
248; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
249; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX9-NEXT:    global_load_dwordx2 v[1:2], v1, s[0:1]
252; GFX9-NEXT:    s_waitcnt vmcnt(0)
253; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
254; GFX9-NEXT:    s_endpgm
255  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
256  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
257  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
258  %val0 = extractelement <2 x float> %val, i32 0
259  %val1 = extractelement <2 x float> %val, i32 1
260  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
261  store float %val0, float addrspace(3)* %arrayidx0, align 4
262  %add.x = add nsw i32 %x.i, 8
263  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
264  store float %val1, float addrspace(3)* %arrayidx1, align 4
265  ret void
266}
267
; A single <4 x float> load; elements 0 and 3 are stored to @lds[tid] and
; @lds[tid + 8]. The expected ds_write2_b32 uses the first and last registers
; of the four-register load result as its data operands.
268define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
269; CI-LABEL: simple_write2_two_val_subreg4_f32:
270; CI:       ; %bb.0:
271; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
272; CI-NEXT:    s_mov_b32 s3, 0xf000
273; CI-NEXT:    s_mov_b32 s2, 0
274; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
275; CI-NEXT:    v_mov_b32_e32 v2, 0
276; CI-NEXT:    s_waitcnt lgkmcnt(0)
277; CI-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
278; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
279; CI-NEXT:    s_mov_b32 m0, -1
280; CI-NEXT:    s_waitcnt vmcnt(0)
281; CI-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
282; CI-NEXT:    s_endpgm
283;
284; GFX9-LABEL: simple_write2_two_val_subreg4_f32:
285; GFX9:       ; %bb.0:
286; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
287; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
288; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
289; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX9-NEXT:    global_load_dwordx4 v[1:4], v1, s[0:1]
291; GFX9-NEXT:    s_waitcnt vmcnt(0)
292; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
293; GFX9-NEXT:    s_endpgm
294  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
295  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
296  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
297  %val0 = extractelement <4 x float> %val, i32 0
298  %val1 = extractelement <4 x float> %val, i32 3
299  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
300  store float %val0, float addrspace(3)* %arrayidx0, align 4
301  %add.x = add nsw i32 %x.i, 8
302  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
303  store float %val1, float addrspace(3)* %arrayidx1, align 4
304  ret void
305}
306
; Same shape as simple_write2_two_val_f32, but the second store targets
; @lds[tid + 255]. The expected output still merges the stores into one
; ds_write2_b32, now with offset1:255.
; NOTE(review): the test name implies 255 is the largest offset that can be
; merged (presumably an 8-bit offset field) -- confirm against the ISA docs.
307define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
308; CI-LABEL: simple_write2_two_val_max_offset_f32:
309; CI:       ; %bb.0:
310; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
311; CI-NEXT:    s_mov_b32 s3, 0xf000
312; CI-NEXT:    s_mov_b32 s2, 0
313; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
314; CI-NEXT:    v_mov_b32_e32 v1, 0
315; CI-NEXT:    s_waitcnt lgkmcnt(0)
316; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
317; CI-NEXT:    s_waitcnt vmcnt(0)
318; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
319; CI-NEXT:    s_waitcnt vmcnt(0)
320; CI-NEXT:    s_mov_b32 m0, -1
321; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:255
322; CI-NEXT:    s_endpgm
323;
324; GFX9-LABEL: simple_write2_two_val_max_offset_f32:
325; GFX9:       ; %bb.0:
326; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
327; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:4 glc
332; GFX9-NEXT:    s_waitcnt vmcnt(0)
333; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
334; GFX9-NEXT:    s_endpgm
335  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
336  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
337  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
338  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
339  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
340  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
341  store float %val0, float addrspace(3)* %arrayidx0, align 4
342  %add.x = add nsw i32 %x.i, 255
343  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
344  store float %val1, float addrspace(3)* %arrayidx1, align 4
345  ret void
346}
347
; Negative case: the second store targets @lds[tid + 257]. The expected output
; does not merge; it keeps two ds_write_b32 instructions, the second with a
; byte offset of 1028 (257 floats).
348define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
349; CI-LABEL: simple_write2_two_val_too_far_f32:
350; CI:       ; %bb.0:
351; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
352; CI-NEXT:    s_mov_b32 s7, 0xf000
353; CI-NEXT:    s_mov_b32 s6, 0
354; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
355; CI-NEXT:    v_mov_b32_e32 v1, 0
356; CI-NEXT:    s_waitcnt lgkmcnt(0)
357; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
358; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
359; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
360; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
361; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
362; CI-NEXT:    s_mov_b32 m0, -1
363; CI-NEXT:    s_waitcnt vmcnt(1)
364; CI-NEXT:    ds_write_b32 v0, v2
365; CI-NEXT:    s_waitcnt vmcnt(0)
366; CI-NEXT:    ds_write_b32 v0, v1 offset:1028
367; CI-NEXT:    s_endpgm
368;
369; GFX9-LABEL: simple_write2_two_val_too_far_f32:
370; GFX9:       ; %bb.0:
371; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x8
372; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
375; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
376; GFX9-NEXT:    s_waitcnt vmcnt(1)
377; GFX9-NEXT:    ds_write_b32 v0, v1
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
380; GFX9-NEXT:    s_endpgm
381  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
382  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
383  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
384  %val0 = load float, float addrspace(1)* %in0.gep, align 4
385  %val1 = load float, float addrspace(1)* %in1.gep, align 4
386  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
387  store float %val0, float addrspace(3)* %arrayidx0, align 4
388  %add.x = add nsw i32 %x.i, 257
389  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
390  store float %val1, float addrspace(3)* %arrayidx1, align 4
391  ret void
392}
393
; Four LDS stores off the same tid base, at element indices +0, +8, +11 and
; +27, alternating the two loaded values (val0, val1, val0, val1). The
; expected output is two ds_write2_b32 instructions: one pairing +0/+8 and one
; pairing +11/+27.
394define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
395; CI-LABEL: simple_write2_two_val_f32_x2:
396; CI:       ; %bb.0:
397; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
398; CI-NEXT:    s_mov_b32 s7, 0xf000
399; CI-NEXT:    s_mov_b32 s6, 0
400; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
401; CI-NEXT:    v_mov_b32_e32 v1, 0
402; CI-NEXT:    s_waitcnt lgkmcnt(0)
403; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
404; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
405; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
406; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
407; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
408; CI-NEXT:    s_mov_b32 m0, -1
409; CI-NEXT:    s_waitcnt vmcnt(0)
410; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset1:8
411; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
412; CI-NEXT:    s_endpgm
413;
414; GFX9-LABEL: simple_write2_two_val_f32_x2:
415; GFX9:       ; %bb.0:
416; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x8
417; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
420; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
421; GFX9-NEXT:    s_waitcnt vmcnt(0)
422; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
423; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
424; GFX9-NEXT:    s_endpgm
425  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
426  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
427  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
428  %val0 = load float, float addrspace(1)* %in0.gep, align 4
429  %val1 = load float, float addrspace(1)* %in1.gep, align 4
430
431  %idx.0 = add nsw i32 %tid.x, 0
432  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
433  store float %val0, float addrspace(3)* %arrayidx0, align 4
434
435  %idx.1 = add nsw i32 %tid.x, 8
436  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
437  store float %val1, float addrspace(3)* %arrayidx1, align 4
438
439  %idx.2 = add nsw i32 %tid.x, 11
440  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
441  store float %val0, float addrspace(3)* %arrayidx2, align 4
442
443  %idx.3 = add nsw i32 %tid.x, 27
444  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
445  store float %val1, float addrspace(3)* %arrayidx3, align 4
446
447  ret void
448}
449
; Variant of simple_write2_two_val_f32_x2 where the first store is at element
; index +3 instead of +0, so neither store of the first pair sits at the base
; address. The expected output is two ds_write2_b32 instructions with
; offset0:3/offset1:8 and offset0:11/offset1:27.
450define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
451; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
452; CI:       ; %bb.0:
453; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
454; CI-NEXT:    s_mov_b32 s7, 0xf000
455; CI-NEXT:    s_mov_b32 s6, 0
456; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
457; CI-NEXT:    v_mov_b32_e32 v1, 0
458; CI-NEXT:    s_waitcnt lgkmcnt(0)
459; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
460; CI-NEXT:    s_mov_b64 s[0:1], s[2:3]
461; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
462; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
463; CI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
464; CI-NEXT:    s_mov_b32 m0, -1
465; CI-NEXT:    s_waitcnt vmcnt(0)
466; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:3 offset1:8
467; CI-NEXT:    ds_write2_b32 v0, v2, v1 offset0:11 offset1:27
468; CI-NEXT:    s_endpgm
469;
470; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
471; GFX9:       ; %bb.0:
472; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x8
473; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
474; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
476; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
477; GFX9-NEXT:    s_waitcnt vmcnt(0)
478; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
479; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
480; GFX9-NEXT:    s_endpgm
481  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
482  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
483  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
484  %val0 = load float, float addrspace(1)* %in0.gep, align 4
485  %val1 = load float, float addrspace(1)* %in1.gep, align 4
486
487  %idx.0 = add nsw i32 %tid.x, 3
488  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
489  store float %val0, float addrspace(3)* %arrayidx0, align 4
490
491  %idx.1 = add nsw i32 %tid.x, 8
492  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
493  store float %val1, float addrspace(3)* %arrayidx1, align 4
494
495  %idx.2 = add nsw i32 %tid.x, 11
496  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
497  store float %val0, float addrspace(3)* %arrayidx2, align 4
498
499  %idx.3 = add nsw i32 %tid.x, 27
500  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
501  store float %val1, float addrspace(3)* %arrayidx3, align 4
502
503  ret void
504}
505
; The two LDS destinations come from the two lanes of a <2 x pointer> kernel
; argument (%lds.ptr) rather than from a single base. The expected output is
; two separate ds_write_b32 instructions, each with its own address register
; and a byte offset of 32; no ds_write2_b32 is formed.
; NOTE(review): %index.1 inserts the constant 8 into lane 0 again, replacing
; %x.i and leaving lane 1 of the index vector undef; %add.x is computed but
; never used. The assertions were generated from this exact IR, so any change
; here requires regenerating them.
506define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
507; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
508; CI:       ; %bb.0:
509; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
510; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x6
511; CI-NEXT:    s_mov_b32 s3, 0xf000
512; CI-NEXT:    s_mov_b32 s2, 0
513; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
514; CI-NEXT:    s_waitcnt lgkmcnt(0)
515; CI-NEXT:    s_mov_b64 s[0:1], s[4:5]
516; CI-NEXT:    v_mov_b32_e32 v1, 0
517; CI-NEXT:    s_mov_b64 s[4:5], s[6:7]
518; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
519; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
520; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
521; CI-NEXT:    v_mov_b32_e32 v1, s8
522; CI-NEXT:    s_mov_b32 m0, -1
523; CI-NEXT:    v_mov_b32_e32 v3, s9
524; CI-NEXT:    s_waitcnt vmcnt(1)
525; CI-NEXT:    ds_write_b32 v1, v2 offset:32
526; CI-NEXT:    s_waitcnt vmcnt(0)
527; CI-NEXT:    ds_write_b32 v3, v0 offset:32
528; CI-NEXT:    s_endpgm
529;
530; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
531; GFX9:       ; %bb.0:
532; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
533; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x18
534; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
535; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
537; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
538; GFX9-NEXT:    v_mov_b32_e32 v0, s2
539; GFX9-NEXT:    v_mov_b32_e32 v3, s3
540; GFX9-NEXT:    s_waitcnt vmcnt(1)
541; GFX9-NEXT:    ds_write_b32 v0, v1 offset:32
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    ds_write_b32 v3, v2 offset:32
544; GFX9-NEXT:    s_endpgm
545  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
546  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
547  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
548  %val0 = load float, float addrspace(1)* %in0.gep, align 4
549  %val1 = load float, float addrspace(1)* %in1.gep, align 4
550
551  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
552  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
553  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
554  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
555  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
556
557  ; Apply an additional offset after the vector that will be more obviously folded.
558  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
559  store float %val0, float addrspace(3)* %gep.0, align 4
560
561  %add.x = add nsw i32 %x.i, 8
562  store float %val1, float addrspace(3)* %gep.1.offset, align 4
563  ret void
564}
565
; 64-bit counterpart of simple_write2_one_val_f32: one double is loaded from
; %in[tid] and stored (align 8) to @lds.f64[tid] and @lds.f64[tid + 8]. The
; expected output is a single ds_write2_b64 with offset1:8 that uses the same
; register pair for both data operands.
566define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
567; CI-LABEL: simple_write2_one_val_f64:
568; CI:       ; %bb.0:
569; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
570; CI-NEXT:    s_mov_b32 s3, 0xf000
571; CI-NEXT:    s_mov_b32 s2, 0
572; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
573; CI-NEXT:    v_mov_b32_e32 v1, 0
574; CI-NEXT:    s_waitcnt lgkmcnt(0)
575; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
576; CI-NEXT:    s_mov_b32 m0, -1
577; CI-NEXT:    s_waitcnt vmcnt(0)
578; CI-NEXT:    ds_write2_b64 v0, v[1:2], v[1:2] offset1:8
579; CI-NEXT:    s_endpgm
580;
581; GFX9-LABEL: simple_write2_one_val_f64:
582; GFX9:       ; %bb.0:
583; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
584; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
587; GFX9-NEXT:    s_waitcnt vmcnt(0)
588; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
589; GFX9-NEXT:    s_endpgm
590  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
591  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
592  %val = load double, double addrspace(1)* %in.gep, align 8
593  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
594  store double %val, double addrspace(3)* %arrayidx0, align 8
595  %add.x = add nsw i32 %x.i, 8
596  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
597  store double %val, double addrspace(3)* %arrayidx1, align 8
598  ret void
599}
600
; Stores one double to %lds[tid] and %lds[tid + 7] through a pointer kernel
; argument, with both stores marked align 4 only. The expected output contains
; no 64-bit DS write: each double is written as two dwords by a ds_write2_b32,
; giving dword offsets 0/1 for the first store and 14/15 for the second.
601define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
602; CI-LABEL: misaligned_simple_write2_one_val_f64:
603; CI:       ; %bb.0:
604; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2
605; CI-NEXT:    s_load_dword s0, s[0:1], 0x4
606; CI-NEXT:    s_mov_b32 s7, 0xf000
607; CI-NEXT:    s_mov_b32 s6, 0
608; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
609; CI-NEXT:    v_mov_b32_e32 v1, 0
610; CI-NEXT:    s_waitcnt lgkmcnt(0)
611; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
612; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
613; CI-NEXT:    s_mov_b32 m0, -1
614; CI-NEXT:    s_waitcnt vmcnt(0)
615; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
616; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:14 offset1:15
617; CI-NEXT:    s_endpgm
618;
619; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
620; GFX9:       ; %bb.0:
621; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
622; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
623; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
626; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
627; GFX9-NEXT:    s_waitcnt vmcnt(0)
628; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
629; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
630; GFX9-NEXT:    s_endpgm
631  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
632  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
633  %val = load double, double addrspace(1)* %in.gep, align 8
634  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
635  store double %val, double addrspace(3)* %arrayidx0, align 4
636  %add.x = add nsw i32 %x.i, 7
637  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
638  store double %val, double addrspace(3)* %arrayidx1, align 4
639  ret void
640}
641
; Two align-1 stores of the same loaded double into %lds, at byte offsets 5 and
; 9 from the element selected by the workitem id; the two 8-byte ranges overlap.
; The CI and GFX9-ALIGNED checks expect only byte-wise stores (ds_write_b8, plus
; ds_write_b8_d16_hi on GFX9) covering byte offsets 5..16. The GFX9-UNALIGNED
; checks expect two ds_write_b64 stores at offsets 5 and 9. No ds_write2
; instruction is expected under any prefix.
642define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
643; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
644; CI:       ; %bb.0:
645; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2
646; CI-NEXT:    s_load_dword s0, s[0:1], 0x4
647; CI-NEXT:    s_mov_b32 s7, 0xf000
648; CI-NEXT:    s_mov_b32 s6, 0
649; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
650; CI-NEXT:    v_mov_b32_e32 v1, 0
651; CI-NEXT:    s_waitcnt lgkmcnt(0)
652; CI-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
653; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
654; CI-NEXT:    s_mov_b32 m0, -1
655; CI-NEXT:    s_waitcnt vmcnt(0)
656; CI-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
657; CI-NEXT:    ds_write_b8 v0, v1 offset:5
658; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
659; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
660; CI-NEXT:    ds_write_b8 v0, v2 offset:13
661; CI-NEXT:    ds_write_b8 v0, v1 offset:9
662; CI-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
663; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
664; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
665; CI-NEXT:    ds_write_b8 v0, v3 offset:8
666; CI-NEXT:    ds_write_b8 v0, v4 offset:7
667; CI-NEXT:    ds_write_b8 v0, v5 offset:6
668; CI-NEXT:    ds_write_b8 v0, v1 offset:16
669; CI-NEXT:    ds_write_b8 v0, v6 offset:15
670; CI-NEXT:    ds_write_b8 v0, v2 offset:14
671; CI-NEXT:    ds_write_b8 v0, v3 offset:12
672; CI-NEXT:    ds_write_b8 v0, v4 offset:11
673; CI-NEXT:    ds_write_b8 v0, v5 offset:10
674; CI-NEXT:    s_endpgm
675;
676; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
677; GFX9-ALIGNED:       ; %bb.0:
678; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
679; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x10
680; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
681; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX9-ALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
683; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
684; GFX9-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
685; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:7
686; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:5
687; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
688; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
689; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v1 offset:15
690; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:13
691; GFX9-ALIGNED-NEXT:    ds_write_b8_d16_hi v2, v0 offset:11
692; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:9
693; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
694; GFX9-ALIGNED-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
695; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:8
696; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:6
697; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:16
698; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v1 offset:14
699; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v3 offset:12
700; GFX9-ALIGNED-NEXT:    ds_write_b8 v2, v4 offset:10
701; GFX9-ALIGNED-NEXT:    s_endpgm
702;
703; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
704; GFX9-UNALIGNED:       ; %bb.0:
705; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
706; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x10
707; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
708; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
710; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s4, v2
711; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
712; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:5
713; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:9
714; GFX9-UNALIGNED-NEXT:    s_endpgm
715  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
716  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
717  %val = load double, double addrspace(1)* %in.gep, align 8
718  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
719  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
720  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
721  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
722  store double %val, double addrspace(3)* %addr0, align 1
723  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
724  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
725  store double %val, double addrspace(3)* %addr1, align 1
726  ret void
727}
728
; Two volatile loads of adjacent doubles from %in are stored, with align 8, to
; @lds.f64 at element indices %x.i and %x.i + 8. Under both check prefixes the
; two stores are expected to be emitted as one ds_write2_b64 with offset1:8.
729define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
730; CI-LABEL: simple_write2_two_val_f64:
731; CI:       ; %bb.0:
732; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
733; CI-NEXT:    s_mov_b32 s3, 0xf000
734; CI-NEXT:    s_mov_b32 s2, 0
735; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
736; CI-NEXT:    v_mov_b32_e32 v1, 0
737; CI-NEXT:    s_waitcnt lgkmcnt(0)
738; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
739; CI-NEXT:    s_waitcnt vmcnt(0)
740; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
741; CI-NEXT:    s_waitcnt vmcnt(0)
742; CI-NEXT:    s_mov_b32 m0, -1
743; CI-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
744; CI-NEXT:    s_endpgm
745;
746; GFX9-LABEL: simple_write2_two_val_f64:
747; GFX9:       ; %bb.0:
748; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
749; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
750; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] glc
752; GFX9-NEXT:    s_waitcnt vmcnt(0)
753; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
754; GFX9-NEXT:    s_waitcnt vmcnt(0)
755; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
756; GFX9-NEXT:    s_endpgm
757  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
758  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
759  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
760  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
761  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
762  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
763  store double %val0, double addrspace(3)* %arrayidx0, align 8
764  %add.x = add nsw i32 %x.i, 8
765  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
766  store double %val1, double addrspace(3)* %arrayidx1, align 8
767  ret void
768}
769
; Four-element i32 array in addrspace(3), align 4. It is the store target of
; @store_constant_adjacent_offsets and @store_constant_disjoint_offsets.
770@foo = addrspace(3) global [4 x i32] undef, align 4
771
; Stores the constant 123 (0x7b) to elements 0 and 1 of @foo, i.e. two adjacent
; dwords. Both check prefixes expect a single ds_write_b64 of a register pair in
; which each half holds 0x7b, rather than a ds_write2_b32.
772define amdgpu_kernel void @store_constant_adjacent_offsets() {
773; CI-LABEL: store_constant_adjacent_offsets:
774; CI:       ; %bb.0:
775; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
776; CI-NEXT:    v_mov_b32_e32 v1, v0
777; CI-NEXT:    v_mov_b32_e32 v2, 0
778; CI-NEXT:    s_mov_b32 m0, -1
779; CI-NEXT:    ds_write_b64 v2, v[0:1]
780; CI-NEXT:    s_endpgm
781;
782; GFX9-LABEL: store_constant_adjacent_offsets:
783; GFX9:       ; %bb.0:
784; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
785; GFX9-NEXT:    v_mov_b32_e32 v1, v0
786; GFX9-NEXT:    v_mov_b32_e32 v2, 0
787; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
788; GFX9-NEXT:    s_endpgm
789  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
790  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
791  ret void
792}
793
; Stores the constant 123 (0x7b) to elements 0 and 2 of @foo, leaving element 1
; untouched. Both check prefixes expect the two non-adjacent dword stores to be
; emitted as one ds_write2_b32 with offset1:2.
794define amdgpu_kernel void @store_constant_disjoint_offsets() {
795; CI-LABEL: store_constant_disjoint_offsets:
796; CI:       ; %bb.0:
797; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
798; CI-NEXT:    v_mov_b32_e32 v1, 0
799; CI-NEXT:    s_mov_b32 m0, -1
800; CI-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
801; CI-NEXT:    s_endpgm
802;
803; GFX9-LABEL: store_constant_disjoint_offsets:
804; GFX9:       ; %bb.0:
805; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
806; GFX9-NEXT:    v_mov_b32_e32 v1, 0
807; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
808; GFX9-NEXT:    s_endpgm
809  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
810  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
811  ret void
812}
813
; Four-element i64 array in addrspace(3) declared with align 4, i.e. less than
; the 8-byte size of the i64 elements stored to it by
; @store_misaligned64_constant_offsets.
814@bar = addrspace(3) global [4 x i64] undef, align 4
815
; Stores the i64 constant 123 to elements 0 and 1 of @bar, each store marked
; align 4. Both check prefixes expect a single ds_write_b128 of v[0:3], where
; v0/v2 hold 0x7b (low halves) and v1/v3 hold 0 (high halves).
816define amdgpu_kernel void @store_misaligned64_constant_offsets() {
817; CI-LABEL: store_misaligned64_constant_offsets:
818; CI:       ; %bb.0:
819; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
820; CI-NEXT:    v_mov_b32_e32 v1, 0
821; CI-NEXT:    v_mov_b32_e32 v2, v0
822; CI-NEXT:    v_mov_b32_e32 v3, v1
823; CI-NEXT:    s_mov_b32 m0, -1
824; CI-NEXT:    ds_write_b128 v1, v[0:3]
825; CI-NEXT:    s_endpgm
826;
827; GFX9-LABEL: store_misaligned64_constant_offsets:
828; GFX9:       ; %bb.0:
829; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
830; GFX9-NEXT:    v_mov_b32_e32 v1, 0
831; GFX9-NEXT:    v_mov_b32_e32 v2, v0
832; GFX9-NEXT:    v_mov_b32_e32 v3, v1
833; GFX9-NEXT:    ds_write_b128 v1, v[0:3]
834; GFX9-NEXT:    s_endpgm
835  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
836  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
837  ret void
838}
839
; 4096-element i64 array (32768 bytes) in addrspace(3), align 4. It is the store
; target of @store_misaligned64_constant_large_offsets.
840@bar.large = addrspace(3) global [4096 x i64] undef, align 4
841
; Stores the i64 constant 123 to elements 2048 and 4095 of @bar.large, each
; store marked align 4. Both check prefixes expect two separate ds_write_b64
; instructions at byte offsets 16384 (2048 * 8) and 32760 (4095 * 8); the stores
; are not expected to be combined into a ds_write2.
842define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
843; CI-LABEL: store_misaligned64_constant_large_offsets:
844; CI:       ; %bb.0:
845; CI-NEXT:    s_mov_b64 s[0:1], 0x7b
846; CI-NEXT:    v_mov_b32_e32 v0, s0
847; CI-NEXT:    v_mov_b32_e32 v2, 0
848; CI-NEXT:    v_mov_b32_e32 v1, s1
849; CI-NEXT:    s_mov_b32 m0, -1
850; CI-NEXT:    ds_write_b64 v2, v[0:1] offset:16384
851; CI-NEXT:    ds_write_b64 v2, v[0:1] offset:32760
852; CI-NEXT:    s_endpgm
853;
854; GFX9-LABEL: store_misaligned64_constant_large_offsets:
855; GFX9:       ; %bb.0:
856; GFX9-NEXT:    s_mov_b64 s[0:1], 0x7b
857; GFX9-NEXT:    v_mov_b32_e32 v0, s0
858; GFX9-NEXT:    v_mov_b32_e32 v2, 0
859; GFX9-NEXT:    v_mov_b32_e32 v1, s1
860; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:16384
861; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:32760
862; GFX9-NEXT:    s_endpgm
863  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
864  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
865  ret void
866}
867
; Two internal float arrays in addrspace(3) (264 and 776 elements, align 4)
; written by @write2_sgemm_sequence.
868@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
869@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
870
; Stores one float loaded from %in ten times:
;   - to @sgemm.lA at indices x, x+1, x+16 and x+17, where x (%x.i) is the
;     workgroup id in x (note: workgroup id, despite the %x.i name);
;   - to @sgemm.lB at indices y, y+1, y+32, y+33, y+64 and y+65, where y (%y.i)
;     is the workitem id in y.
; Both check prefixes expect exactly five ds_write2_b32 instructions, each one
; covering a pair of neighbouring indices: (x, x+1), (x+16, x+17), (y, y+1),
; (y+32, y+33) and (y+64, y+65). The lA pairs use separately materialized base
; addresses; the lB pairs share one base and differ only in offset0/offset1.
871define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
872; CI-LABEL: write2_sgemm_sequence:
873; CI:       ; %bb.0:
874; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4
875; CI-NEXT:    s_mov_b32 m0, -1
876; CI-NEXT:    s_waitcnt lgkmcnt(0)
877; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
878; CI-NEXT:    s_lshl_b32 s1, s2, 2
879; CI-NEXT:    s_add_i32 s2, s1, 0xc20
880; CI-NEXT:    s_addk_i32 s1, 0xc60
881; CI-NEXT:    v_mov_b32_e32 v0, s2
882; CI-NEXT:    s_waitcnt lgkmcnt(0)
883; CI-NEXT:    v_mov_b32_e32 v2, s0
884; CI-NEXT:    v_mov_b32_e32 v3, s0
885; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
886; CI-NEXT:    v_mov_b32_e32 v0, s1
887; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
888; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
889; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
890; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:32 offset1:33
891; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:64 offset1:65
892; CI-NEXT:    s_endpgm
893;
894; GFX9-LABEL: write2_sgemm_sequence:
895; GFX9:       ; %bb.0:
896; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x10
897; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
898; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
900; GFX9-NEXT:    s_add_i32 s1, s2, 0xc20
901; GFX9-NEXT:    s_addk_i32 s2, 0xc60
902; GFX9-NEXT:    v_mov_b32_e32 v0, s1
903; GFX9-NEXT:    v_mov_b32_e32 v2, s2
904; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX9-NEXT:    v_mov_b32_e32 v3, s0
906; GFX9-NEXT:    v_mov_b32_e32 v4, s0
907; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
908; GFX9-NEXT:    ds_write2_b32 v2, v3, v4 offset1:1
909; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v1
910; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
911; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:32 offset1:33
912; GFX9-NEXT:    ds_write2_b32 v0, v3, v4 offset0:64 offset1:65
913; GFX9-NEXT:    s_endpgm
914  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
915  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
916  %val = load float, float addrspace(1)* %in
917  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
918  store float %val, float addrspace(3)* %arrayidx44, align 4
919  %add47 = add nsw i32 %x.i, 1
920  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
921  store float %val, float addrspace(3)* %arrayidx48, align 4
922  %add51 = add nsw i32 %x.i, 16
923  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
924  store float %val, float addrspace(3)* %arrayidx52, align 4
925  %add55 = add nsw i32 %x.i, 17
926  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
927  store float %val, float addrspace(3)* %arrayidx56, align 4
928  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
929  store float %val, float addrspace(3)* %arrayidx60, align 4
930  %add63 = add nsw i32 %y.i, 1
931  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
932  store float %val, float addrspace(3)* %arrayidx64, align 4
933  %add67 = add nsw i32 %y.i, 32
934  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
935  store float %val, float addrspace(3)* %arrayidx68, align 4
936  %add71 = add nsw i32 %y.i, 33
937  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
938  store float %val, float addrspace(3)* %arrayidx72, align 4
939  %add75 = add nsw i32 %y.i, 64
940  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
941  store float %val, float addrspace(3)* %arrayidx76, align 4
942  %add79 = add nsw i32 %y.i, 65
943  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
944  store float %val, float addrspace(3)* %arrayidx80, align 4
945  ret void
946}
947
; Loads a <4 x float> from %in with align 4 and stores it, also with align 4, to
; element %x.i of %out. %in.gep is a getelementptr with no indices, so it
; addresses the same location as %in.
; Every check prefix expects the 16-byte store to be split into two
; ds_write2_b32 instructions (offset1:1 and offset0:2 offset1:3). The
; GFX9-UNALIGNED expectations differ from the GFX9-ALIGNED ones only in register
; assignment and in emitting the offset0:2 pair first.
948define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
949; CI-LABEL: simple_write2_v4f32_superreg_align4:
950; CI:       ; %bb.0:
951; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
952; CI-NEXT:    s_load_dword s4, s[0:1], 0x0
953; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
954; CI-NEXT:    s_mov_b32 m0, -1
955; CI-NEXT:    s_waitcnt lgkmcnt(0)
956; CI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
957; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
958; CI-NEXT:    s_waitcnt lgkmcnt(0)
959; CI-NEXT:    v_mov_b32_e32 v1, s0
960; CI-NEXT:    v_mov_b32_e32 v2, s1
961; CI-NEXT:    v_mov_b32_e32 v3, s2
962; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
963; CI-NEXT:    v_mov_b32_e32 v1, s3
964; CI-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
965; CI-NEXT:    s_endpgm
966;
967; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
968; GFX9-ALIGNED:       ; %bb.0:
969; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
970; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x0
971; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
973; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
974; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
975; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
976; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
977; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v3, s2
978; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v4, s3
979; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
980; GFX9-ALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
981; GFX9-ALIGNED-NEXT:    s_endpgm
982;
983; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
984; GFX9-UNALIGNED:       ; %bb.0:
985; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
986; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x0
987; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
989; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
990; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s2
992; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s3
993; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v3, s0
994; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
995; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
996; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
997; GFX9-UNALIGNED-NEXT:    s_endpgm
998  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
999  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
1000  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
1001  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
1002  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
1003  ret void
1004}
1005
; 100-element array of <2 x i32> in addrspace(3) declared with align 1. It is
; the store target of @write2_v2i32_align1_odd_offset.
1006@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1007
; Stores the constant vector <123, 456> (0x7b, 0x1c8) with align 1 at byte
; offset 65 into @v2i32_align1. The CI and GFX9-ALIGNED checks expect eight
; ds_write_b8 stores covering byte offsets 65..72 (bytes 0x7b,0,0,0,0xc8,1,0,0).
; The GFX9-UNALIGNED checks expect a single ds_write_b64 at offset 65. No
; ds_write2 instruction is expected under any prefix.
1008define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
1009; CI-LABEL: write2_v2i32_align1_odd_offset:
1010; CI:       ; %bb.0: ; %entry
1011; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
1012; CI-NEXT:    v_mov_b32_e32 v1, 0
1013; CI-NEXT:    s_mov_b32 m0, -1
1014; CI-NEXT:    ds_write_b8 v1, v0 offset:65
1015; CI-NEXT:    v_mov_b32_e32 v0, 1
1016; CI-NEXT:    ds_write_b8 v1, v0 offset:70
1017; CI-NEXT:    v_mov_b32_e32 v0, 0xc8
1018; CI-NEXT:    ds_write_b8 v1, v0 offset:69
1019; CI-NEXT:    ds_write_b8 v1, v1 offset:68
1020; CI-NEXT:    ds_write_b8 v1, v1 offset:67
1021; CI-NEXT:    ds_write_b8 v1, v1 offset:66
1022; CI-NEXT:    ds_write_b8 v1, v1 offset:72
1023; CI-NEXT:    ds_write_b8 v1, v1 offset:71
1024; CI-NEXT:    s_endpgm
1025;
1026; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1027; GFX9-ALIGNED:       ; %bb.0: ; %entry
1028; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
1029; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
1030; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:65
1031; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 1
1032; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:70
1033; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0xc8
1034; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:69
1035; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:68
1036; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:67
1037; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:66
1038; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:72
1039; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:71
1040; GFX9-ALIGNED-NEXT:    s_endpgm
1041;
1042; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
1043; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1044; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
1045; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, 0x1c8
1046; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
1047; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:65
1048; GFX9-UNALIGNED-NEXT:    s_endpgm
1049entry:
1050  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
1051  ret void
1052}
1053
; Declarations of the workgroup-id and workitem-id intrinsics (x and y
; components), each returning i32 and carrying attribute group #1.
1054declare i32 @llvm.amdgcn.workgroup.id.x() #1
1055declare i32 @llvm.amdgcn.workgroup.id.y() #1
1056declare i32 @llvm.amdgcn.workitem.id.x() #1
1057declare i32 @llvm.amdgcn.workitem.id.y() #1
1058
; Attribute groups: #0 is attached to kernel definitions that take arguments,
; #1 to the intrinsic declarations above and to their call sites.
1059attributes #0 = { nounwind }
1060attributes #1 = { nounwind readnone speculatable }
1061attributes #2 = { convergent nounwind }
1062