1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
6
; Kernel whose element index comes from the i32 kernel argument %idx.
; It does a volatile store of 15 into element %idx of a [32 x float]
; addrspace(5) alloca, then a volatile load from element (%idx & 15) of the
; same alloca. The autogenerated assertions inside the function pin the
; instruction sequence expected from each llc RUN configuration.
7define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8; GFX9-LABEL: store_load_sindex_kernel:
9; GFX9:       ; %bb.0: ; %bb
10; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
11; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
12; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
13; GFX9-NEXT:    v_mov_b32_e32 v0, 15
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
16; GFX9-NEXT:    s_and_b32 s0, s0, 15
17; GFX9-NEXT:    s_add_i32 s1, s1, 4
18; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
19; GFX9-NEXT:    scratch_store_dword off, v0, s1
20; GFX9-NEXT:    s_waitcnt vmcnt(0)
21; GFX9-NEXT:    s_add_i32 s0, s0, 4
22; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
23; GFX9-NEXT:    s_waitcnt vmcnt(0)
24; GFX9-NEXT:    s_endpgm
25;
26; GFX10-LABEL: store_load_sindex_kernel:
27; GFX10:       ; %bb.0: ; %bb
28; GFX10-NEXT:    s_add_u32 s2, s2, s5
29; GFX10-NEXT:    s_addc_u32 s3, s3, 0
30; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
31; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
32; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
33; GFX10-NEXT:    v_mov_b32_e32 v0, 15
34; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX10-NEXT:    s_and_b32 s1, s0, 15
36; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
37; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
38; GFX10-NEXT:    s_add_i32 s0, s0, 4
39; GFX10-NEXT:    s_add_i32 s1, s1, 4
40; GFX10-NEXT:    scratch_store_dword off, v0, s0
41; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
42; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
43; GFX10-NEXT:    s_waitcnt vmcnt(0)
44; GFX10-NEXT:    s_endpgm
45;
46; GFX940-LABEL: store_load_sindex_kernel:
47; GFX940:       ; %bb.0: ; %bb
48; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
49; GFX940-NEXT:    v_mov_b32_e32 v0, 15
50; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
52; GFX940-NEXT:    s_and_b32 s0, s0, 15
53; GFX940-NEXT:    v_mov_b32_e32 v1, s1
54; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
55; GFX940-NEXT:    scratch_store_dword v1, v0, off offset:4 sc0 sc1
56; GFX940-NEXT:    s_waitcnt vmcnt(0)
57; GFX940-NEXT:    v_mov_b32_e32 v0, s0
58; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:4 sc0 sc1
59; GFX940-NEXT:    s_waitcnt vmcnt(0)
60; GFX940-NEXT:    s_endpgm
61;
62; GFX11-LABEL: store_load_sindex_kernel:
63; GFX11:       ; %bb.0: ; %bb
64; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
65; GFX11-NEXT:    v_mov_b32_e32 v1, 15
66; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX11-NEXT:    s_lshl_b32 s1, s0, 2
68; GFX11-NEXT:    s_and_b32 s0, s0, 15
69; GFX11-NEXT:    v_mov_b32_e32 v0, s1
70; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
71; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
72; GFX11-NEXT:    v_mov_b32_e32 v2, s0
73; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
74; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
75; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:4 glc dlc
76; GFX11-NEXT:    s_waitcnt vmcnt(0)
77; GFX11-NEXT:    s_endpgm
78bb:
79  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced by any later instruction in this function.
80  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
81  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
82  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
83  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
84  %i9 = and i32 %idx, 15
85  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
86  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
87  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
88  ret void
89}
90
; Kernel with no arguments whose element index is the result of
; llvm.amdgcn.workitem.id.x(). It does a volatile store of 15 into element
; `id` of a [32 x float] addrspace(5) alloca, then a volatile load from
; element (31 - id) of the same alloca. The autogenerated assertions inside
; the function pin the expected instruction sequence per RUN configuration.
91define amdgpu_kernel void @store_load_vindex_kernel() {
92; GFX9-LABEL: store_load_vindex_kernel:
93; GFX9:       ; %bb.0: ; %bb
94; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
95; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
96; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
97; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
98; GFX9-NEXT:    v_add_u32_e32 v1, 4, v1
99; GFX9-NEXT:    v_mov_b32_e32 v2, 15
100; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
101; GFX9-NEXT:    scratch_store_dword v1, v2, off
102; GFX9-NEXT:    s_waitcnt vmcnt(0)
103; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
104; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
105; GFX9-NEXT:    s_waitcnt vmcnt(0)
106; GFX9-NEXT:    s_endpgm
107;
108; GFX10-LABEL: store_load_vindex_kernel:
109; GFX10:       ; %bb.0: ; %bb
110; GFX10-NEXT:    s_add_u32 s0, s0, s3
111; GFX10-NEXT:    s_addc_u32 s1, s1, 0
112; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
113; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
114; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
115; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
116; GFX10-NEXT:    v_mov_b32_e32 v2, 15
117; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
118; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
119; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v1
120; GFX10-NEXT:    scratch_store_dword v0, v2, off
121; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
123; GFX10-NEXT:    s_waitcnt vmcnt(0)
124; GFX10-NEXT:    s_endpgm
125;
126; GFX940-LABEL: store_load_vindex_kernel:
127; GFX940:       ; %bb.0: ; %bb
128; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
129; GFX940-NEXT:    v_mov_b32_e32 v2, 15
130; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
131; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:4 sc0 sc1
132; GFX940-NEXT:    s_waitcnt vmcnt(0)
133; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
134; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:128 sc0 sc1
135; GFX940-NEXT:    s_waitcnt vmcnt(0)
136; GFX940-NEXT:    s_endpgm
137;
138; GFX11-LABEL: store_load_vindex_kernel:
139; GFX11:       ; %bb.0: ; %bb
140; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
141; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
142; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
143; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
144; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
145; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
146; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:128 glc dlc
147; GFX11-NEXT:    s_waitcnt vmcnt(0)
148; GFX11-NEXT:    s_endpgm
149bb:
150  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced by any later instruction in this function.
151  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
152  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 is not referenced by any later instruction in this function.
153  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to element %i2 (the workitem id) of %i.
154  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
155  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
156  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (31 - %i2) of %i; the loaded value is unused.
157  %i9 = sub nsw i32 31, %i2
158  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
159  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
160  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
161  ret void
162}
163
; Non-kernel function (plain `define void`) taking the element index as the
; i32 parameter %idx. It does a volatile store of 15 into element %idx of a
; [32 x float] addrspace(5) alloca, then a volatile load from element
; (%idx & 15). In the autogenerated assertions the accesses are addressed
; relative to s32 and the function returns with s_setpc_b64.
164define void @store_load_vindex_foo(i32 %idx) {
165; GFX9-LABEL: store_load_vindex_foo:
166; GFX9:       ; %bb.0: ; %bb
167; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
169; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
170; GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
171; GFX9-NEXT:    v_mov_b32_e32 v2, 15
172; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
173; GFX9-NEXT:    scratch_store_dword v1, v2, off
174; GFX9-NEXT:    s_waitcnt vmcnt(0)
175; GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
176; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
177; GFX9-NEXT:    s_waitcnt vmcnt(0)
178; GFX9-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX10-LABEL: store_load_vindex_foo:
181; GFX10:       ; %bb.0: ; %bb
182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
185; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
186; GFX10-NEXT:    v_mov_b32_e32 v2, 15
187; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
188; GFX10-NEXT:    v_add_nc_u32_e32 v0, s32, v0
189; GFX10-NEXT:    v_add_nc_u32_e32 v1, s32, v1
190; GFX10-NEXT:    scratch_store_dword v0, v2, off
191; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
192; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
193; GFX10-NEXT:    s_waitcnt vmcnt(0)
194; GFX10-NEXT:    s_setpc_b64 s[30:31]
195;
196; GFX940-LABEL: store_load_vindex_foo:
197; GFX940:       ; %bb.0: ; %bb
198; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
200; GFX940-NEXT:    v_mov_b32_e32 v2, 15
201; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
202; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
203; GFX940-NEXT:    s_waitcnt vmcnt(0)
204; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
206; GFX940-NEXT:    s_waitcnt vmcnt(0)
207; GFX940-NEXT:    s_setpc_b64 s[30:31]
208;
209; GFX11-LABEL: store_load_vindex_foo:
210; GFX11:       ; %bb.0: ; %bb
211; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
213; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
214; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
215; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
216; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
217; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
218; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
219; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
220; GFX11-NEXT:    s_waitcnt vmcnt(0)
221; GFX11-NEXT:    s_setpc_b64 s[30:31]
222bb:
223  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced by any later instruction in this function.
224  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
225  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
226  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
227  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
228  %i9 = and i32 %idx, 15
229  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
230  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
231  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
232  ret void
233}
234
; Non-kernel function that stores the float constant 10.0 through an
; addrspace(5) pointer argument, at element 1 past %arg. The store is not
; volatile. In the autogenerated assertions the constant appears as the bit
; pattern 0x41200000 and the one-element displacement as `offset:4` on the
; scratch store.
235define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
236; GFX9-LABEL: private_ptr_foo:
237; GFX9:       ; %bb.0:
238; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
240; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
241; GFX9-NEXT:    s_waitcnt vmcnt(0)
242; GFX9-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX10-LABEL: private_ptr_foo:
245; GFX10:       ; %bb.0:
246; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
248; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
249; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
250; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
251; GFX10-NEXT:    s_setpc_b64 s[30:31]
252;
253; GFX940-LABEL: private_ptr_foo:
254; GFX940:       ; %bb.0:
255; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
257; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
258; GFX940-NEXT:    s_waitcnt vmcnt(0)
259; GFX940-NEXT:    s_setpc_b64 s[30:31]
260;
261; GFX11-LABEL: private_ptr_foo:
262; GFX11:       ; %bb.0:
263; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
265; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
266; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
267; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
268; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Address of the float one element past %arg, then a plain store of 10.0.
269  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
270  store float 1.000000e+01, float addrspace(5)* %gep, align 4
271  ret void
272}
273
; Kernel whose element index comes from the i32 kernel argument %idx, with a
; [64 x i32] (256-byte) addrspace(5) alloca declared ahead of the
; [32 x float] array under test. It volatile-loads from the padding array at
; an undef index, then does a volatile store of 15 to element %idx and a
; volatile load from element (%idx & 15) of the float array. In the
; autogenerated assertions the float-array accesses carry a constant of
; 0x104 (260), either added to the address register or as the instruction
; offset.
274define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
275; GFX9-LABEL: store_load_sindex_small_offset_kernel:
276; GFX9:       ; %bb.0: ; %bb
277; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
278; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
279; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
280; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
281; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
282; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
283; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
284; GFX9-NEXT:    s_and_b32 s0, s0, 15
285; GFX9-NEXT:    v_mov_b32_e32 v0, 15
286; GFX9-NEXT:    s_addk_i32 s1, 0x104
287; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
288; GFX9-NEXT:    scratch_store_dword off, v0, s1
289; GFX9-NEXT:    s_waitcnt vmcnt(0)
290; GFX9-NEXT:    s_addk_i32 s0, 0x104
291; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
292; GFX9-NEXT:    s_waitcnt vmcnt(0)
293; GFX9-NEXT:    s_endpgm
294;
295; GFX10-LABEL: store_load_sindex_small_offset_kernel:
296; GFX10:       ; %bb.0: ; %bb
297; GFX10-NEXT:    s_add_u32 s2, s2, s5
298; GFX10-NEXT:    s_addc_u32 s3, s3, 0
299; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
300; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
301; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
302; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
303; GFX10-NEXT:    s_waitcnt vmcnt(0)
304; GFX10-NEXT:    v_mov_b32_e32 v0, 15
305; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX10-NEXT:    s_and_b32 s1, s0, 15
307; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
308; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
309; GFX10-NEXT:    s_addk_i32 s0, 0x104
310; GFX10-NEXT:    s_addk_i32 s1, 0x104
311; GFX10-NEXT:    scratch_store_dword off, v0, s0
312; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
313; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
314; GFX10-NEXT:    s_waitcnt vmcnt(0)
315; GFX10-NEXT:    s_endpgm
316;
317; GFX940-LABEL: store_load_sindex_small_offset_kernel:
318; GFX940:       ; %bb.0: ; %bb
319; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
320; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
321; GFX940-NEXT:    s_waitcnt vmcnt(0)
322; GFX940-NEXT:    v_mov_b32_e32 v0, 15
323; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
325; GFX940-NEXT:    s_and_b32 s0, s0, 15
326; GFX940-NEXT:    v_mov_b32_e32 v1, s1
327; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
328; GFX940-NEXT:    scratch_store_dword v1, v0, off offset:260 sc0 sc1
329; GFX940-NEXT:    s_waitcnt vmcnt(0)
330; GFX940-NEXT:    v_mov_b32_e32 v0, s0
331; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:260 sc0 sc1
332; GFX940-NEXT:    s_waitcnt vmcnt(0)
333; GFX940-NEXT:    s_endpgm
334;
335; GFX11-LABEL: store_load_sindex_small_offset_kernel:
336; GFX11:       ; %bb.0: ; %bb
337; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
338; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
339; GFX11-NEXT:    s_waitcnt vmcnt(0)
340; GFX11-NEXT:    v_mov_b32_e32 v1, 15
341; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX11-NEXT:    s_lshl_b32 s1, s0, 2
343; GFX11-NEXT:    s_and_b32 s0, s0, 15
344; GFX11-NEXT:    v_mov_b32_e32 v0, s1
345; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
346; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
347; GFX11-NEXT:    v_mov_b32_e32 v2, s0
348; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
349; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
350; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:260 glc dlc
351; GFX11-NEXT:    s_waitcnt vmcnt(0)
352; GFX11-NEXT:    s_endpgm
353bb:
  ; 64 x 4 = 256 bytes of padding declared before the array under test.
354  %padding = alloca [64 x i32], align 4, addrspace(5)
355  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; result unused.
356  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
357  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced by any later instruction in this function.
358  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
359  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
360  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
361  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
362  %i9 = and i32 %idx, 15
363  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
364  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
365  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
366  ret void
367}
368
; Kernel with no arguments whose element index is the result of
; llvm.amdgcn.workitem.id.x(), with a [64 x i32] (256-byte) addrspace(5)
; alloca declared ahead of the [32 x float] array under test. It
; volatile-loads from the padding array at an undef index, then does a
; volatile store of 15 to element `id` and a volatile load from element
; (31 - id) of the float array. The autogenerated assertions pin the expected
; instruction sequence per RUN configuration.
369define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
370; GFX9-LABEL: store_load_vindex_small_offset_kernel:
371; GFX9:       ; %bb.0: ; %bb
372; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
373; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
374; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
375; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
376; GFX9-NEXT:    s_waitcnt vmcnt(0)
377; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
378; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
379; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
380; GFX9-NEXT:    v_mov_b32_e32 v2, 15
381; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
382; GFX9-NEXT:    scratch_store_dword v1, v2, off
383; GFX9-NEXT:    s_waitcnt vmcnt(0)
384; GFX9-NEXT:    v_add_u32_e32 v0, 0x104, v0
385; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
386; GFX9-NEXT:    s_waitcnt vmcnt(0)
387; GFX9-NEXT:    s_endpgm
388;
389; GFX10-LABEL: store_load_vindex_small_offset_kernel:
390; GFX10:       ; %bb.0: ; %bb
391; GFX10-NEXT:    s_add_u32 s0, s0, s3
392; GFX10-NEXT:    s_addc_u32 s1, s1, 0
393; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
394; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
395; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
396; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
397; GFX10-NEXT:    v_mov_b32_e32 v2, 15
398; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
399; GFX10-NEXT:    s_waitcnt vmcnt(0)
400; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
401; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
402; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
403; GFX10-NEXT:    scratch_store_dword v0, v2, off
404; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
405; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
406; GFX10-NEXT:    s_waitcnt vmcnt(0)
407; GFX10-NEXT:    s_endpgm
408;
409; GFX940-LABEL: store_load_vindex_small_offset_kernel:
410; GFX940:       ; %bb.0: ; %bb
411; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
412; GFX940-NEXT:    s_waitcnt vmcnt(0)
413; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
414; GFX940-NEXT:    v_mov_b32_e32 v2, 15
415; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
416; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:260 sc0 sc1
417; GFX940-NEXT:    s_waitcnt vmcnt(0)
418; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
419; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:384 sc0 sc1
420; GFX940-NEXT:    s_waitcnt vmcnt(0)
421; GFX940-NEXT:    s_endpgm
422;
423; GFX11-LABEL: store_load_vindex_small_offset_kernel:
424; GFX11:       ; %bb.0: ; %bb
425; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
426; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
427; GFX11-NEXT:    v_mov_b32_e32 v2, 15
428; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
429; GFX11-NEXT:    s_waitcnt vmcnt(0)
430; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
431; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
432; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
433; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:384 glc dlc
434; GFX11-NEXT:    s_waitcnt vmcnt(0)
435; GFX11-NEXT:    s_endpgm
436bb:
  ; 64 x 4 = 256 bytes of padding declared before the array under test.
437  %padding = alloca [64 x i32], align 4, addrspace(5)
438  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; result unused.
439  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
440  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced by any later instruction in this function.
441  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
442  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 is not referenced by any later instruction in this function.
443  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to element %i2 (the workitem id) of %i.
444  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
445  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
446  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (31 - %i2) of %i; the loaded value is unused.
447  %i9 = sub nsw i32 31, %i2
448  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
449  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
450  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
451  ret void
452}
453
; Non-kernel function (plain `define void`) taking the element index as the
; i32 parameter %idx, with a [64 x i32] (256-byte) addrspace(5) alloca
; declared ahead of the [32 x float] array under test. It volatile-loads from
; the padding array at an undef index, then does a volatile store of 15 to
; element %idx and a volatile load from element (%idx & 15) of the float
; array. In the autogenerated assertions the float-array accesses use s32
; plus a constant 0x100 (256), either materialised with s_add_i32 or encoded
; as the instruction offset.
454define void @store_load_vindex_small_offset_foo(i32 %idx) {
455; GFX9-LABEL: store_load_vindex_small_offset_foo:
456; GFX9:       ; %bb.0: ; %bb
457; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
459; GFX9-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
461; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
462; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
463; GFX9-NEXT:    v_add_u32_e32 v1, vcc_hi, v1
464; GFX9-NEXT:    v_mov_b32_e32 v2, 15
465; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
466; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
467; GFX9-NEXT:    scratch_store_dword v1, v2, off
468; GFX9-NEXT:    s_waitcnt vmcnt(0)
469; GFX9-NEXT:    v_add_u32_e32 v0, vcc_hi, v0
470; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
471; GFX9-NEXT:    s_waitcnt vmcnt(0)
472; GFX9-NEXT:    s_setpc_b64 s[30:31]
473;
474; GFX10-LABEL: store_load_vindex_small_offset_foo:
475; GFX10:       ; %bb.0: ; %bb
476; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
478; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
479; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
480; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
481; GFX10-NEXT:    v_mov_b32_e32 v2, 15
482; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
483; GFX10-NEXT:    s_waitcnt vmcnt(0)
484; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
485; GFX10-NEXT:    v_add_nc_u32_e32 v0, vcc_lo, v0
486; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
487; GFX10-NEXT:    v_add_nc_u32_e32 v1, vcc_lo, v1
488; GFX10-NEXT:    scratch_store_dword v0, v2, off
489; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
490; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
491; GFX10-NEXT:    s_waitcnt vmcnt(0)
492; GFX10-NEXT:    s_setpc_b64 s[30:31]
493;
494; GFX940-LABEL: store_load_vindex_small_offset_foo:
495; GFX940:       ; %bb.0: ; %bb
496; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
498; GFX940-NEXT:    s_waitcnt vmcnt(0)
499; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
500; GFX940-NEXT:    v_mov_b32_e32 v2, 15
501; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
502; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
503; GFX940-NEXT:    s_waitcnt vmcnt(0)
504; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
505; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
506; GFX940-NEXT:    s_waitcnt vmcnt(0)
507; GFX940-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX11-LABEL: store_load_vindex_small_offset_foo:
510; GFX11:       ; %bb.0: ; %bb
511; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
513; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
514; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
515; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
516; GFX11-NEXT:    s_waitcnt vmcnt(0)
517; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
518; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
519; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
520; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
521; GFX11-NEXT:    s_waitcnt vmcnt(0)
522; GFX11-NEXT:    s_setpc_b64 s[30:31]
523bb:
  ; 64 x 4 = 256 bytes of padding declared before the array under test.
524  %padding = alloca [64 x i32], align 4, addrspace(5)
525  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; result unused.
526  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
527  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced by any later instruction in this function.
528  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
529  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
530  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
531  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
532  %i9 = and i32 %idx, 15
533  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
534  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
535  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
536  ret void
537}
538
; Kernel whose element index comes from the i32 kernel argument %idx, with a
; [4096 x i32] (16384-byte) addrspace(5) alloca declared ahead of the
; [32 x float] array under test. It volatile-loads from the padding array at
; an undef index, then does a volatile store of 15 to element %idx and a
; volatile load from element (%idx & 15) of the float array. In the
; autogenerated assertions the float-array accesses use the constant 0x4004,
; added with s_addk_i32 or materialised with s_movk_i32 into a scalar
; register rather than encoded as an instruction offset.
539define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
540; GFX9-LABEL: store_load_sindex_large_offset_kernel:
541; GFX9:       ; %bb.0: ; %bb
542; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
543; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
544; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
545; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
546; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
547; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
548; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
549; GFX9-NEXT:    s_and_b32 s0, s0, 15
550; GFX9-NEXT:    v_mov_b32_e32 v0, 15
551; GFX9-NEXT:    s_addk_i32 s1, 0x4004
552; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
553; GFX9-NEXT:    scratch_store_dword off, v0, s1
554; GFX9-NEXT:    s_waitcnt vmcnt(0)
555; GFX9-NEXT:    s_addk_i32 s0, 0x4004
556; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
557; GFX9-NEXT:    s_waitcnt vmcnt(0)
558; GFX9-NEXT:    s_endpgm
559;
560; GFX10-LABEL: store_load_sindex_large_offset_kernel:
561; GFX10:       ; %bb.0: ; %bb
562; GFX10-NEXT:    s_add_u32 s2, s2, s5
563; GFX10-NEXT:    s_addc_u32 s3, s3, 0
564; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
565; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
566; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
567; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
568; GFX10-NEXT:    s_waitcnt vmcnt(0)
569; GFX10-NEXT:    v_mov_b32_e32 v0, 15
570; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX10-NEXT:    s_and_b32 s1, s0, 15
572; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
573; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
574; GFX10-NEXT:    s_addk_i32 s0, 0x4004
575; GFX10-NEXT:    s_addk_i32 s1, 0x4004
576; GFX10-NEXT:    scratch_store_dword off, v0, s0
577; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
578; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
579; GFX10-NEXT:    s_waitcnt vmcnt(0)
580; GFX10-NEXT:    s_endpgm
581;
582; GFX940-LABEL: store_load_sindex_large_offset_kernel:
583; GFX940:       ; %bb.0: ; %bb
584; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
585; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
586; GFX940-NEXT:    s_waitcnt vmcnt(0)
587; GFX940-NEXT:    v_mov_b32_e32 v0, 15
588; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
589; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
591; GFX940-NEXT:    s_and_b32 s0, s0, 15
592; GFX940-NEXT:    v_mov_b32_e32 v1, s1
593; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
594; GFX940-NEXT:    scratch_store_dword v1, v0, vcc_hi sc0 sc1
595; GFX940-NEXT:    s_waitcnt vmcnt(0)
596; GFX940-NEXT:    v_mov_b32_e32 v0, s0
597; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
598; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
599; GFX940-NEXT:    s_waitcnt vmcnt(0)
600; GFX940-NEXT:    s_endpgm
601;
602; GFX11-LABEL: store_load_sindex_large_offset_kernel:
603; GFX11:       ; %bb.0: ; %bb
604; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
605; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
606; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
607; GFX11-NEXT:    s_waitcnt vmcnt(0)
608; GFX11-NEXT:    v_mov_b32_e32 v1, 15
609; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX11-NEXT:    s_lshl_b32 s1, s0, 2
611; GFX11-NEXT:    s_and_b32 s0, s0, 15
612; GFX11-NEXT:    v_mov_b32_e32 v0, s1
613; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
614; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
615; GFX11-NEXT:    v_mov_b32_e32 v2, s0
616; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
617; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
618; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
619; GFX11-NEXT:    scratch_load_b32 v0, v2, vcc_lo glc dlc
620; GFX11-NEXT:    s_waitcnt vmcnt(0)
621; GFX11-NEXT:    s_endpgm
622bb:
  ; 4096 x 4 = 16384 bytes of padding declared before the array under test.
623  %padding = alloca [4096 x i32], align 4, addrspace(5)
624  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; result unused.
625  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
626  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced by any later instruction in this function.
627  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
628  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
629  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
630  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
631  %i9 = and i32 %idx, 15
632  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
633  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
634  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
635  ret void
636}
637
; Kernel with no arguments whose element index is the result of
; llvm.amdgcn.workitem.id.x(), with a [4096 x i32] (16384-byte) addrspace(5)
; alloca declared ahead of the [32 x float] array under test. It
; volatile-loads from the padding array at an undef index, then does a
; volatile store of 15 to element `id` and a volatile load from element
; (31 - id) of the float array. In the autogenerated assertions the
; float-array accesses use the constant 0x4004, added with a vector add or
; materialised with s_movk_i32 into a scalar register.
638define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
639; GFX9-LABEL: store_load_vindex_large_offset_kernel:
640; GFX9:       ; %bb.0: ; %bb
641; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
642; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
643; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
644; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
645; GFX9-NEXT:    s_waitcnt vmcnt(0)
646; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
647; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
648; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
649; GFX9-NEXT:    v_mov_b32_e32 v2, 15
650; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
651; GFX9-NEXT:    scratch_store_dword v1, v2, off
652; GFX9-NEXT:    s_waitcnt vmcnt(0)
653; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
654; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
655; GFX9-NEXT:    s_waitcnt vmcnt(0)
656; GFX9-NEXT:    s_endpgm
657;
658; GFX10-LABEL: store_load_vindex_large_offset_kernel:
659; GFX10:       ; %bb.0: ; %bb
660; GFX10-NEXT:    s_add_u32 s0, s0, s3
661; GFX10-NEXT:    s_addc_u32 s1, s1, 0
662; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
663; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
664; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
665; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
666; GFX10-NEXT:    v_mov_b32_e32 v2, 15
667; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
668; GFX10-NEXT:    s_waitcnt vmcnt(0)
669; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
670; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
671; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
672; GFX10-NEXT:    scratch_store_dword v0, v2, off
673; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
674; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
675; GFX10-NEXT:    s_waitcnt vmcnt(0)
676; GFX10-NEXT:    s_endpgm
677;
678; GFX940-LABEL: store_load_vindex_large_offset_kernel:
679; GFX940:       ; %bb.0: ; %bb
680; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
681; GFX940-NEXT:    s_waitcnt vmcnt(0)
682; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
683; GFX940-NEXT:    v_mov_b32_e32 v2, 15
684; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
685; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
686; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
687; GFX940-NEXT:    s_waitcnt vmcnt(0)
688; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
689; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
690; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1
691; GFX940-NEXT:    s_waitcnt vmcnt(0)
692; GFX940-NEXT:    s_endpgm
693;
694; GFX11-LABEL: store_load_vindex_large_offset_kernel:
695; GFX11:       ; %bb.0: ; %bb
696; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
697; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
698; GFX11-NEXT:    v_mov_b32_e32 v2, 15
699; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
700; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
701; GFX11-NEXT:    s_waitcnt vmcnt(0)
702; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
703; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
704; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
705; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
706; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo offset:124 glc dlc
707; GFX11-NEXT:    s_waitcnt vmcnt(0)
708; GFX11-NEXT:    s_endpgm
709bb:
  ; 4096 x 4 = 16384 bytes of padding declared before the array under test.
710  %padding = alloca [4096 x i32], align 4, addrspace(5)
711  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; result unused.
712  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
713  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced by any later instruction in this function.
714  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
715  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 is not referenced by any later instruction in this function.
716  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to element %i2 (the workitem id) of %i.
717  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
718  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
719  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (31 - %i2) of %i; the loaded value is unused.
720  %i9 = sub nsw i32 31, %i2
721  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
722  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
723  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
724  ret void
725}
726
; Callable-function variant of the large-offset test with an index taken from
; the %idx argument: a volatile store of 15 to %i[%idx] is followed by a
; volatile load from %i[%idx & 15]. %i is allocated after a [4096 x i32]
; (16384-byte) padding alloca, and the expected code for all four targets
; forms the address of %i as s32 + 0x4004.
727define void @store_load_vindex_large_offset_foo(i32 %idx) {
728; GFX9-LABEL: store_load_vindex_large_offset_foo:
729; GFX9:       ; %bb.0: ; %bb
730; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
731; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
732; GFX9-NEXT:    s_waitcnt vmcnt(0)
733; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
734; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
735; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
736; GFX9-NEXT:    v_add_u32_e32 v1, vcc_hi, v1
737; GFX9-NEXT:    v_mov_b32_e32 v2, 15
738; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
739; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
740; GFX9-NEXT:    scratch_store_dword v1, v2, off
741; GFX9-NEXT:    s_waitcnt vmcnt(0)
742; GFX9-NEXT:    v_add_u32_e32 v0, vcc_hi, v0
743; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
744; GFX9-NEXT:    s_waitcnt vmcnt(0)
745; GFX9-NEXT:    s_setpc_b64 s[30:31]
746;
747; GFX10-LABEL: store_load_vindex_large_offset_foo:
748; GFX10:       ; %bb.0: ; %bb
749; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
750; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
751; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
752; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
753; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
754; GFX10-NEXT:    v_mov_b32_e32 v2, 15
755; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
756; GFX10-NEXT:    s_waitcnt vmcnt(0)
757; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
758; GFX10-NEXT:    v_add_nc_u32_e32 v0, vcc_lo, v0
759; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
760; GFX10-NEXT:    v_add_nc_u32_e32 v1, vcc_lo, v1
761; GFX10-NEXT:    scratch_store_dword v0, v2, off
762; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
763; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
764; GFX10-NEXT:    s_waitcnt vmcnt(0)
765; GFX10-NEXT:    s_setpc_b64 s[30:31]
766;
767; GFX940-LABEL: store_load_vindex_large_offset_foo:
768; GFX940:       ; %bb.0: ; %bb
769; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
770; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
771; GFX940-NEXT:    s_waitcnt vmcnt(0)
772; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
773; GFX940-NEXT:    v_mov_b32_e32 v2, 15
774; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
775; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
776; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
777; GFX940-NEXT:    s_waitcnt vmcnt(0)
778; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
779; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
780; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
781; GFX940-NEXT:    s_waitcnt vmcnt(0)
782; GFX940-NEXT:    s_setpc_b64 s[30:31]
783;
784; GFX11-LABEL: store_load_vindex_large_offset_foo:
785; GFX11:       ; %bb.0: ; %bb
786; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
787; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
788; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
789; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
790; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
791; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
792; GFX11-NEXT:    s_waitcnt vmcnt(0)
793; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
794; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
795; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
796; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
797; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
798; GFX11-NEXT:    s_waitcnt vmcnt(0)
799; GFX11-NEXT:    s_setpc_b64 s[30:31]
800bb:
; 4096 x i32 = 16384 bytes of padding allocated ahead of %i.
801  %padding = alloca [4096 x i32], align 4, addrspace(5)
802  %i = alloca [32 x float], align 4, addrspace(5)
; Volatile access to %padding at an undef index; it shows up in the expected
; output as the scratch load at s32 offset:4.
803  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
804  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 is not referenced by any later instruction in this function.
805  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx].
806  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
807  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
808  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load back from %i[%idx & 15].
809  %i9 = and i32 %idx, 15
810  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
811  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
812  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
813  ret void
814}
815
; Kernel test for a large constant element index: stores 13 to %i[undef], then
; stores 15 to and loads from %i[4000]. Element 4000 of an i32 array is byte
; offset 16000 = 0x3e80, and each target's expected code materializes 0x3e80
; in a register (SGPR on GFX9/GFX10, VGPR on GFX940/GFX11).
816define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
817; GFX9-LABEL: store_load_large_imm_offset_kernel:
818; GFX9:       ; %bb.0: ; %bb
819; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
820; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
821; GFX9-NEXT:    v_mov_b32_e32 v0, 13
822; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
823; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
824; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
825; GFX9-NEXT:    s_waitcnt vmcnt(0)
826; GFX9-NEXT:    v_mov_b32_e32 v0, 15
827; GFX9-NEXT:    s_add_i32 s0, s0, 4
828; GFX9-NEXT:    scratch_store_dword off, v0, s0
829; GFX9-NEXT:    s_waitcnt vmcnt(0)
830; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
831; GFX9-NEXT:    s_waitcnt vmcnt(0)
832; GFX9-NEXT:    s_endpgm
833;
834; GFX10-LABEL: store_load_large_imm_offset_kernel:
835; GFX10:       ; %bb.0: ; %bb
836; GFX10-NEXT:    s_add_u32 s0, s0, s3
837; GFX10-NEXT:    s_addc_u32 s1, s1, 0
838; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
839; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
840; GFX10-NEXT:    v_mov_b32_e32 v0, 13
841; GFX10-NEXT:    v_mov_b32_e32 v1, 15
842; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
843; GFX10-NEXT:    s_add_i32 s0, s0, 4
844; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
845; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
846; GFX10-NEXT:    scratch_store_dword off, v1, s0
847; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
848; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
849; GFX10-NEXT:    s_waitcnt vmcnt(0)
850; GFX10-NEXT:    s_endpgm
851;
852; GFX940-LABEL: store_load_large_imm_offset_kernel:
853; GFX940:       ; %bb.0: ; %bb
854; GFX940-NEXT:    v_mov_b32_e32 v0, 13
855; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
856; GFX940-NEXT:    s_waitcnt vmcnt(0)
857; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
858; GFX940-NEXT:    v_mov_b32_e32 v1, 15
859; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
860; GFX940-NEXT:    s_waitcnt vmcnt(0)
861; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:4 sc0 sc1
862; GFX940-NEXT:    s_waitcnt vmcnt(0)
863; GFX940-NEXT:    s_endpgm
864;
865; GFX11-LABEL: store_load_large_imm_offset_kernel:
866; GFX11:       ; %bb.0: ; %bb
867; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
868; GFX11-NEXT:    v_mov_b32_e32 v2, 15
869; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
870; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
871; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:4 dlc
872; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
873; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
874; GFX11-NEXT:    s_waitcnt vmcnt(0)
875; GFX11-NEXT:    s_endpgm
876bb:
877  %i = alloca [4096 x i32], align 4, addrspace(5)
; Store 13 at an undef index; the expected output places it at offset:4.
878  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
879  store volatile i32 13, i32 addrspace(5)* %i1, align 4
; Store 15 to element 4000 (byte offset 0x3e80 within %i).
880  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
881  store volatile i32 15, i32 addrspace(5)* %i7, align 4
; Load element 4000 back through a second, identical GEP.
882  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
883  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
884  ret void
885}
886
; Callable-function counterpart of the large constant-index test: stores 13 to
; %i[undef], then stores 15 to and loads from %i[4000] (byte offset 0x3e80).
; Unlike the kernel version, the expected scratch accesses are relative to s32
; and the function returns with s_setpc_b64.
887define void @store_load_large_imm_offset_foo() {
888; GFX9-LABEL: store_load_large_imm_offset_foo:
889; GFX9:       ; %bb.0: ; %bb
890; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX9-NEXT:    v_mov_b32_e32 v0, 13
892; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
893; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
894; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
895; GFX9-NEXT:    s_waitcnt vmcnt(0)
896; GFX9-NEXT:    v_mov_b32_e32 v0, 15
897; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
898; GFX9-NEXT:    scratch_store_dword off, v0, s0
899; GFX9-NEXT:    s_waitcnt vmcnt(0)
900; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
901; GFX9-NEXT:    s_waitcnt vmcnt(0)
902; GFX9-NEXT:    s_setpc_b64 s[30:31]
903;
904; GFX10-LABEL: store_load_large_imm_offset_foo:
905; GFX10:       ; %bb.0: ; %bb
906; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
908; GFX10-NEXT:    v_mov_b32_e32 v0, 13
909; GFX10-NEXT:    v_mov_b32_e32 v1, 15
910; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
911; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
912; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
913; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
914; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
915; GFX10-NEXT:    scratch_store_dword off, v1, s0
916; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
917; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
918; GFX10-NEXT:    s_waitcnt vmcnt(0)
919; GFX10-NEXT:    s_setpc_b64 s[30:31]
920;
921; GFX940-LABEL: store_load_large_imm_offset_foo:
922; GFX940:       ; %bb.0: ; %bb
923; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924; GFX940-NEXT:    v_mov_b32_e32 v0, 13
925; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
926; GFX940-NEXT:    s_waitcnt vmcnt(0)
927; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
928; GFX940-NEXT:    v_mov_b32_e32 v1, 15
929; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
930; GFX940-NEXT:    s_waitcnt vmcnt(0)
931; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
932; GFX940-NEXT:    s_waitcnt vmcnt(0)
933; GFX940-NEXT:    s_setpc_b64 s[30:31]
934;
935; GFX11-LABEL: store_load_large_imm_offset_foo:
936; GFX11:       ; %bb.0: ; %bb
937; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
939; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
940; GFX11-NEXT:    v_mov_b32_e32 v2, 15
941; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
942; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
943; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:4 dlc
944; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
945; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
946; GFX11-NEXT:    s_waitcnt vmcnt(0)
947; GFX11-NEXT:    s_setpc_b64 s[30:31]
948bb:
949  %i = alloca [4096 x i32], align 4, addrspace(5)
; Store 13 at an undef index; the expected output places it at s32 offset:4.
950  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
951  store volatile i32 13, i32 addrspace(5)* %i1, align 4
; Store 15 to element 4000 (byte offset 0x3e80 within %i).
952  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
953  store volatile i32 15, i32 addrspace(5)* %i7, align 4
; Load element 4000 back through a second, identical GEP.
954  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
955  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
956  ret void
957}
958
; Kernel test combining three index components: the %sidx kernel argument, the
; workitem id, and the constant 256 (256 x 4 = 1024 bytes). A volatile store of
; 15 and a volatile load use the same address. In the expected output GFX9 and
; GFX10 add 4 with a separate VALU instruction and encode offset:1024, while
; GFX940 and GFX11 encode a single offset:1028.
959define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
960; GFX9-LABEL: store_load_vidx_sidx_offset:
961; GFX9:       ; %bb.0: ; %bb
962; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
963; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
964; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
965; GFX9-NEXT:    v_mov_b32_e32 v1, 15
966; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
968; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
969; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
970; GFX9-NEXT:    s_waitcnt vmcnt(0)
971; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
972; GFX9-NEXT:    s_waitcnt vmcnt(0)
973; GFX9-NEXT:    s_endpgm
974;
975; GFX10-LABEL: store_load_vidx_sidx_offset:
976; GFX10:       ; %bb.0: ; %bb
977; GFX10-NEXT:    s_add_u32 s2, s2, s5
978; GFX10-NEXT:    s_addc_u32 s3, s3, 0
979; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
980; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
981; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
982; GFX10-NEXT:    v_mov_b32_e32 v1, 15
983; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
985; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
986; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
987; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
988; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
989; GFX10-NEXT:    s_waitcnt vmcnt(0)
990; GFX10-NEXT:    s_endpgm
991;
992; GFX940-LABEL: store_load_vidx_sidx_offset:
993; GFX940:       ; %bb.0: ; %bb
994; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
995; GFX940-NEXT:    v_mov_b32_e32 v1, 15
996; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
997; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
998; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
999; GFX940-NEXT:    s_waitcnt vmcnt(0)
1000; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
1001; GFX940-NEXT:    s_waitcnt vmcnt(0)
1002; GFX940-NEXT:    s_endpgm
1003;
1004; GFX11-LABEL: store_load_vidx_sidx_offset:
1005; GFX11:       ; %bb.0: ; %bb
1006; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
1007; GFX11-NEXT:    v_mov_b32_e32 v1, 15
1008; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
1010; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
1011; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1012; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
1013; GFX11-NEXT:    s_waitcnt vmcnt(0)
1014; GFX11-NEXT:    s_endpgm
1015bb:
1016  %alloca = alloca [32 x i32], align 4, addrspace(5)
1017  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
; index = %sidx + %vidx + 256
1018  %add1 = add nsw i32 %sidx, %vidx
1019  %add2 = add nsw i32 %add1, 256
; NOTE(review): the constant 256 alone exceeds the 32 elements of %alloca;
; presumably intentional to exercise immediate-offset folding -- confirm.
1020  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
; Store and load go through the same pointer.
1021  store volatile i32 15, i32 addrspace(5)* %gep, align 4
1022  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
1023  ret void
1024}
1025
; Volatile i64 store of 15 followed by a volatile i64 load through the
; addrspace(5) pointer %arg, both with align 8. Every target is expected to
; use one 64-bit scratch store and one 64-bit scratch load.
1026define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
1027; GFX9-LABEL: store_load_i64_aligned:
1028; GFX9:       ; %bb.0: ; %bb
1029; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1030; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1031; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1032; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1033; GFX9-NEXT:    s_waitcnt vmcnt(0)
1034; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
1035; GFX9-NEXT:    s_waitcnt vmcnt(0)
1036; GFX9-NEXT:    s_setpc_b64 s[30:31]
1037;
1038; GFX10-LABEL: store_load_i64_aligned:
1039; GFX10:       ; %bb.0: ; %bb
1040; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1041; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1042; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1043; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1044; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1045; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1046; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
1047; GFX10-NEXT:    s_waitcnt vmcnt(0)
1048; GFX10-NEXT:    s_setpc_b64 s[30:31]
1049;
1050; GFX940-LABEL: store_load_i64_aligned:
1051; GFX940:       ; %bb.0: ; %bb
1052; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
1054; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
1055; GFX940-NEXT:    s_waitcnt vmcnt(0)
1056; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
1057; GFX940-NEXT:    s_waitcnt vmcnt(0)
1058; GFX940-NEXT:    s_setpc_b64 s[30:31]
1059;
1060; GFX11-LABEL: store_load_i64_aligned:
1061; GFX11:       ; %bb.0: ; %bb
1062; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1063; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1064; GFX11-NEXT:    v_mov_b32_e32 v1, 15
1065; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1066; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
1067; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1068; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
1069; GFX11-NEXT:    s_waitcnt vmcnt(0)
1070; GFX11-NEXT:    s_setpc_b64 s[30:31]
1071bb:
; Naturally aligned (align 8) 64-bit store and load.
1072  store volatile i64 15, i64 addrspace(5)* %arg, align 8
1073  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
1074  ret void
1075}
1076
; Same as the aligned i64 test but with align 1 on both the volatile store and
; the volatile load. The expected output still uses one 64-bit scratch store
; and one 64-bit scratch load on every target; the access is not split.
1077define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
1078; GFX9-LABEL: store_load_i64_unaligned:
1079; GFX9:       ; %bb.0: ; %bb
1080; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1081; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1082; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1083; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1084; GFX9-NEXT:    s_waitcnt vmcnt(0)
1085; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
1086; GFX9-NEXT:    s_waitcnt vmcnt(0)
1087; GFX9-NEXT:    s_setpc_b64 s[30:31]
1088;
1089; GFX10-LABEL: store_load_i64_unaligned:
1090; GFX10:       ; %bb.0: ; %bb
1091; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1092; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1093; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1094; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1095; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1096; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1097; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
1098; GFX10-NEXT:    s_waitcnt vmcnt(0)
1099; GFX10-NEXT:    s_setpc_b64 s[30:31]
1100;
1101; GFX940-LABEL: store_load_i64_unaligned:
1102; GFX940:       ; %bb.0: ; %bb
1103; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1104; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
1105; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
1106; GFX940-NEXT:    s_waitcnt vmcnt(0)
1107; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
1108; GFX940-NEXT:    s_waitcnt vmcnt(0)
1109; GFX940-NEXT:    s_setpc_b64 s[30:31]
1110;
1111; GFX11-LABEL: store_load_i64_unaligned:
1112; GFX11:       ; %bb.0: ; %bb
1113; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1114; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1115; GFX11-NEXT:    v_mov_b32_e32 v1, 15
1116; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1117; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
1118; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1119; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
1120; GFX11-NEXT:    s_waitcnt vmcnt(0)
1121; GFX11-NEXT:    s_setpc_b64 s[30:31]
1122bb:
; Byte-aligned (align 1) 64-bit store and load.
1123  store volatile i64 15, i64 addrspace(5)* %arg, align 1
1124  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
1125  ret void
1126}
1127
; Volatile store of <1, 2, 3> and volatile load of a <3 x i32> through the
; addrspace(5) pointer %arg, both with align 1. Every target is expected to
; use a single 96-bit scratch store and a single 96-bit scratch load.
1128define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
1129; GFX9-LABEL: store_load_v3i32_unaligned:
1130; GFX9:       ; %bb.0: ; %bb
1131; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1132; GFX9-NEXT:    s_mov_b32 s2, 3
1133; GFX9-NEXT:    s_mov_b32 s1, 2
1134; GFX9-NEXT:    s_mov_b32 s0, 1
1135; GFX9-NEXT:    v_mov_b32_e32 v3, s2
1136; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1137; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1138; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
1139; GFX9-NEXT:    s_waitcnt vmcnt(0)
1140; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
1141; GFX9-NEXT:    s_waitcnt vmcnt(0)
1142; GFX9-NEXT:    s_setpc_b64 s[30:31]
1143;
1144; GFX10-LABEL: store_load_v3i32_unaligned:
1145; GFX10:       ; %bb.0: ; %bb
1146; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1147; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1148; GFX10-NEXT:    s_mov_b32 s2, 3
1149; GFX10-NEXT:    s_mov_b32 s1, 2
1150; GFX10-NEXT:    s_mov_b32 s0, 1
1151; GFX10-NEXT:    v_mov_b32_e32 v3, s2
1152; GFX10-NEXT:    v_mov_b32_e32 v2, s1
1153; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1154; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
1155; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1156; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
1157; GFX10-NEXT:    s_waitcnt vmcnt(0)
1158; GFX10-NEXT:    s_setpc_b64 s[30:31]
1159;
1160; GFX940-LABEL: store_load_v3i32_unaligned:
1161; GFX940:       ; %bb.0: ; %bb
1162; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163; GFX940-NEXT:    s_mov_b32 s2, 3
1164; GFX940-NEXT:    s_mov_b32 s1, 2
1165; GFX940-NEXT:    s_mov_b32 s0, 1
1166; GFX940-NEXT:    v_mov_b32_e32 v4, s2
1167; GFX940-NEXT:    v_mov_b32_e32 v3, s1
1168; GFX940-NEXT:    v_mov_b32_e32 v2, s0
1169; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
1170; GFX940-NEXT:    s_waitcnt vmcnt(0)
1171; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
1172; GFX940-NEXT:    s_waitcnt vmcnt(0)
1173; GFX940-NEXT:    s_setpc_b64 s[30:31]
1174;
1175; GFX11-LABEL: store_load_v3i32_unaligned:
1176; GFX11:       ; %bb.0: ; %bb
1177; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1179; GFX11-NEXT:    s_mov_b32 s2, 3
1180; GFX11-NEXT:    s_mov_b32 s1, 2
1181; GFX11-NEXT:    s_mov_b32 s0, 1
1182; GFX11-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
1183; GFX11-NEXT:    v_mov_b32_e32 v1, s0
1184; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
1185; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1186; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
1187; GFX11-NEXT:    s_waitcnt vmcnt(0)
1188; GFX11-NEXT:    s_setpc_b64 s[30:31]
1189bb:
; Byte-aligned (align 1) 96-bit vector store and load.
1190  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
1191  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
1192  ret void
1193}
1194
; Volatile store of <1, 2, 3, 4> and volatile load of a <4 x i32> through the
; addrspace(5) pointer %arg, both with align 1. Every target is expected to
; use a single 128-bit scratch store and a single 128-bit scratch load.
1195define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
1196; GFX9-LABEL: store_load_v4i32_unaligned:
1197; GFX9:       ; %bb.0: ; %bb
1198; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1199; GFX9-NEXT:    s_mov_b32 s3, 4
1200; GFX9-NEXT:    s_mov_b32 s2, 3
1201; GFX9-NEXT:    s_mov_b32 s1, 2
1202; GFX9-NEXT:    s_mov_b32 s0, 1
1203; GFX9-NEXT:    v_mov_b32_e32 v4, s3
1204; GFX9-NEXT:    v_mov_b32_e32 v3, s2
1205; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1206; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1207; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
1208; GFX9-NEXT:    s_waitcnt vmcnt(0)
1209; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
1210; GFX9-NEXT:    s_waitcnt vmcnt(0)
1211; GFX9-NEXT:    s_setpc_b64 s[30:31]
1212;
1213; GFX10-LABEL: store_load_v4i32_unaligned:
1214; GFX10:       ; %bb.0: ; %bb
1215; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1216; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1217; GFX10-NEXT:    s_mov_b32 s3, 4
1218; GFX10-NEXT:    s_mov_b32 s2, 3
1219; GFX10-NEXT:    s_mov_b32 s1, 2
1220; GFX10-NEXT:    s_mov_b32 s0, 1
1221; GFX10-NEXT:    v_mov_b32_e32 v4, s3
1222; GFX10-NEXT:    v_mov_b32_e32 v3, s2
1223; GFX10-NEXT:    v_mov_b32_e32 v2, s1
1224; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1225; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
1226; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1227; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
1228; GFX10-NEXT:    s_waitcnt vmcnt(0)
1229; GFX10-NEXT:    s_setpc_b64 s[30:31]
1230;
1231; GFX940-LABEL: store_load_v4i32_unaligned:
1232; GFX940:       ; %bb.0: ; %bb
1233; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1234; GFX940-NEXT:    s_mov_b32 s3, 4
1235; GFX940-NEXT:    s_mov_b32 s2, 3
1236; GFX940-NEXT:    s_mov_b32 s1, 2
1237; GFX940-NEXT:    s_mov_b32 s0, 1
1238; GFX940-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
1239; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
1240; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
1241; GFX940-NEXT:    s_waitcnt vmcnt(0)
1242; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
1243; GFX940-NEXT:    s_waitcnt vmcnt(0)
1244; GFX940-NEXT:    s_setpc_b64 s[30:31]
1245;
1246; GFX11-LABEL: store_load_v4i32_unaligned:
1247; GFX11:       ; %bb.0: ; %bb
1248; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1249; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1250; GFX11-NEXT:    s_mov_b32 s3, 4
1251; GFX11-NEXT:    s_mov_b32 s2, 3
1252; GFX11-NEXT:    s_mov_b32 s1, 2
1253; GFX11-NEXT:    s_mov_b32 s0, 1
1254; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
1255; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
1256; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
1257; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1258; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
1259; GFX11-NEXT:    s_waitcnt vmcnt(0)
1260; GFX11-NEXT:    s_setpc_b64 s[30:31]
1261bb:
; Byte-aligned (align 1) 128-bit vector store and load.
1262  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
1263  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
1264  ret void
1265}
1266
; Declaration of the AMDGPU workitem-id intrinsic that the vindex/vidx tests
; above call to obtain their index value.
1267declare i32 @llvm.amdgcn.workitem.id.x()
1268