1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
6; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
7; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
8; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
9; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
10
; Kernel entry point that zero-fills a 64-byte [32 x i16] alloca in
; addrspace(5) with a single memset call (last argument i1 false, i.e. not
; volatile). The RUN lines pass -mattr=-promote-alloca, and every run
; configuration below is expected to emit four 16-byte scratch stores at
; offsets 64, 48, 32 and 16, followed by s_endpgm.
11define amdgpu_kernel void @zero_init_kernel() {
12; GFX9-LABEL: zero_init_kernel:
13; GFX9:       ; %bb.0:
14; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
15; GFX9-NEXT:    s_mov_b32 s0, 0
16; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
17; GFX9-NEXT:    s_mov_b32 s1, s0
18; GFX9-NEXT:    s_mov_b32 s2, s0
19; GFX9-NEXT:    s_mov_b32 s3, s0
20; GFX9-NEXT:    v_mov_b32_e32 v0, s0
21; GFX9-NEXT:    v_mov_b32_e32 v1, s1
22; GFX9-NEXT:    v_mov_b32_e32 v2, s2
23; GFX9-NEXT:    v_mov_b32_e32 v3, s3
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
28; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
29; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
30; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
31; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
32; GFX9-NEXT:    s_endpgm
33;
34; GFX10-LABEL: zero_init_kernel:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    s_add_u32 s0, s0, s3
37; GFX10-NEXT:    s_addc_u32 s1, s1, 0
38; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
39; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
40; GFX10-NEXT:    s_mov_b32 s0, 0
41; GFX10-NEXT:    s_mov_b32 s1, s0
42; GFX10-NEXT:    s_mov_b32 s2, s0
43; GFX10-NEXT:    s_mov_b32 s3, s0
44; GFX10-NEXT:    v_mov_b32_e32 v0, s0
45; GFX10-NEXT:    v_mov_b32_e32 v1, s1
46; GFX10-NEXT:    v_mov_b32_e32 v2, s2
47; GFX10-NEXT:    v_mov_b32_e32 v3, s3
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
49; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
50; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
51; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
52; GFX10-NEXT:    s_endpgm
53;
54; GFX11-LABEL: zero_init_kernel:
55; GFX11:       ; %bb.0:
56; GFX11-NEXT:    s_mov_b32 s0, 0
57; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
58; GFX11-NEXT:    s_mov_b32 s1, s0
59; GFX11-NEXT:    s_mov_b32 s2, s0
60; GFX11-NEXT:    s_mov_b32 s3, s0
61; GFX11-NEXT:    v_mov_b32_e32 v0, s0
62; GFX11-NEXT:    v_mov_b32_e32 v1, s1
63; GFX11-NEXT:    v_mov_b32_e32 v2, s2
64; GFX11-NEXT:    v_mov_b32_e32 v3, s3
65; GFX11-NEXT:    s_clause 0x3
66; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
67; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
68; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
69; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
70; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
71; GFX11-NEXT:    s_endpgm
72;
73; GFX9-PAL-LABEL: zero_init_kernel:
74; GFX9-PAL:       ; %bb.0:
75; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
76; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
77; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
78; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
79; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
80; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
82; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
83; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
84; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
85; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
86; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
87; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
88; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
89; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
90; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
91; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
92; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
93; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
94; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
95; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
96; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
97; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
98; GFX9-PAL-NEXT:    s_endpgm
99;
100; GFX940-LABEL: zero_init_kernel:
101; GFX940:       ; %bb.0:
102; GFX940-NEXT:    s_mov_b32 s0, 0
103; GFX940-NEXT:    s_mov_b32 s1, s0
104; GFX940-NEXT:    s_mov_b32 s2, s0
105; GFX940-NEXT:    s_mov_b32 s3, s0
106; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
107; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
108; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
109; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
110; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
111; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
112; GFX940-NEXT:    s_endpgm
113;
114; GFX1010-PAL-LABEL: zero_init_kernel:
115; GFX1010-PAL:       ; %bb.0:
116; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
117; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
118; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
119; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
121; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
122; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
123; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
124; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
125; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
126; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
127; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
128; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
129; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
130; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
131; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
132; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
133; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
134; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
135; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
136; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
137; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
138; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
139; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
140; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
141; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
142; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
143; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
144; GFX1010-PAL-NEXT:    s_endpgm
145;
146; GFX1030-PAL-LABEL: zero_init_kernel:
147; GFX1030-PAL:       ; %bb.0:
148; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
149; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
150; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
151; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
153; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
154; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
155; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
156; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
157; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
158; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
159; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
160; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
161; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
162; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
163; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
164; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
165; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
166; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
167; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
168; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
169; GFX1030-PAL-NEXT:    s_endpgm
170;
171; GFX11-PAL-LABEL: zero_init_kernel:
172; GFX11-PAL:       ; %bb.0:
173; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
174; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
175; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
176; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
177; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
178; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
179; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
180; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
181; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
182; GFX11-PAL-NEXT:    s_clause 0x3
183; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
184; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
185; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
186; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
187; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
188; GFX11-PAL-NEXT:    s_endpgm
189  %alloca = alloca [32 x i16], align 2, addrspace(5)
190  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
191  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
192  ret void
193}
194
; Non-kernel function (default calling convention) that zero-fills a 64-byte
; [32 x i16] alloca in addrspace(5) with a single memset call. Every run
; configuration below is expected to emit four 16-byte scratch stores
; addressed off s32 (offsets 48, 32, 16 and 0), wait for them to complete,
; and return through s_setpc_b64 s[30:31].
195define void @zero_init_foo() {
196; GFX9-LABEL: zero_init_foo:
197; GFX9:       ; %bb.0:
198; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GFX9-NEXT:    s_mov_b32 s0, 0
200; GFX9-NEXT:    s_mov_b32 s1, s0
201; GFX9-NEXT:    s_mov_b32 s2, s0
202; GFX9-NEXT:    s_mov_b32 s3, s0
203; GFX9-NEXT:    v_mov_b32_e32 v0, s0
204; GFX9-NEXT:    v_mov_b32_e32 v1, s1
205; GFX9-NEXT:    v_mov_b32_e32 v2, s2
206; GFX9-NEXT:    v_mov_b32_e32 v3, s3
207; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
208; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
209; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
210; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
211; GFX9-NEXT:    s_waitcnt vmcnt(0)
212; GFX9-NEXT:    s_setpc_b64 s[30:31]
213;
214; GFX10-LABEL: zero_init_foo:
215; GFX10:       ; %bb.0:
216; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
218; GFX10-NEXT:    s_mov_b32 s0, 0
219; GFX10-NEXT:    s_mov_b32 s1, s0
220; GFX10-NEXT:    s_mov_b32 s2, s0
221; GFX10-NEXT:    s_mov_b32 s3, s0
222; GFX10-NEXT:    v_mov_b32_e32 v0, s0
223; GFX10-NEXT:    v_mov_b32_e32 v1, s1
224; GFX10-NEXT:    v_mov_b32_e32 v2, s2
225; GFX10-NEXT:    v_mov_b32_e32 v3, s3
226; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
227; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
228; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
229; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
230; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
231; GFX10-NEXT:    s_setpc_b64 s[30:31]
232;
233; GFX11-LABEL: zero_init_foo:
234; GFX11:       ; %bb.0:
235; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
237; GFX11-NEXT:    s_mov_b32 s0, 0
238; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
239; GFX11-NEXT:    s_mov_b32 s1, s0
240; GFX11-NEXT:    s_mov_b32 s2, s0
241; GFX11-NEXT:    s_mov_b32 s3, s0
242; GFX11-NEXT:    v_mov_b32_e32 v0, s0
243; GFX11-NEXT:    v_mov_b32_e32 v1, s1
244; GFX11-NEXT:    v_mov_b32_e32 v2, s2
245; GFX11-NEXT:    v_mov_b32_e32 v3, s3
246; GFX11-NEXT:    s_clause 0x3
247; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
248; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
249; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
250; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
251; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
252; GFX11-NEXT:    s_setpc_b64 s[30:31]
253;
254; GFX9-PAL-LABEL: zero_init_foo:
255; GFX9-PAL:       ; %bb.0:
256; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
258; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
259; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
260; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
261; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
262; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
263; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
264; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
265; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
266; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
267; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
268; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
269; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
270; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
271;
272; GFX940-LABEL: zero_init_foo:
273; GFX940:       ; %bb.0:
274; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275; GFX940-NEXT:    s_mov_b32 s0, 0
276; GFX940-NEXT:    s_mov_b32 s1, s0
277; GFX940-NEXT:    s_mov_b32 s2, s0
278; GFX940-NEXT:    s_mov_b32 s3, s0
279; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
280; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
281; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
282; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
283; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
284; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
285; GFX940-NEXT:    s_waitcnt vmcnt(0)
286; GFX940-NEXT:    s_setpc_b64 s[30:31]
287;
288; GFX10-PAL-LABEL: zero_init_foo:
289; GFX10-PAL:       ; %bb.0:
290; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
291; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
292; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
293; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
294; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
295; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
296; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
297; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
298; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
299; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
300; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
301; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
302; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
303; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
304; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
305; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
306;
307; GFX11-PAL-LABEL: zero_init_foo:
308; GFX11-PAL:       ; %bb.0:
309; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
310; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
311; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
312; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
313; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
314; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
315; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
316; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
317; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
318; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
319; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
320; GFX11-PAL-NEXT:    s_clause 0x3
321; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
322; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
323; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
324; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
325; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
326; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
342  %alloca = alloca [32 x i16], align 2, addrspace(5)
343  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
344  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
345  ret void
346}
347
; Kernel taking an i32 %idx argument. It performs a volatile store of 15 to
; element %idx of a [32 x float] alloca in addrspace(5), then a volatile load
; from element (%idx & 15) of the same alloca. In every run configuration the
; checks expect %idx to be fetched with a scalar load, both addresses to be
; computed with scalar shift/add instructions, and the scratch store/load to
; take that SGPR as their address operand.
348define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
349; GFX9-LABEL: store_load_sindex_kernel:
350; GFX9:       ; %bb.0: ; %bb
351; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
352; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
353; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
354; GFX9-NEXT:    v_mov_b32_e32 v0, 15
355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
357; GFX9-NEXT:    s_and_b32 s0, s0, 15
358; GFX9-NEXT:    s_add_i32 s1, s1, 4
359; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
360; GFX9-NEXT:    scratch_store_dword off, v0, s1
361; GFX9-NEXT:    s_waitcnt vmcnt(0)
362; GFX9-NEXT:    s_add_i32 s0, s0, 4
363; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
364; GFX9-NEXT:    s_waitcnt vmcnt(0)
365; GFX9-NEXT:    s_endpgm
366;
367; GFX10-LABEL: store_load_sindex_kernel:
368; GFX10:       ; %bb.0: ; %bb
369; GFX10-NEXT:    s_add_u32 s2, s2, s5
370; GFX10-NEXT:    s_addc_u32 s3, s3, 0
371; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
372; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
373; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
374; GFX10-NEXT:    v_mov_b32_e32 v0, 15
375; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
376; GFX10-NEXT:    s_and_b32 s1, s0, 15
377; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
378; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
379; GFX10-NEXT:    s_add_i32 s0, s0, 4
380; GFX10-NEXT:    s_add_i32 s1, s1, 4
381; GFX10-NEXT:    scratch_store_dword off, v0, s0
382; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
383; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
384; GFX10-NEXT:    s_waitcnt vmcnt(0)
385; GFX10-NEXT:    s_endpgm
386;
387; GFX11-LABEL: store_load_sindex_kernel:
388; GFX11:       ; %bb.0: ; %bb
389; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
390; GFX11-NEXT:    v_mov_b32_e32 v0, 15
391; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-NEXT:    s_and_b32 s1, s0, 15
393; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
394; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
395; GFX11-NEXT:    s_add_i32 s0, s0, 4
396; GFX11-NEXT:    s_add_i32 s1, s1, 4
397; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
398; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
399; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
400; GFX11-NEXT:    s_waitcnt vmcnt(0)
401; GFX11-NEXT:    s_endpgm
402;
403; GFX9-PAL-LABEL: store_load_sindex_kernel:
404; GFX9-PAL:       ; %bb.0: ; %bb
405; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
406; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
407; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
408; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
409; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
410; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
412; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
413; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
414; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
415; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
416; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
417; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
418; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
419; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
420; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
421; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
422; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
423; GFX9-PAL-NEXT:    s_endpgm
424;
425; GFX940-LABEL: store_load_sindex_kernel:
426; GFX940:       ; %bb.0: ; %bb
427; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
428; GFX940-NEXT:    v_mov_b32_e32 v0, 15
429; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
431; GFX940-NEXT:    s_and_b32 s0, s0, 15
432; GFX940-NEXT:    s_add_i32 s1, s1, 4
433; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
434; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
435; GFX940-NEXT:    s_waitcnt vmcnt(0)
436; GFX940-NEXT:    s_add_i32 s0, s0, 4
437; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
438; GFX940-NEXT:    s_waitcnt vmcnt(0)
439; GFX940-NEXT:    s_endpgm
440;
441; GFX10-PAL-LABEL: store_load_sindex_kernel:
442; GFX10-PAL:       ; %bb.0: ; %bb
443; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
444; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
445; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
446; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
448; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
449; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
450; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
451; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
452; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
453; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
454; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
456; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
457; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
458; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
459; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
460; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
461; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
462; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
463; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
464; GFX10-PAL-NEXT:    s_endpgm
465;
466; GFX11-PAL-LABEL: store_load_sindex_kernel:
467; GFX11-PAL:       ; %bb.0: ; %bb
468; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
469; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
470; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
472; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
473; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
474; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
475; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
476; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
477; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
478; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
479; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
480; GFX11-PAL-NEXT:    s_endpgm
496bb:
497  %i = alloca [32 x float], align 4, addrspace(5)
498  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
499  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
500  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
501  store volatile i32 15, i32 addrspace(5)* %i8, align 4
502  %i9 = and i32 %idx, 15
503  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
504  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
505  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
506  ret void
507}
508
; amdgpu_ps function whose i32 %idx argument is marked inreg. It performs a
; volatile store of 15 to element %idx of a [32 x float] alloca in
; addrspace(5), then a volatile load from element (%idx & 15) of the same
; alloca. In every run configuration the checks expect both addresses to be
; computed with scalar shift/add instructions directly from an incoming SGPR
; and passed as the SGPR address operand of the scratch store/load.
509define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
510; GFX9-LABEL: store_load_sindex_foo:
511; GFX9:       ; %bb.0: ; %bb
512; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
513; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
514; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
515; GFX9-NEXT:    s_add_i32 s0, s0, 4
516; GFX9-NEXT:    v_mov_b32_e32 v0, 15
517; GFX9-NEXT:    scratch_store_dword off, v0, s0
518; GFX9-NEXT:    s_waitcnt vmcnt(0)
519; GFX9-NEXT:    s_and_b32 s0, s2, 15
520; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
521; GFX9-NEXT:    s_add_i32 s0, s0, 4
522; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
523; GFX9-NEXT:    s_waitcnt vmcnt(0)
524; GFX9-NEXT:    s_endpgm
525;
526; GFX10-LABEL: store_load_sindex_foo:
527; GFX10:       ; %bb.0: ; %bb
528; GFX10-NEXT:    s_add_u32 s0, s0, s3
529; GFX10-NEXT:    s_addc_u32 s1, s1, 0
530; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
531; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
532; GFX10-NEXT:    v_mov_b32_e32 v0, 15
533; GFX10-NEXT:    s_and_b32 s0, s2, 15
534; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
535; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
536; GFX10-NEXT:    s_add_i32 s1, s1, 4
537; GFX10-NEXT:    s_add_i32 s0, s0, 4
538; GFX10-NEXT:    scratch_store_dword off, v0, s1
539; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
540; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
541; GFX10-NEXT:    s_waitcnt vmcnt(0)
542; GFX10-NEXT:    s_endpgm
543;
544; GFX11-LABEL: store_load_sindex_foo:
545; GFX11:       ; %bb.0: ; %bb
546; GFX11-NEXT:    v_mov_b32_e32 v0, 15
547; GFX11-NEXT:    s_and_b32 s1, s0, 15
548; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
549; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
550; GFX11-NEXT:    s_add_i32 s0, s0, 4
551; GFX11-NEXT:    s_add_i32 s1, s1, 4
552; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
553; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
554; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
555; GFX11-NEXT:    s_waitcnt vmcnt(0)
556; GFX11-NEXT:    s_endpgm
557;
558; GFX9-PAL-LABEL: store_load_sindex_foo:
559; GFX9-PAL:       ; %bb.0: ; %bb
560; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
561; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
562; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
563; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
564; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
566; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
567; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
568; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
569; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
570; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
571; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
572; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
573; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
574; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
575; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
576; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
577; GFX9-PAL-NEXT:    s_endpgm
578;
579; GFX940-LABEL: store_load_sindex_foo:
580; GFX940:       ; %bb.0: ; %bb
581; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
582; GFX940-NEXT:    s_and_b32 s0, s0, 15
583; GFX940-NEXT:    s_add_i32 s1, s1, 4
584; GFX940-NEXT:    v_mov_b32_e32 v0, 15
585; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
586; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
587; GFX940-NEXT:    s_waitcnt vmcnt(0)
588; GFX940-NEXT:    s_add_i32 s0, s0, 4
589; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
590; GFX940-NEXT:    s_waitcnt vmcnt(0)
591; GFX940-NEXT:    s_endpgm
592;
593; GFX10-PAL-LABEL: store_load_sindex_foo:
594; GFX10-PAL:       ; %bb.0: ; %bb
595; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
596; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
597; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
598; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
600; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
601; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
602; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
603; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
604; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
605; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
606; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
607; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
608; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
609; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
610; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
611; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
612; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
613; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
614; GFX10-PAL-NEXT:    s_endpgm
615;
616; GFX11-PAL-LABEL: store_load_sindex_foo:
617; GFX11-PAL:       ; %bb.0: ; %bb
618; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
619; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
620; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
621; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
622; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
623; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
624; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
625; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
626; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
627; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
628; GFX11-PAL-NEXT:    s_endpgm
642bb:
643  %i = alloca [32 x float], align 4, addrspace(5)
644  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
645  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
646  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
647  store volatile i32 15, i32 addrspace(5)* %i8, align 4
648  %i9 = and i32 %idx, 15
649  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
650  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
651  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
652  ret void
653}
654
; Kernel variant of the VGPR-indexed scratch access test.
; Stores the constant 15 to element [workitem.id.x] of a 32-float private
; (addrspace(5)) array and then loads element [31 - workitem.id.x]. Both
; accesses are volatile. The checks expect a scratch store followed by a
; scratch load, each addressed through a VGPR.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 and %i3 have no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[workitem.id.x].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from the mirrored element %i[31 - workitem.id.x].
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
784
; Callable-function variant of the VGPR-indexed scratch access test.
; Stores the constant 15 to element [%idx] of a 32-float private
; (addrspace(5)) array and then loads element [%idx & 15]. Both accesses are
; volatile. The checks expect a scratch store followed by a scratch load whose
; address is formed from the VGPR index and s32.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
908
; Store through a private (addrspace(5)) pointer argument.
; Writes float 10.0 (bit pattern 0x41200000 in the checks) to element 1 of
; %arg. The checks expect a single scratch store that uses the incoming VGPR
; as the address with the 4-byte element offset as an immediate.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: private_ptr_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Element 1 of a float pointer, i.e. 4 bytes past %arg.
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}
980
; Kernel variant of the zero-init test with a small non-zero frame offset.
; A 256-byte [64 x i32] alloca (%padding) precedes the 64-byte [32 x i16]
; alloca that is zeroed by memset; %padding is read by one volatile load at an
; undef index. The checks expect that load plus four 16-byte scratch stores at
; immediate offsets 272, 288, 304 and 320.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT:    s_endpgm
  ; 256 bytes of padding ahead of the array that gets zeroed.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding at an undef index.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca.
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1186
; Callable-function variant of the zero-init test with a small frame offset.
; A 256-byte [64 x i32] alloca (%padding) precedes the 64-byte [32 x i16]
; alloca that is zeroed by memset; %padding is read by one volatile load at an
; undef index. The checks expect that load plus four 16-byte scratch stores
; based on s32 at immediate offsets 256, 272, 288 and 304.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_small_offset_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_small_offset_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 256 bytes of padding ahead of the array that gets zeroed.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding at an undef index.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca.
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1358
; Kernel variant of the "scalar index, small offset" scratch test. %idx is a
; kernel argument; the expected code below fetches it with a scalar load and
; forms both scratch addresses with scalar (s_*) instructions.
1359define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
1360; GFX9-LABEL: store_load_sindex_small_offset_kernel:
1361; GFX9:       ; %bb.0: ; %bb
1362; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1363; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1364; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1365; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1366; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1367; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1368; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1369; GFX9-NEXT:    s_and_b32 s0, s0, 15
1370; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1371; GFX9-NEXT:    s_addk_i32 s1, 0x104
1372; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1373; GFX9-NEXT:    scratch_store_dword off, v0, s1
1374; GFX9-NEXT:    s_waitcnt vmcnt(0)
1375; GFX9-NEXT:    s_addk_i32 s0, 0x104
1376; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1377; GFX9-NEXT:    s_waitcnt vmcnt(0)
1378; GFX9-NEXT:    s_endpgm
1379;
1380; GFX10-LABEL: store_load_sindex_small_offset_kernel:
1381; GFX10:       ; %bb.0: ; %bb
1382; GFX10-NEXT:    s_add_u32 s2, s2, s5
1383; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1384; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1385; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1386; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1387; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1388; GFX10-NEXT:    s_waitcnt vmcnt(0)
1389; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1390; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1391; GFX10-NEXT:    s_and_b32 s1, s0, 15
1392; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1393; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1394; GFX10-NEXT:    s_addk_i32 s0, 0x104
1395; GFX10-NEXT:    s_addk_i32 s1, 0x104
1396; GFX10-NEXT:    scratch_store_dword off, v0, s0
1397; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1398; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1399; GFX10-NEXT:    s_waitcnt vmcnt(0)
1400; GFX10-NEXT:    s_endpgm
1401;
1402; GFX11-LABEL: store_load_sindex_small_offset_kernel:
1403; GFX11:       ; %bb.0: ; %bb
1404; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
1405; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
1406; GFX11-NEXT:    s_waitcnt vmcnt(0)
1407; GFX11-NEXT:    v_mov_b32_e32 v0, 15
1408; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX11-NEXT:    s_and_b32 s1, s0, 15
1410; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
1411; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
1412; GFX11-NEXT:    s_addk_i32 s0, 0x104
1413; GFX11-NEXT:    s_addk_i32 s1, 0x104
1414; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
1415; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1416; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1417; GFX11-NEXT:    s_waitcnt vmcnt(0)
1418; GFX11-NEXT:    s_endpgm
1419;
1420; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
1421; GFX9-PAL:       ; %bb.0: ; %bb
1422; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1423; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1424; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1425; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1426; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1427; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1428; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1429; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1430; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1431; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1432; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1433; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1434; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1435; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1436; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
1437; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1438; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1439; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1440; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
1441; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1442; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1443; GFX9-PAL-NEXT:    s_endpgm
1444;
1445; GFX940-LABEL: store_load_sindex_small_offset_kernel:
1446; GFX940:       ; %bb.0: ; %bb
1447; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
1448; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
1449; GFX940-NEXT:    s_waitcnt vmcnt(0)
1450; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1451; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1452; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1453; GFX940-NEXT:    s_and_b32 s0, s0, 15
1454; GFX940-NEXT:    s_addk_i32 s1, 0x104
1455; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1456; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1457; GFX940-NEXT:    s_waitcnt vmcnt(0)
1458; GFX940-NEXT:    s_addk_i32 s0, 0x104
1459; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1460; GFX940-NEXT:    s_waitcnt vmcnt(0)
1461; GFX940-NEXT:    s_endpgm
1462;
1463; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
1464; GFX1010-PAL:       ; %bb.0: ; %bb
1465; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
1466; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
1467; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1468; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1469; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1470; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
1471; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
1472; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1473; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1474; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1475; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1476; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1477; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1478; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1479; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1481; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1482; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1483; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
1484; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
1485; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1486; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1487; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1488; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1489; GFX1010-PAL-NEXT:    s_endpgm
1490;
1491; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
1492; GFX1030-PAL:       ; %bb.0: ; %bb
1493; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
1494; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
1495; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1496; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1498; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
1499; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
1500; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1501; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1502; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1503; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1504; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1505; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1506; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1508; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1509; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1510; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
1511; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
1512; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1513; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1514; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1515; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1516; GFX1030-PAL-NEXT:    s_endpgm
1517;
1518; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
1519; GFX11-PAL:       ; %bb.0: ; %bb
1520; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
1521; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
1522; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1523; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
1524; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
1526; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1527; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1528; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
1529; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
1530; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
1531; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1532; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1533; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1534; GFX11-PAL-NEXT:    s_endpgm
1535bb:
  ; 64 x i32 = 256 bytes allocated ahead of %i; the expected code adds 0x104 to
  ; each scaled index (presumably 256 bytes of padding plus a 4-byte base
  ; offset, matching the offset:4 on the padding load - TODO confirm).
1536  %padding = alloca [64 x i32], align 4, addrspace(5)
1537  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present only to
  ; keep %padding from being discarded.
1538  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1539  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced anywhere else in this function.
1540  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
1541  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1542  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1543  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load back from %i[%idx & 15].
1544  %i9 = and i32 %idx, 15
1545  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1546  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1547  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1548  ret void
1549}
1550
; Shader (amdgpu_ps) variant of the "scalar index, small offset" scratch test.
; %idx is an inreg argument; the expected code below reads it directly from an
; SGPR (no s_load) and forms both scratch addresses with scalar instructions.
1551define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
1552; GFX9-LABEL: store_load_sindex_small_offset_foo:
1553; GFX9:       ; %bb.0: ; %bb
1554; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1555; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1556; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1557; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1558; GFX9-NEXT:    s_waitcnt vmcnt(0)
1559; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1560; GFX9-NEXT:    s_addk_i32 s0, 0x104
1561; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1562; GFX9-NEXT:    scratch_store_dword off, v0, s0
1563; GFX9-NEXT:    s_waitcnt vmcnt(0)
1564; GFX9-NEXT:    s_and_b32 s0, s2, 15
1565; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1566; GFX9-NEXT:    s_addk_i32 s0, 0x104
1567; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1568; GFX9-NEXT:    s_waitcnt vmcnt(0)
1569; GFX9-NEXT:    s_endpgm
1570;
1571; GFX10-LABEL: store_load_sindex_small_offset_foo:
1572; GFX10:       ; %bb.0: ; %bb
1573; GFX10-NEXT:    s_add_u32 s0, s0, s3
1574; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1575; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1576; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1577; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1578; GFX10-NEXT:    s_waitcnt vmcnt(0)
1579; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1580; GFX10-NEXT:    s_and_b32 s0, s2, 15
1581; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1582; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1583; GFX10-NEXT:    s_addk_i32 s1, 0x104
1584; GFX10-NEXT:    s_addk_i32 s0, 0x104
1585; GFX10-NEXT:    scratch_store_dword off, v0, s1
1586; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1587; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1588; GFX10-NEXT:    s_waitcnt vmcnt(0)
1589; GFX10-NEXT:    s_endpgm
1590;
1591; GFX11-LABEL: store_load_sindex_small_offset_foo:
1592; GFX11:       ; %bb.0: ; %bb
1593; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
1594; GFX11-NEXT:    s_waitcnt vmcnt(0)
1595; GFX11-NEXT:    v_mov_b32_e32 v0, 15
1596; GFX11-NEXT:    s_and_b32 s1, s0, 15
1597; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
1598; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
1599; GFX11-NEXT:    s_addk_i32 s0, 0x104
1600; GFX11-NEXT:    s_addk_i32 s1, 0x104
1601; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
1602; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1603; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1604; GFX11-NEXT:    s_waitcnt vmcnt(0)
1605; GFX11-NEXT:    s_endpgm
1606;
1607; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
1608; GFX9-PAL:       ; %bb.0: ; %bb
1609; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1610; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1611; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1612; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1613; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1615; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1616; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1617; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1618; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1619; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1620; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1621; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
1622; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1623; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1624; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1625; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1626; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
1627; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1628; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1629; GFX9-PAL-NEXT:    s_endpgm
1630;
1631; GFX940-LABEL: store_load_sindex_small_offset_foo:
1632; GFX940:       ; %bb.0: ; %bb
1633; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
1634; GFX940-NEXT:    s_waitcnt vmcnt(0)
1635; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1636; GFX940-NEXT:    s_and_b32 s0, s0, 15
1637; GFX940-NEXT:    s_addk_i32 s1, 0x104
1638; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1639; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1640; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1641; GFX940-NEXT:    s_waitcnt vmcnt(0)
1642; GFX940-NEXT:    s_addk_i32 s0, 0x104
1643; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1644; GFX940-NEXT:    s_waitcnt vmcnt(0)
1645; GFX940-NEXT:    s_endpgm
1646;
1647; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
1648; GFX1010-PAL:       ; %bb.0: ; %bb
1649; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1650; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1651; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1652; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1653; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1654; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1655; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1656; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1657; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1658; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1659; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1660; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1661; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1662; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1663; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1664; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1665; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
1666; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
1667; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1668; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1669; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1670; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1671; GFX1010-PAL-NEXT:    s_endpgm
1672;
1673; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
1674; GFX1030-PAL:       ; %bb.0: ; %bb
1675; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1676; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1677; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1678; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1680; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1681; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1682; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1683; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1684; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1685; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1686; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1687; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1688; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1689; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1690; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
1691; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
1692; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1693; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1694; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1695; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1696; GFX1030-PAL-NEXT:    s_endpgm
1697;
1698; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
1699; GFX11-PAL:       ; %bb.0: ; %bb
1700; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
1701; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1702; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
1703; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
1704; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1705; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1706; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
1707; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
1708; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
1709; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1710; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1711; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1712; GFX11-PAL-NEXT:    s_endpgm
1713bb:
  ; 64 x i32 = 256 bytes allocated ahead of %i; the expected code adds 0x104 to
  ; each scaled index (presumably 256 bytes of padding plus a 4-byte base
  ; offset, matching the offset:4 on the padding load - TODO confirm).
1714  %padding = alloca [64 x i32], align 4, addrspace(5)
1715  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present only to
  ; keep %padding from being discarded.
1716  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1717  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced anywhere else in this function.
1718  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
1719  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1720  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1721  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load back from %i[%idx & 15].
1722  %i9 = and i32 %idx, 15
1723  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1724  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1725  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1726  ret void
1727}
1728
; Kernel variant of the "vector index, small offset" scratch test. The index
; comes from @llvm.amdgcn.workitem.id.x(); the expected code below computes the
; scratch addresses with vector (v_*) instructions from v0.
1729define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
1730; GFX9-LABEL: store_load_vindex_small_offset_kernel:
1731; GFX9:       ; %bb.0: ; %bb
1732; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1733; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1734; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1735; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1736; GFX9-NEXT:    s_waitcnt vmcnt(0)
1737; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1738; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
1739; GFX9-NEXT:    v_mov_b32_e32 v2, 15
1740; GFX9-NEXT:    scratch_store_dword v1, v2, off
1741; GFX9-NEXT:    s_waitcnt vmcnt(0)
1742; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1743; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1744; GFX9-NEXT:    s_waitcnt vmcnt(0)
1745; GFX9-NEXT:    s_endpgm
1746;
1747; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1748; GFX10:       ; %bb.0: ; %bb
1749; GFX10-NEXT:    s_add_u32 s0, s0, s3
1750; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1751; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1752; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1753; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1754; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1755; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
1756; GFX10-NEXT:    s_waitcnt vmcnt(0)
1757; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1758; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1759; GFX10-NEXT:    scratch_store_dword v1, v2, off
1760; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1761; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1762; GFX10-NEXT:    s_waitcnt vmcnt(0)
1763; GFX10-NEXT:    s_endpgm
1764;
1765; GFX11-LABEL: store_load_vindex_small_offset_kernel:
1766; GFX11:       ; %bb.0: ; %bb
1767; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1768; GFX11-NEXT:    v_mov_b32_e32 v1, 15
1769; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
1770; GFX11-NEXT:    s_waitcnt vmcnt(0)
1771; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
1772; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
1773; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1774; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
1775; GFX11-NEXT:    s_waitcnt vmcnt(0)
1776; GFX11-NEXT:    s_endpgm
1777;
1778; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1779; GFX9-PAL:       ; %bb.0: ; %bb
1780; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1781; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1782; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1783; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1784; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1785; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
1786; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1788; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1789; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1790; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1791; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1792; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
1793; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
1794; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1795; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1796; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1797; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1798; GFX9-PAL-NEXT:    s_endpgm
1799;
1800; GFX940-LABEL: store_load_vindex_small_offset_kernel:
1801; GFX940:       ; %bb.0: ; %bb
1802; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
1803; GFX940-NEXT:    s_waitcnt vmcnt(0)
1804; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1805; GFX940-NEXT:    v_mov_b32_e32 v1, 15
1806; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
1807; GFX940-NEXT:    s_waitcnt vmcnt(0)
1808; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1809; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
1810; GFX940-NEXT:    s_waitcnt vmcnt(0)
1811; GFX940-NEXT:    s_endpgm
1812;
1813; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
1814; GFX1010-PAL:       ; %bb.0: ; %bb
1815; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1816; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1817; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1818; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1819; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1820; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1821; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1822; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1823; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1824; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1825; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
1826; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1827; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
1828; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1829; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1830; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1831; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
1832; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1833; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1834; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1835; GFX1010-PAL-NEXT:    s_endpgm
1836;
1837; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
1838; GFX1030-PAL:       ; %bb.0: ; %bb
1839; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1840; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1841; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1842; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1843; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1844; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1845; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1846; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1847; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1848; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1849; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
1850; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
1851; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1852; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1853; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1854; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
1855; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1856; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1857; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1858; GFX1030-PAL-NEXT:    s_endpgm
1859;
1860; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
1861; GFX11-PAL:       ; %bb.0: ; %bb
1862; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1863; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
1864; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
1865; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1866; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
1867; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
1868; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1869; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
1870; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1871; GFX11-PAL-NEXT:    s_endpgm
1872bb:
  ; 64 x i32 = 256 bytes allocated ahead of %i; the expected code uses 0x104
  ; (260) as the base for the scaled index (presumably 256 bytes of padding
  ; plus a 4-byte base offset, matching the offset:4 on the padding load -
  ; TODO confirm).
1873  %padding = alloca [64 x i32], align 4, addrspace(5)
1874  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present only to
  ; keep %padding from being discarded.
1875  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1876  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 and %i3 are not referenced anywhere else in this function.
1877  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1878  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1879  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to %i[workitem id].
1880  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1881  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1882  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - workitem id]; 31 * 4 = 124 shows up as the
  ; immediate offset on the final scratch load in the expected code.
1883  %i9 = sub nsw i32 31, %i2
1884  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1885  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1886  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1887  ret void
1888}
1889
; Plain-function variant of the "vector index, small offset" scratch test.
; The expected code below takes %idx from v0, addresses scratch relative to
; s32 and returns with s_setpc_b64 s[30:31].
1890define void @store_load_vindex_small_offset_foo(i32 %idx) {
1891; GFX9-LABEL: store_load_vindex_small_offset_foo:
1892; GFX9:       ; %bb.0: ; %bb
1893; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1894; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
1895; GFX9-NEXT:    s_waitcnt vmcnt(0)
1896; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1897; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1898; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1899; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1900; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
1901; GFX9-NEXT:    scratch_store_dword v2, v3, off
1902; GFX9-NEXT:    s_waitcnt vmcnt(0)
1903; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1904; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
1905; GFX9-NEXT:    s_waitcnt vmcnt(0)
1906; GFX9-NEXT:    s_setpc_b64 s[30:31]
1907;
1908; GFX10-LABEL: store_load_vindex_small_offset_foo:
1909; GFX10:       ; %bb.0: ; %bb
1910; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1911; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1912; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
1913; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1914; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1915; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
1916; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1917; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1918; GFX10-NEXT:    s_waitcnt vmcnt(0)
1919; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
1920; GFX10-NEXT:    scratch_store_dword v0, v2, off
1921; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1922; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
1923; GFX10-NEXT:    s_waitcnt vmcnt(0)
1924; GFX10-NEXT:    s_setpc_b64 s[30:31]
1925;
1926; GFX11-LABEL: store_load_vindex_small_offset_foo:
1927; GFX11:       ; %bb.0: ; %bb
1928; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1929; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1930; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
1931; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1932; GFX11-NEXT:    v_mov_b32_e32 v2, 15
1933; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
1934; GFX11-NEXT:    s_waitcnt vmcnt(0)
1935; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
1936; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
1937; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1938; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
1939; GFX11-NEXT:    s_waitcnt vmcnt(0)
1940; GFX11-NEXT:    s_setpc_b64 s[30:31]
1941;
1942; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1943; GFX9-PAL:       ; %bb.0: ; %bb
1944; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1945; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
1946; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1947; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1948; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1949; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1950; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1951; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
1952; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1953; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1954; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1955; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
1956; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1957; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1958;
1959; GFX940-LABEL: store_load_vindex_small_offset_foo:
1960; GFX940:       ; %bb.0: ; %bb
1961; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1962; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
1963; GFX940-NEXT:    s_waitcnt vmcnt(0)
1964; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1965; GFX940-NEXT:    v_mov_b32_e32 v2, 15
1966; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
1967; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
1968; GFX940-NEXT:    s_waitcnt vmcnt(0)
1969; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1970; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
1971; GFX940-NEXT:    s_waitcnt vmcnt(0)
1972; GFX940-NEXT:    s_setpc_b64 s[30:31]
1973;
1974; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1975; GFX10-PAL:       ; %bb.0: ; %bb
1976; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1978; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
1979; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1980; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
1981; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
1982; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1983; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1984; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1985; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
1986; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
1987; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1988; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
1989; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1990; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1991;
1992; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
1993; GFX11-PAL:       ; %bb.0: ; %bb
1994; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1995; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1996; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
1997; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1998; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
1999; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
2000; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2001; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
2002; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
2003; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2004; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
2005; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2006; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
2021bb:
  ; 64 x i32 = 256 bytes allocated ahead of %i; the expected code places the
  ; %i accesses 0x100 / offset:256 above s32.
2022  %padding = alloca [64 x i32], align 4, addrspace(5)
2023  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present only to
  ; keep %padding from being discarded.
2024  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
2025  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced anywhere else in this function.
2026  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
2027  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2028  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2029  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load back from %i[%idx & 15].
2030  %i9 = and i32 %idx, 15
2031  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2032  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2033  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2034  ret void
2035}
2036
; Test: zero-fill a 64-byte private (addrspace(5)) array inside an
; amdgpu_kernel when the array is declared after a 16 KiB padding alloca.
; The expected output below loads the address 0x4010 into vcc_hi/vcc_lo with
; s_movk_i32 and clears the array with four 16-byte scratch stores at
; offsets 0, 16, 32 and 48 from that address.
2037define amdgpu_kernel void @zero_init_large_offset_kernel() {
2038; GFX9-LABEL: zero_init_large_offset_kernel:
2039; GFX9:       ; %bb.0:
2040; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2041; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2042; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2043; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2044; GFX9-NEXT:    s_waitcnt vmcnt(0)
2045; GFX9-NEXT:    s_mov_b32 s0, 0
2046; GFX9-NEXT:    s_mov_b32 s1, s0
2047; GFX9-NEXT:    s_mov_b32 s2, s0
2048; GFX9-NEXT:    s_mov_b32 s3, s0
2049; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2050; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2051; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2052; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2053; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2054; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2055; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2056; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2057; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2058; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2059; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2060; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2061; GFX9-NEXT:    s_endpgm
2062;
2063; GFX10-LABEL: zero_init_large_offset_kernel:
2064; GFX10:       ; %bb.0:
2065; GFX10-NEXT:    s_add_u32 s0, s0, s3
2066; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2067; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2068; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2069; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2070; GFX10-NEXT:    s_waitcnt vmcnt(0)
2071; GFX10-NEXT:    s_mov_b32 s0, 0
2072; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2073; GFX10-NEXT:    s_mov_b32 s1, s0
2074; GFX10-NEXT:    s_mov_b32 s2, s0
2075; GFX10-NEXT:    s_mov_b32 s3, s0
2076; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2077; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2078; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2079; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2080; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2081; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2082; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2083; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2084; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2085; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2086; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2087; GFX10-NEXT:    s_endpgm
2088;
2089; GFX11-LABEL: zero_init_large_offset_kernel:
2090; GFX11:       ; %bb.0:
2091; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2092; GFX11-NEXT:    s_waitcnt vmcnt(0)
2093; GFX11-NEXT:    s_mov_b32 s0, 0
2094; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2095; GFX11-NEXT:    s_mov_b32 s1, s0
2096; GFX11-NEXT:    s_mov_b32 s2, s0
2097; GFX11-NEXT:    s_mov_b32 s3, s0
2098; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2099; GFX11-NEXT:    v_mov_b32_e32 v1, s1
2100; GFX11-NEXT:    v_mov_b32_e32 v2, s2
2101; GFX11-NEXT:    v_mov_b32_e32 v3, s3
2102; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2103; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2104; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2105; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2106; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2107; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2108; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2109; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2110; GFX11-NEXT:    s_endpgm
2111;
2112; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
2113; GFX9-PAL:       ; %bb.0:
2114; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2115; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2116; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2117; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2118; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2119; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2121; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2122; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2123; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2124; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2125; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2126; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2127; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2128; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2129; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2130; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2131; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2132; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2133; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2134; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2135; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2136; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2137; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2138; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2139; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2140; GFX9-PAL-NEXT:    s_endpgm
2141;
2142; GFX940-LABEL: zero_init_large_offset_kernel:
2143; GFX940:       ; %bb.0:
2144; GFX940-NEXT:    scratch_load_dword v0, off, off offset:16 sc0 sc1
2145; GFX940-NEXT:    s_waitcnt vmcnt(0)
2146; GFX940-NEXT:    s_mov_b32 s0, 0
2147; GFX940-NEXT:    s_mov_b32 s1, s0
2148; GFX940-NEXT:    s_mov_b32 s2, s0
2149; GFX940-NEXT:    s_mov_b32 s3, s0
2150; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2151; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2152; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2153; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2154; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2155; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2156; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2157; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2158; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2159; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2160; GFX940-NEXT:    s_endpgm
2161;
2162; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
2163; GFX1010-PAL:       ; %bb.0:
2164; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2165; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2166; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2167; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2169; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2170; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2171; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2172; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2173; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2174; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2175; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
2176; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2177; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2178; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2179; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2180; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2181; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2182; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2183; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2184; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2185; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2186; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2187; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2188; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2189; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2190; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2191; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2192; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2193; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2194; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2195; GFX1010-PAL-NEXT:    s_endpgm
2196;
2197; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
2198; GFX1030-PAL:       ; %bb.0:
2199; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2200; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2201; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2202; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2204; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2205; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2206; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2207; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2208; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2209; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2210; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2211; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2212; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2213; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2214; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2215; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2216; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2217; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2218; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2219; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2220; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2221; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2222; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2223; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2224; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2225; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2226; GFX1030-PAL-NEXT:    s_endpgm
2227;
2228; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
2229; GFX11-PAL:       ; %bb.0:
2230; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2231; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2232; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2233; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2234; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2235; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2236; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2237; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
2238; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
2239; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
2240; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
2241; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2242; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2243; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2244; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2245; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2246; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2247; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2248; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2249; GFX11-PAL-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of the array
  ; under test.
2250  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes; this is the object that gets zeroed.
2251  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding through an undef index; the result is not
  ; used. (review: presumably present only so %padding is not dropped as
  ; unused - confirm.)
2252  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2253  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Write 64 zero bytes over %alloca (non-volatile memset, destination align 2).
2254  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2255  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2256  ret void
2257}
2258
; Test: zero-fill a 64-byte private (addrspace(5)) array inside a plain
; (non-kernel) function when the array is declared after a 16 KiB padding
; alloca. The expected output below forms the array address as s32 + 0x4010
; with s_add_i32 for each of the four 16-byte scratch stores (offsets 0, 16,
; 32 and 48) and returns with s_setpc_b64 s[30:31].
2259define void @zero_init_large_offset_foo() {
2260; GFX9-LABEL: zero_init_large_offset_foo:
2261; GFX9:       ; %bb.0:
2262; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2263; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2264; GFX9-NEXT:    s_waitcnt vmcnt(0)
2265; GFX9-NEXT:    s_mov_b32 s0, 0
2266; GFX9-NEXT:    s_mov_b32 s1, s0
2267; GFX9-NEXT:    s_mov_b32 s2, s0
2268; GFX9-NEXT:    s_mov_b32 s3, s0
2269; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2270; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2271; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2272; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2273; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2274; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2275; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2276; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2277; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2278; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2279; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2280; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2281; GFX9-NEXT:    s_waitcnt vmcnt(0)
2282; GFX9-NEXT:    s_setpc_b64 s[30:31]
2283;
2284; GFX10-LABEL: zero_init_large_offset_foo:
2285; GFX10:       ; %bb.0:
2286; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2287; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2288; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2289; GFX10-NEXT:    s_waitcnt vmcnt(0)
2290; GFX10-NEXT:    s_mov_b32 s0, 0
2291; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2292; GFX10-NEXT:    s_mov_b32 s1, s0
2293; GFX10-NEXT:    s_mov_b32 s2, s0
2294; GFX10-NEXT:    s_mov_b32 s3, s0
2295; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2296; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2297; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2298; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2299; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2300; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2301; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2302; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2303; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2304; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2305; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2306; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2307; GFX10-NEXT:    s_setpc_b64 s[30:31]
2308;
2309; GFX11-LABEL: zero_init_large_offset_foo:
2310; GFX11:       ; %bb.0:
2311; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2312; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2313; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2314; GFX11-NEXT:    s_waitcnt vmcnt(0)
2315; GFX11-NEXT:    s_mov_b32 s0, 0
2316; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2317; GFX11-NEXT:    s_mov_b32 s1, s0
2318; GFX11-NEXT:    s_mov_b32 s2, s0
2319; GFX11-NEXT:    s_mov_b32 s3, s0
2320; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2321; GFX11-NEXT:    v_mov_b32_e32 v1, s1
2322; GFX11-NEXT:    v_mov_b32_e32 v2, s2
2323; GFX11-NEXT:    v_mov_b32_e32 v3, s3
2324; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2325; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2326; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2327; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2328; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2329; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2330; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2331; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2332; GFX11-NEXT:    s_setpc_b64 s[30:31]
2333;
2334; GFX9-PAL-LABEL: zero_init_large_offset_foo:
2335; GFX9-PAL:       ; %bb.0:
2336; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2337; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2338; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2339; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2340; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2341; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2342; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2343; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2344; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2345; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2346; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2347; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2348; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2349; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2350; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2351; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2352; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2353; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2354; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2355; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2356; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2357;
2358; GFX940-LABEL: zero_init_large_offset_foo:
2359; GFX940:       ; %bb.0:
2360; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2361; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:16 sc0 sc1
2362; GFX940-NEXT:    s_waitcnt vmcnt(0)
2363; GFX940-NEXT:    s_mov_b32 s0, 0
2364; GFX940-NEXT:    s_mov_b32 s1, s0
2365; GFX940-NEXT:    s_mov_b32 s2, s0
2366; GFX940-NEXT:    s_mov_b32 s3, s0
2367; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2368; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2369; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2370; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2371; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2372; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2373; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2374; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2375; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2376; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2377; GFX940-NEXT:    s_waitcnt vmcnt(0)
2378; GFX940-NEXT:    s_setpc_b64 s[30:31]
2379;
2380; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
2381; GFX1010-PAL:       ; %bb.0:
2382; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2383; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2384; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2385; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2386; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2387; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2388; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2389; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2390; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2391; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2392; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2393; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2394; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2395; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2396; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2397; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2398; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2399; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2400; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2401; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2402; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2403; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2404; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2405; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2406; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2407;
2408; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
2409; GFX1030-PAL:       ; %bb.0:
2410; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2411; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2412; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2413; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2414; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2415; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2416; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2417; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2418; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2419; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2420; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2421; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2422; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2423; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2424; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2425; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2426; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2427; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2428; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2429; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2430; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2431; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2432;
2433; GFX11-PAL-LABEL: zero_init_large_offset_foo:
2434; GFX11-PAL:       ; %bb.0:
2435; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2436; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2437; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2438; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2439; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2440; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2441; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2442; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2443; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2444; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
2445; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
2446; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
2447; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
2448; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2449; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2450; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2451; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2452; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2453; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2454; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2455; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2456; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of the array
  ; under test.
2457  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes; this is the object that gets zeroed.
2458  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding through an undef index; the result is not
  ; used. (review: presumably present only so %padding is not dropped as
  ; unused - confirm.)
2459  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2460  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Write 64 zero bytes over %alloca (non-volatile memset, destination align 2).
2461  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2462  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2463  ret void
2464}
2465
; Test: store to and then load from a 32 x float private (addrspace(5)) array
; at an index taken from the kernel argument %idx, with the array declared
; after a 16 KiB padding alloca. In the expected output below the index is
; fetched with a scalar load, scaled by 4 (s_lshl_b32 by 2) and offset by
; 0x4004 (s_addk_i32) to form the scalar address operand of the scratch
; store; the load uses the same computation on the index masked with 15.
2466define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
2467; GFX9-LABEL: store_load_sindex_large_offset_kernel:
2468; GFX9:       ; %bb.0: ; %bb
2469; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
2470; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
2471; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2472; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2473; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2474; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2475; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
2476; GFX9-NEXT:    s_and_b32 s0, s0, 15
2477; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2478; GFX9-NEXT:    s_addk_i32 s1, 0x4004
2479; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2480; GFX9-NEXT:    scratch_store_dword off, v0, s1
2481; GFX9-NEXT:    s_waitcnt vmcnt(0)
2482; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2483; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2484; GFX9-NEXT:    s_waitcnt vmcnt(0)
2485; GFX9-NEXT:    s_endpgm
2486;
2487; GFX10-LABEL: store_load_sindex_large_offset_kernel:
2488; GFX10:       ; %bb.0: ; %bb
2489; GFX10-NEXT:    s_add_u32 s2, s2, s5
2490; GFX10-NEXT:    s_addc_u32 s3, s3, 0
2491; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2492; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2493; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
2494; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2495; GFX10-NEXT:    s_waitcnt vmcnt(0)
2496; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2497; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX10-NEXT:    s_and_b32 s1, s0, 15
2499; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2500; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
2501; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2502; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2503; GFX10-NEXT:    scratch_store_dword off, v0, s0
2504; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2505; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2506; GFX10-NEXT:    s_waitcnt vmcnt(0)
2507; GFX10-NEXT:    s_endpgm
2508;
2509; GFX11-LABEL: store_load_sindex_large_offset_kernel:
2510; GFX11:       ; %bb.0: ; %bb
2511; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
2512; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2513; GFX11-NEXT:    s_waitcnt vmcnt(0)
2514; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2515; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2516; GFX11-NEXT:    s_and_b32 s1, s0, 15
2517; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2518; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2519; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2520; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2521; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2522; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2523; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2524; GFX11-NEXT:    s_waitcnt vmcnt(0)
2525; GFX11-NEXT:    s_endpgm
2526;
2527; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
2528; GFX9-PAL:       ; %bb.0: ; %bb
2529; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
2530; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2531; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2532; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2533; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2534; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2535; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2536; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2537; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2538; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2539; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2540; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2541; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2542; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2543; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2544; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2545; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2546; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2547; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2548; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2549; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2550; GFX9-PAL-NEXT:    s_endpgm
2551;
2552; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2553; GFX940:       ; %bb.0: ; %bb
2554; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
2555; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2556; GFX940-NEXT:    s_waitcnt vmcnt(0)
2557; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2558; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2559; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2560; GFX940-NEXT:    s_and_b32 s0, s0, 15
2561; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2562; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2563; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2564; GFX940-NEXT:    s_waitcnt vmcnt(0)
2565; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2566; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2567; GFX940-NEXT:    s_waitcnt vmcnt(0)
2568; GFX940-NEXT:    s_endpgm
2569;
2570; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2571; GFX1010-PAL:       ; %bb.0: ; %bb
2572; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
2573; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
2574; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2575; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2576; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2577; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
2578; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
2579; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2580; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2581; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2582; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2583; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2584; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2585; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2586; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2587; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2588; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2589; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2590; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2591; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2592; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2593; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2594; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2595; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2596; GFX1010-PAL-NEXT:    s_endpgm
2597;
2598; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2599; GFX1030-PAL:       ; %bb.0: ; %bb
2600; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
2601; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
2602; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2603; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2604; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2605; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
2606; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
2607; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2608; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2609; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2610; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2611; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2612; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2613; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2614; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2615; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2616; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2617; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2618; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2619; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2620; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2621; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2622; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2623; GFX1030-PAL-NEXT:    s_endpgm
2624;
2625; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
2626; GFX11-PAL:       ; %bb.0: ; %bb
2627; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
2628; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2629; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2630; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2631; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2632; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2633; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2634; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2635; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2636; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2637; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2638; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2639; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2640; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2641; GFX11-PAL-NEXT:    s_endpgm
2642bb:
  ; 4096 x i32 = 16384 bytes (0x4000) of padding declared ahead of the array
  ; under test.
2643  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x float = 128 bytes; the array that is indexed by %idx below.
2644  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding through an undef index; the result is not
  ; used. (review: presumably present only so %padding is not dropped as
  ; unused - confirm.)
2645  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2646  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no uses in this function.
2647  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i, accessed as i32.
2648  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2649  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2650  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Mask the index to the range 0..15 and read that element back with a
  ; volatile i32 load; the loaded value %i12 is not used.
2651  %i9 = and i32 %idx, 15
2652  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2653  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2654  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2655  ret void
2656}
2657
; Pixel-shader entry point that stores to and reloads from a small scratch
; array through a dynamic index passed as an inreg argument. The array is
; allocated after a 4096 x i32 (0x4000 byte) padding array; in the expected
; output below its base shows up as the constant 0x4004, which is added to the
; scaled index with a scalar add before each scratch access.
define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x4004
; GFX10-NEXT:    s_addk_i32 s0, 0x4004
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_large_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x4004
; GFX11-NEXT:    s_addk_i32 s1, 0x4004
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x4004
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  ; 4096 x i32 = 0x4000 bytes of padding in front of %i. The volatile load
  ; below gives it a use (presumably so it is kept and pushes %i to a large
  ; scratch offset -- confirm against the test's intent).
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2835
; Kernel that indexes a small scratch array with the workitem id: it stores 15
; to %i[tid] and reloads %i[31 - tid]. The array is allocated after a
; 4096 x i32 (0x4000 byte) padding array; in the expected output below its base
; appears as the constant 0x4004, and the 31 * 4 = 124 byte part of the reload
; address appears as offset:124 on the final scratch load.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_large_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4004
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  ; 4096 x i32 = 0x4000 bytes of padding in front of %i. The volatile load
  ; below gives it a use (presumably so it is kept and pushes %i to a large
  ; scratch offset -- confirm against the test's intent).
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; %i2 is the workitem id in x; it is the dynamic index for both accesses.
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 has no users in this function.
  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[%i2].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from the mirrored element %i[31 - %i2].
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2999
; Callable-function variant of the large-offset scratch test with the dynamic
; index passed as an ordinary i32 argument. It stores 15 to %i[%idx] and
; reloads %i[%idx & 15], where %i is allocated after a 4096 x i32 (0x4000 byte)
; padding array. In the expected output below all scratch addresses are formed
; from s32, and the array base appears as s32 + 0x4004.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_large_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; 4096 x i32 = 0x4000 bytes of padding in front of %i. The volatile load
  ; below gives it a use (presumably so it is kept and pushes %i to a large
  ; scratch offset -- confirm against the test's intent).
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
3154
; Kernel that accesses a fixed element far into a 4096 x i32 scratch array: it
; stores 13 at an undef index, then stores 15 to element 4000 (byte offset
; 16000) and reloads that element. In the expected output below the resulting
; address 16000 + 4 = 16004 is split between a materialised base and the
; instruction's offset field, e.g. 0x3000 + 4 + 3712, 0x3800 + 4 + 1664, or
; 0x3000 + 3716 depending on the target.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 13
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3000
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x3000
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  ; Single 4096 x i32 (0x4000 byte) scratch array; all accesses are volatile.
  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Store 13 at an undef index (the expected output places it at offset 4).
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Store 15 to element 4000, i.e. 4000 * 4 = 16000 bytes into the array.
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Reload the same element through a second, identical GEP.
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}
3308
; Callable (non-kernel) function that does a volatile store at an undef index
; and then a volatile store/load pair at element 4000 (byte offset 16000) of a
; [4096 x i32] addrspace(5) alloca. The check blocks below expect that large
; constant offset to be split between a register operand and the immediate
; offset field of the scratch instruction.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_large_imm_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 13
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3000
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x3000
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}
3442
; Kernel that indexes a [32 x i32] addrspace(5) alloca with
; %sidx + workitem.id.x + 256 and does a volatile store then a volatile load
; at that element. The check blocks expect the constant part of the index
; (256 elements, 1024 bytes) to end up in the immediate offset of the scratch
; store and load; some targets show 1028 instead, which appears to also fold
; in the 4-byte base that the other targets add with v_lshl_add_u32.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vidx_sidx_offset:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}
3575
; Volatile i64 store of 15 followed by a volatile i64 load through an
; addrspace(5) pointer argument, both with align 8. Every check block expects
; one 64-bit scratch store and one 64-bit scratch load.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_aligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_aligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}
3672
; Volatile i64 store of 15 followed by a volatile i64 load through an
; addrspace(5) pointer argument, both marked align 1. Despite the byte
; alignment, every check block still expects a single 64-bit scratch store
; and a single 64-bit scratch load rather than narrower accesses.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}
3769
; Volatile <3 x i32> store of <1, 2, 3> followed by a volatile <3 x i32> load
; through an addrspace(5) pointer argument, both marked align 1. Every check
; block expects a single 96-bit scratch store and a single 96-bit scratch
; load rather than narrower accesses.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v3i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}
3874
; Volatile <4 x i32> store of <1, 2, 3, 4> followed by a volatile <4 x i32>
; load through an addrspace(5) pointer argument, both marked align 1. Every
; check block expects a single 128-bit scratch store and a single 128-bit
; scratch load rather than narrower accesses.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v4i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-NEXT:    v_mov_b32_e32 v4, 4
; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}
3987
3988define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
3989; GFX9-LABEL: store_load_i32_negative_unaligned:
3990; GFX9:       ; %bb.0: ; %bb
3991; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3992; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
3993; GFX9-NEXT:    v_mov_b32_e32 v1, 1
3994; GFX9-NEXT:    scratch_store_byte v0, v1, off
3995; GFX9-NEXT:    s_waitcnt vmcnt(0)
3996; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
3997; GFX9-NEXT:    s_waitcnt vmcnt(0)
3998; GFX9-NEXT:    s_setpc_b64 s[30:31]
3999;
4000; GFX10-LABEL: store_load_i32_negative_unaligned:
4001; GFX10:       ; %bb.0: ; %bb
4002; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4003; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4004; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4005; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
4006; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4007; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
4008; GFX10-NEXT:    s_waitcnt vmcnt(0)
4009; GFX10-NEXT:    s_setpc_b64 s[30:31]
4010;
4011; GFX11-LABEL: store_load_i32_negative_unaligned:
4012; GFX11:       ; %bb.0: ; %bb
4013; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4014; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4015; GFX11-NEXT:    v_mov_b32_e32 v1, 1
4016; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
4017; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4018; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4019; GFX11-NEXT:    s_waitcnt vmcnt(0)
4020; GFX11-NEXT:    s_setpc_b64 s[30:31]
4021;
4022; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
4023; GFX9-PAL:       ; %bb.0: ; %bb
4024; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4025; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
4026; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4027; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4028; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4029; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4030; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4031; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4032;
4033; GFX940-LABEL: store_load_i32_negative_unaligned:
4034; GFX940:       ; %bb.0: ; %bb
4035; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4036; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
4037; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4038; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
4039; GFX940-NEXT:    s_waitcnt vmcnt(0)
4040; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
4041; GFX940-NEXT:    s_waitcnt vmcnt(0)
4042; GFX940-NEXT:    s_setpc_b64 s[30:31]
4043;
4044; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
4045; GFX1010-PAL:       ; %bb.0: ; %bb
4046; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4047; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4048; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
4049; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4050; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
4051; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4052; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
4053; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4054; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4055;
4056; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
4057; GFX1030-PAL:       ; %bb.0: ; %bb
4058; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4059; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4060; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4061; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
4062; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4063; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
4064; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4065; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4066;
4067; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
4068; GFX11-PAL:       ; %bb.0: ; %bb
4069; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4070; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4071; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
4072; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
4073; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4074; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4075; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4076; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4077bb:
4078  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
4079  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
4080  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
4081  ret void
4082}
4083
; Volatile byte store followed by a volatile byte load through an
; addrspace(5) pointer at a large negative, odd byte offset (%arg - 4225).
; Despite the "i32" in the function name, both accesses are i8 with align 1.
;
; -4225 is 0xffffef7f as a 32-bit value. The autogenerated checks below record
; how each run line is expected to split that displacement between an address
; computation and the scratch instruction's immediate offset field:
;   * GFX9 / GFX9-PAL: whole value added to v0 (0xffffef7f), no immediate.
;   * GFX10 / GFX11 / GFX1030-PAL / GFX11-PAL: add 0xfffff000 (-4096), then
;     offset:-129.
;   * GFX1010-PAL: add 0xffffefff (-4097), then offset:-128 -- presumably to
;     avoid an odd negative immediate on that target; TODO confirm against the
;     backend's scratch offset legality rules.
;   * GFX940: s_movk_i32 s0, 0xef7f (sign-extended 16-bit immediate, i.e.
;     -4225) used as the scalar offset operand of the scratch instructions.
;
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing by hand.
4084define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
4085; GFX9-LABEL: store_load_i32_large_negative_unaligned:
4086; GFX9:       ; %bb.0: ; %bb
4087; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4088; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4089; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4090; GFX9-NEXT:    scratch_store_byte v0, v1, off
4091; GFX9-NEXT:    s_waitcnt vmcnt(0)
4092; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
4093; GFX9-NEXT:    s_waitcnt vmcnt(0)
4094; GFX9-NEXT:    s_setpc_b64 s[30:31]
4095;
4096; GFX10-LABEL: store_load_i32_large_negative_unaligned:
4097; GFX10:       ; %bb.0: ; %bb
4098; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4099; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4100; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4101; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4102; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
4103; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4104; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4105; GFX10-NEXT:    s_waitcnt vmcnt(0)
4106; GFX10-NEXT:    s_setpc_b64 s[30:31]
4107;
4108; GFX11-LABEL: store_load_i32_large_negative_unaligned:
4109; GFX11:       ; %bb.0: ; %bb
4110; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4111; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4112; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4113; GFX11-NEXT:    v_mov_b32_e32 v1, 1
4114; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4115; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4116; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4117; GFX11-NEXT:    s_waitcnt vmcnt(0)
4118; GFX11-NEXT:    s_setpc_b64 s[30:31]
4119;
4120; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
4121; GFX9-PAL:       ; %bb.0: ; %bb
4122; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4123; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4124; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4125; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4126; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4127; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4128; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4129; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4130;
4131; GFX940-LABEL: store_load_i32_large_negative_unaligned:
4132; GFX940:       ; %bb.0: ; %bb
4133; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4134; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
4135; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4136; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
4137; GFX940-NEXT:    s_waitcnt vmcnt(0)
4138; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
4139; GFX940-NEXT:    s_waitcnt vmcnt(0)
4140; GFX940-NEXT:    s_setpc_b64 s[30:31]
4141;
4142; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
4143; GFX1010-PAL:       ; %bb.0: ; %bb
4144; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4145; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4146; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
4147; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4148; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
4149; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4150; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
4151; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4152; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4153;
4154; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
4155; GFX1030-PAL:       ; %bb.0: ; %bb
4156; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4157; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4158; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4159; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4160; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
4161; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4162; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4163; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4164; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4165;
4166; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
4167; GFX11-PAL:       ; %bb.0: ; %bb
4168; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4169; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4170; GFX11-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4171; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
4172; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4173; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4174; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4175; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4176; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4177bb:
  ; Address under test: 4225 bytes below %arg (4096 + 129).
4178  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  ; Both accesses are volatile so the store and the otherwise unused load are
  ; kept in the generated code.
4179  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
4180  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
4181  ret void
4182}
4183
; amdgpu_ps shader with two 2048-byte private allocas (128 x <4 x i32> each).
; It volatile-stores a zero <4 x i32> to element 60 of the second alloca,
; volatile-loads it back, and then passes the address of each alloca to an
; inline asm "use".
;
; What the autogenerated checks below pin down:
;   * The two asm operands are materialized as 16 and 0x810, i.e. the second
;     alloca starts 0x800 (2048) bytes after the first.
;   * The accessed element is at 0x810 + 60 * 16 = 0x810 + 0x3c0 = 3024.
;     GFX9, GFX9-PAL, GFX940, GFX11 and GFX11-PAL encode it as an immediate
;     (offset:3024); the GFX10 and GFX10-PAL runs build it in s0 instead
;     (s_movk_i32 0x810 followed by s_addk_i32 0x3c0) and use no immediate.
;   * The gfx1010 and gfx1030 PAL run lines share the common GFX10-PAL prefix
;     for this function.
;
; NOTE(review): the asm constraint is "s", yet the expected output shows the
; operands in v0/v1 -- confirm this is the intended behaviour being tested.
; NOTE(review): attribute group #0 is referenced on the asm calls but no
; definition is visible in this part of the file -- verify.
;
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing by hand.
4184define amdgpu_ps void @large_offset() {
4185; GFX9-LABEL: large_offset:
4186; GFX9:       ; %bb.0: ; %bb
4187; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
4188; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4189; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
4190; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4191; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4192; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4193; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4194; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4195; GFX9-NEXT:    s_waitcnt vmcnt(0)
4196; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4197; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4198; GFX9-NEXT:    s_waitcnt vmcnt(0)
4199; GFX9-NEXT:    v_mov_b32_e32 v0, 16
4200; GFX9-NEXT:    ;;#ASMSTART
4201; GFX9-NEXT:    ; use v0
4202; GFX9-NEXT:    ;;#ASMEND
4203; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
4204; GFX9-NEXT:    ;;#ASMSTART
4205; GFX9-NEXT:    ; use v0
4206; GFX9-NEXT:    ;;#ASMEND
4207; GFX9-NEXT:    s_endpgm
4208;
4209; GFX10-LABEL: large_offset:
4210; GFX10:       ; %bb.0: ; %bb
4211; GFX10-NEXT:    s_add_u32 s0, s0, s2
4212; GFX10-NEXT:    s_addc_u32 s1, s1, 0
4213; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
4214; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
4215; GFX10-NEXT:    v_mov_b32_e32 v0, 0
4216; GFX10-NEXT:    s_movk_i32 s0, 0x810
4217; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
4218; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4219; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4220; GFX10-NEXT:    v_mov_b32_e32 v3, v0
4221; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4222; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4223; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4224; GFX10-NEXT:    s_waitcnt vmcnt(0)
4225; GFX10-NEXT:    v_mov_b32_e32 v0, 16
4226; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
4227; GFX10-NEXT:    ;;#ASMSTART
4228; GFX10-NEXT:    ; use v0
4229; GFX10-NEXT:    ;;#ASMEND
4230; GFX10-NEXT:    ;;#ASMSTART
4231; GFX10-NEXT:    ; use v1
4232; GFX10-NEXT:    ;;#ASMEND
4233; GFX10-NEXT:    s_endpgm
4234;
4235; GFX11-LABEL: large_offset:
4236; GFX11:       ; %bb.0: ; %bb
4237; GFX11-NEXT:    v_mov_b32_e32 v0, 0
4238; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4239; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4240; GFX11-NEXT:    v_mov_b32_e32 v2, v0
4241; GFX11-NEXT:    v_mov_b32_e32 v3, v0
4242; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4243; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4244; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4245; GFX11-NEXT:    s_waitcnt vmcnt(0)
4246; GFX11-NEXT:    v_mov_b32_e32 v0, 16
4247; GFX11-NEXT:    v_mov_b32_e32 v1, 0x810
4248; GFX11-NEXT:    ;;#ASMSTART
4249; GFX11-NEXT:    ; use v0
4250; GFX11-NEXT:    ;;#ASMEND
4251; GFX11-NEXT:    ;;#ASMSTART
4252; GFX11-NEXT:    ; use v1
4253; GFX11-NEXT:    ;;#ASMEND
4254; GFX11-NEXT:    s_endpgm
4255;
4256; GFX9-PAL-LABEL: large_offset:
4257; GFX9-PAL:       ; %bb.0: ; %bb
4258; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
4259; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
4260; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4261; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
4262; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
4263; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
4264; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
4265; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4266; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4267; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
4268; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
4269; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4270; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4271; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4272; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4273; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4274; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4275; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
4276; GFX9-PAL-NEXT:    ;;#ASMSTART
4277; GFX9-PAL-NEXT:    ; use v0
4278; GFX9-PAL-NEXT:    ;;#ASMEND
4279; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
4280; GFX9-PAL-NEXT:    ;;#ASMSTART
4281; GFX9-PAL-NEXT:    ; use v0
4282; GFX9-PAL-NEXT:    ;;#ASMEND
4283; GFX9-PAL-NEXT:    s_endpgm
4284;
4285; GFX940-LABEL: large_offset:
4286; GFX940:       ; %bb.0: ; %bb
4287; GFX940-NEXT:    v_mov_b32_e32 v0, 0
4288; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4289; GFX940-NEXT:    v_mov_b32_e32 v2, v0
4290; GFX940-NEXT:    v_mov_b32_e32 v3, v0
4291; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
4292; GFX940-NEXT:    s_waitcnt vmcnt(0)
4293; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
4294; GFX940-NEXT:    s_waitcnt vmcnt(0)
4295; GFX940-NEXT:    v_mov_b32_e32 v0, 16
4296; GFX940-NEXT:    ;;#ASMSTART
4297; GFX940-NEXT:    ; use v0
4298; GFX940-NEXT:    ;;#ASMEND
4299; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
4300; GFX940-NEXT:    ;;#ASMSTART
4301; GFX940-NEXT:    ; use v0
4302; GFX940-NEXT:    ;;#ASMEND
4303; GFX940-NEXT:    s_endpgm
4304;
4305; GFX10-PAL-LABEL: large_offset:
4306; GFX10-PAL:       ; %bb.0: ; %bb
4307; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
4308; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
4309; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4310; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4311; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4312; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
4313; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
4314; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4315; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4316; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
4317; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
4318; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
4319; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
4320; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
4321; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
4322; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4323; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4324; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4325; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4326; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
4327; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
4328; GFX10-PAL-NEXT:    ;;#ASMSTART
4329; GFX10-PAL-NEXT:    ; use v0
4330; GFX10-PAL-NEXT:    ;;#ASMEND
4331; GFX10-PAL-NEXT:    ;;#ASMSTART
4332; GFX10-PAL-NEXT:    ; use v1
4333; GFX10-PAL-NEXT:    ;;#ASMEND
4334; GFX10-PAL-NEXT:    s_endpgm
4335;
4336; GFX11-PAL-LABEL: large_offset:
4337; GFX11-PAL:       ; %bb.0: ; %bb
4338; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
4339; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4340; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
4341; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
4342; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
4343; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4344; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4345; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4346; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4347; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 16
4348; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
4349; GFX11-PAL-NEXT:    ;;#ASMSTART
4350; GFX11-PAL-NEXT:    ; use v0
4351; GFX11-PAL-NEXT:    ;;#ASMEND
4352; GFX11-PAL-NEXT:    ;;#ASMSTART
4353; GFX11-PAL-NEXT:    ; use v1
4354; GFX11-PAL-NEXT:    ;;#ASMEND
4355; GFX11-PAL-NEXT:    s_endpgm
4356bb:
  ; Two 128 * 16 = 2048-byte allocas; only %alloca2 is accessed, so its
  ; elements sit at a large displacement from the start of the frame.
4357  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
4358  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  ; Element 60 of %alloca2 is 60 * 16 = 960 (0x3c0) bytes into that alloca.
4359  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
  ; Volatile so the store and the otherwise unused load both stay in the
  ; generated code.
4360  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
4361  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
  ; The side-effecting asm statements use the address of each alloca, which
  ; makes the two base offsets visible in the output.
4362  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
4363  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
4364  ret void
4365}
4366
; Declarations of the LLVM intrinsics referenced by this test file. Neither is
; called in the functions immediately above; presumably they are used by tests
; earlier in the file -- verify before removing.
4367declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
4368declare i32 @llvm.amdgcn.workitem.id.x()
4369