; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
10
; Kernel entry point that zero-fills a 64-byte [32 x i16] alloca in
; addrspace(5) through llvm.memset. Each checked configuration expects the
; memset to be expanded into four 16-byte scratch stores (dwordx4 / b128) at
; offsets 64, 48, 32 and 16, preceded by whatever flat-scratch setup that
; target emits in the kernel prologue.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
190
; Callable (non-kernel) counterpart of the zero-init test: zero-fills a
; 64-byte [32 x i16] alloca in addrspace(5) through llvm.memset. Each checked
; configuration expects four 16-byte scratch stores addressed from s32 at
; offsets 48, 32, 16 and 0, followed by a wait and a return through
; s_setpc_b64 s[30:31]. No flat-scratch setup is expected in this function.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
341
; Kernel that indexes a [32 x float] addrspace(5) alloca with a uniform i32
; kernel argument: a volatile store of 15 to element %idx, then a volatile
; load from element (%idx & 15). The checks expect both addresses to be formed
; with scalar instructions (shift left by 2, add 4) and handed to the scratch
; store/load as their scalar register operand, with the store completed
; (waitcnt) before the load is issued.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
502
; amdgpu_ps variant of the scalar-index test: %idx arrives as an inreg
; argument rather than being loaded from kernel arguments. The body is the
; same volatile store of 15 to element %idx followed by a volatile load from
; element (%idx & 15) of a [32 x float] addrspace(5) alloca. The checks expect
; the addresses to be formed with scalar shift/add instructions and used as
; the scalar register operand of the scratch store/load.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
648
649define amdgpu_kernel void @store_load_vindex_kernel() {
650; GFX9-LABEL: store_load_vindex_kernel:
651; GFX9:       ; %bb.0: ; %bb
652; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
653; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
654; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
655; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
656; GFX9-NEXT:    v_mov_b32_e32 v2, 15
657; GFX9-NEXT:    scratch_store_dword v1, v2, off
658; GFX9-NEXT:    s_waitcnt vmcnt(0)
659; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
660; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
661; GFX9-NEXT:    s_waitcnt vmcnt(0)
662; GFX9-NEXT:    s_endpgm
663;
664; GFX10-LABEL: store_load_vindex_kernel:
665; GFX10:       ; %bb.0: ; %bb
666; GFX10-NEXT:    s_add_u32 s0, s0, s3
667; GFX10-NEXT:    s_addc_u32 s1, s1, 0
668; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
669; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
670; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
671; GFX10-NEXT:    v_mov_b32_e32 v2, 15
672; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
673; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
674; GFX10-NEXT:    scratch_store_dword v1, v2, off
675; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
676; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
677; GFX10-NEXT:    s_waitcnt vmcnt(0)
678; GFX10-NEXT:    s_endpgm
679;
680; GFX11-LABEL: store_load_vindex_kernel:
681; GFX11:       ; %bb.0: ; %bb
682; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
683; GFX11-NEXT:    v_mov_b32_e32 v1, 15
684; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
685; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
686; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
687; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
688; GFX11-NEXT:    s_waitcnt vmcnt(0)
689; GFX11-NEXT:    s_endpgm
690;
691; GFX9-PAL-LABEL: store_load_vindex_kernel:
692; GFX9-PAL:       ; %bb.0: ; %bb
693; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
694; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
695; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
696; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
697; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
698; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
699; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
700; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
701; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
702; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
703; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
704; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
705; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
706; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
707; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
708; GFX9-PAL-NEXT:    s_endpgm
709;
710; GFX940-LABEL: store_load_vindex_kernel:
711; GFX940:       ; %bb.0: ; %bb
712; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
713; GFX940-NEXT:    v_mov_b32_e32 v1, 15
714; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
715; GFX940-NEXT:    s_waitcnt vmcnt(0)
716; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
717; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
718; GFX940-NEXT:    s_waitcnt vmcnt(0)
719; GFX940-NEXT:    s_endpgm
720;
721; GFX10-PAL-LABEL: store_load_vindex_kernel:
722; GFX10-PAL:       ; %bb.0: ; %bb
723; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
724; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
725; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
726; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
728; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
729; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
730; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
731; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
732; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
733; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
734; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
735; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
736; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
737; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
738; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
739; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
740; GFX10-PAL-NEXT:    s_endpgm
741;
742; GFX11-PAL-LABEL: store_load_vindex_kernel:
743; GFX11-PAL:       ; %bb.0: ; %bb
744; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
745; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
746; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
747; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
748; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
749; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
750; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
751; GFX11-PAL-NEXT:    s_endpgm
752; GCN-LABEL: store_load_vindex_kernel:
753; GCN:       ; %bb.0: ; %bb
754; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
755; GCN-NEXT:    v_mov_b32_e32 v1, 15
756; GCN-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
757; GCN-NEXT:    s_waitcnt vmcnt(0)
758; GCN-NEXT:    v_sub_u32_e32 v0, 4, v0
759; GCN-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
760; GCN-NEXT:    s_waitcnt vmcnt(0)
761; GCN-NEXT:    s_endpgm
762bb:
763  %i = alloca [32 x float], align 4, addrspace(5)
764  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
765  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
766  %i3 = zext i32 %i2 to i64
767  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
768  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
769  store volatile i32 15, i32 addrspace(5)* %i8, align 4
770  %i9 = sub nsw i32 31, %i2
771  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
772  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
773  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
774  ret void
775}
776
; Callable (non-kernel) variant of the VGPR-indexed scratch access test.
; A 32-element private (addrspace 5) array is written with a volatile store at
; element %idx and then read back with a volatile load at element (%idx & 15).
; The checks below show two addressing shapes: gfx9/gfx10 fold the stack
; pointer s32 into the VGPR address with v_lshl_add_u32 and use "off" as the
; scalar base, while gfx11/gfx940 keep only the scaled index in the VGPR and
; pass s32 as the scalar base of the scratch instruction.
; Only prefixes that appear on a RUN line are checked here; the check lines are
; produced by utils/update_llc_test_checks.py and should be regenerated rather
; than edited by hand.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
898
; Stores the float constant 10.0 (bit pattern 0x41200000 in the checks) one
; element past a private (addrspace 5) pointer passed in as an argument.
; Every checked target keeps the pointer in v0 and folds the 4-byte element
; offset into the immediate "offset:4" field of the scratch store.
; Only prefixes that appear on a RUN line are checked here; the check lines are
; produced by utils/update_llc_test_checks.py and should be regenerated rather
; than edited by hand.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: private_ptr_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}
970
; Kernel variant of the "small offset" zero-initialisation test.
; A 256-byte [64 x i32] private array (%padding) is allocated ahead of the
; 64-byte [32 x i16] array (%alloca) that is cleared with a 64-byte memset.
; The volatile load from %padding is presumably there to keep the padding
; alloca alive so that %alloca sits at a non-zero scratch offset (TODO confirm
; intent); its GEP index is undef, and the checks show it selected as a load at
; immediate offset 4.
; The memset is expanded into four 16-byte scratch stores at offsets 272, 288,
; 304 and 320. The gfx1010 PAL run differs from the gfx1030 PAL run: it
; materialises a zero scalar base in vcc_lo and inserts s_waitcnt_depctr
; between the stores, where gfx1030 uses "off" directly.
; The check lines are produced by utils/update_llc_test_checks.py and should be
; regenerated rather than edited by hand.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-PAL-NEXT:    s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1172
; Callable (non-kernel) variant of the "small offset" zero-initialisation test.
; A 256-byte [64 x i32] private array (%padding) is allocated ahead of the
; 64-byte [32 x i16] array (%alloca) that is cleared with a 64-byte memset.
; The volatile load from %padding is presumably there to keep the padding
; alloca alive so that %alloca sits at a non-zero frame offset (TODO confirm
; intent); its GEP index is undef, and the checks show it selected as a load
; directly at the stack pointer s32.
; The memset is expanded into four 16-byte scratch stores addressed from s32
; with immediate offsets 256, 272, 288 and 304 on every checked target.
; Only prefixes that appear on a RUN line are checked here; the check lines are
; produced by utils/update_llc_test_checks.py and should be regenerated rather
; than edited by hand.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_small_offset_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_small_offset_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1342
; Scalar-indexed scratch store/load from a compute kernel.
;
; Two private (addrspace(5)) allocas are created: a 64 x i32 padding array and
; the 32 x float array %i that is actually indexed. The padding array is read
; once with a volatile load; the expected code below addresses %i at 0x104.
; %idx is a kernel argument, and the expected code keeps the whole address
; computation in SGPRs (s_lshl_b32 / s_and_b32 / s_addk_i32).
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x104
; GFX11-NEXT:    s_addk_i32 s1, 0x104
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index; this is the only use
  ; of %padding in the function.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1534
; Scalar-indexed scratch store/load from an amdgpu_ps entry point.
;
; Same body as the kernel variant of this test, but %idx arrives as an inreg
; argument instead of being loaded from the kernel argument segment, so the
; expected code contains no s_load of the index. The padding alloca is read
; once with a volatile load, and the expected code addresses %i at 0x104.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x104
; GFX11-NEXT:    s_addk_i32 s1, 0x104
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index; this is the only use
  ; of %padding in the function.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1712
; Vector-indexed scratch store/load from a compute kernel.
;
; The index comes from llvm.amdgcn.workitem.id.x, so the expected code builds
; the addresses in VGPRs. The store goes to %i[id] and the load reads
; %i[31 - id]; in the expected code the latter shows up as a v_sub from 0x104
; plus an immediate offset of 124. The padding alloca is read once with a
; volatile load.
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index; this is the only use
  ; of %padding in the function.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 has no users in this function.
  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to %i[workitem id].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - workitem id].
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1873
; Vector-indexed scratch store/load from an ordinary (non-entry) function.
;
; %idx is a plain i32 argument; the expected code takes it from v0, addresses
; the frame relative to s32 and returns with s_setpc_b64. The padding alloca is
; read once with a volatile load, and the expected code addresses %i at 0x100
; (offset:256) above s32. The store goes to %i[%idx] and the load reads
; %i[%idx & 15].
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding array at an undef index; this is the only use
  ; of %padding in the function.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2020
; Kernel test: zero-fill a 64-byte private (addrspace(5)) array that is
; allocated after a 4096 x i32 (16384-byte) padding array.
;
; What the expected output below shows:
;   * the volatile load from %padding appears as a scratch load at offset:16;
;   * the 64-byte memset becomes four 16-byte scratch stores;
;   * the base of those stores, 0x4010 (16 + 16384), is put into an SGPR
;     (s_movk_i32 into vcc_hi or vcc_lo) and only 0/16/32/48 is used as the
;     immediate offset of the store.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
2021define amdgpu_kernel void @zero_init_large_offset_kernel() {
2022; GFX9-LABEL: zero_init_large_offset_kernel:
2023; GFX9:       ; %bb.0:
2024; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2025; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2026; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2027; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2028; GFX9-NEXT:    s_waitcnt vmcnt(0)
2029; GFX9-NEXT:    s_mov_b32 s0, 0
2030; GFX9-NEXT:    s_mov_b32 s1, s0
2031; GFX9-NEXT:    s_mov_b32 s2, s0
2032; GFX9-NEXT:    s_mov_b32 s3, s0
2033; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2034; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2035; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2036; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2037; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2038; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2039; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2040; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2041; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2042; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2043; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2044; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2045; GFX9-NEXT:    s_endpgm
2046;
2047; GFX10-LABEL: zero_init_large_offset_kernel:
2048; GFX10:       ; %bb.0:
2049; GFX10-NEXT:    s_add_u32 s0, s0, s3
2050; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2051; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2052; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2053; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2054; GFX10-NEXT:    s_waitcnt vmcnt(0)
2055; GFX10-NEXT:    s_mov_b32 s0, 0
2056; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2057; GFX10-NEXT:    s_mov_b32 s1, s0
2058; GFX10-NEXT:    s_mov_b32 s2, s0
2059; GFX10-NEXT:    s_mov_b32 s3, s0
2060; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2061; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2062; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2063; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2064; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2065; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2066; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2067; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2068; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2069; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2070; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2071; GFX10-NEXT:    s_endpgm
2072;
2073; GFX11-LABEL: zero_init_large_offset_kernel:
2074; GFX11:       ; %bb.0:
2075; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2076; GFX11-NEXT:    s_waitcnt vmcnt(0)
2077; GFX11-NEXT:    s_mov_b32 s0, 0
2078; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2079; GFX11-NEXT:    s_mov_b32 s1, s0
2080; GFX11-NEXT:    s_mov_b32 s2, s0
2081; GFX11-NEXT:    s_mov_b32 s3, s0
2082; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2083; GFX11-NEXT:    v_mov_b32_e32 v1, s1
2084; GFX11-NEXT:    v_mov_b32_e32 v2, s2
2085; GFX11-NEXT:    v_mov_b32_e32 v3, s3
2086; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2087; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2088; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2089; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2090; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2091; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2092; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2093; GFX11-NEXT:    s_endpgm
2094;
2095; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
2096; GFX9-PAL:       ; %bb.0:
2097; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2098; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2099; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2100; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2101; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2102; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2104; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2105; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2106; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2107; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2108; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2109; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2110; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2111; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2112; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2113; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2114; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2115; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2116; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2117; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2118; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2119; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2120; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2121; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2122; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2123; GFX9-PAL-NEXT:    s_endpgm
2124;
2125; GFX940-LABEL: zero_init_large_offset_kernel:
2126; GFX940:       ; %bb.0:
2127; GFX940-NEXT:    scratch_load_dword v0, off, off offset:16 sc0 sc1
2128; GFX940-NEXT:    s_waitcnt vmcnt(0)
2129; GFX940-NEXT:    s_mov_b32 s0, 0
2130; GFX940-NEXT:    s_mov_b32 s1, s0
2131; GFX940-NEXT:    s_mov_b32 s2, s0
2132; GFX940-NEXT:    s_mov_b32 s3, s0
2133; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2134; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2135; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2136; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2137; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2138; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2139; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2140; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2141; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2142; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2143; GFX940-NEXT:    s_endpgm
2144;
2145; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
2146; GFX1010-PAL:       ; %bb.0:
2147; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2148; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2149; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2150; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2151; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2152; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2153; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2154; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2155; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2156; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2157; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2158; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
2159; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2160; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2161; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2162; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2163; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2164; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2165; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2166; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2167; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2168; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2169; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2170; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2171; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2172; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2173; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2174; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2175; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2176; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2177; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2178; GFX1010-PAL-NEXT:    s_endpgm
2179;
2180; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
2181; GFX1030-PAL:       ; %bb.0:
2182; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2183; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2184; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2185; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2187; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2188; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2189; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2190; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2191; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2192; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2193; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2194; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2195; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2196; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2197; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2198; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2199; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2200; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2201; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2202; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2203; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2204; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2205; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2206; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2207; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2208; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2209; GFX1030-PAL-NEXT:    s_endpgm
2210;
2211; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
2212; GFX11-PAL:       ; %bb.0:
2213; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2214; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2215; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2216; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2217; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2218; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2219; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2220; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
2221; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
2222; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
2223; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
2224; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2225; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2226; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2227; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2228; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2229; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2230; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2231; GFX11-PAL-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 bytes of padding, declared ahead of the array under test.
2232  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes: the array that gets zeroed.
2233  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index; its result is unused.
2234  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2235  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Non-volatile memset of all 64 bytes of %alloca to zero.
2236  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2237  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2238  ret void
2239}
2240
; Non-kernel variant of the large-offset zero-init test: the IR body is the
; same as the kernel version (16384-byte padding array, then a 64-byte array
; that is memset to zero), but the function is a plain `define void`.
;
; What the expected output below shows:
;   * scratch accesses are relative to s32 (presumably the stack pointer for
;     callable functions -- TODO confirm against the AMDGPU calling convention);
;   * the padding load is at s32 offset:16;
;   * each 16-byte store uses a base computed with
;     `s_add_i32 vcc_hi|vcc_lo, s32, 0x4010` plus an immediate of 0/16/32/48;
;   * the function returns with s_setpc_b64 s[30:31] instead of s_endpgm.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
2241define void @zero_init_large_offset_foo() {
2242; GFX9-LABEL: zero_init_large_offset_foo:
2243; GFX9:       ; %bb.0:
2244; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2245; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2246; GFX9-NEXT:    s_waitcnt vmcnt(0)
2247; GFX9-NEXT:    s_mov_b32 s0, 0
2248; GFX9-NEXT:    s_mov_b32 s1, s0
2249; GFX9-NEXT:    s_mov_b32 s2, s0
2250; GFX9-NEXT:    s_mov_b32 s3, s0
2251; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2252; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2253; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2254; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2255; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2256; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2257; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2258; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2259; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2260; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2261; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2262; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2263; GFX9-NEXT:    s_waitcnt vmcnt(0)
2264; GFX9-NEXT:    s_setpc_b64 s[30:31]
2265;
2266; GFX10-LABEL: zero_init_large_offset_foo:
2267; GFX10:       ; %bb.0:
2268; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2269; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2270; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2271; GFX10-NEXT:    s_waitcnt vmcnt(0)
2272; GFX10-NEXT:    s_mov_b32 s0, 0
2273; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2274; GFX10-NEXT:    s_mov_b32 s1, s0
2275; GFX10-NEXT:    s_mov_b32 s2, s0
2276; GFX10-NEXT:    s_mov_b32 s3, s0
2277; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2278; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2279; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2280; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2281; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2282; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2283; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2284; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2285; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2286; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2287; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2288; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2289; GFX10-NEXT:    s_setpc_b64 s[30:31]
2290;
2291; GFX11-LABEL: zero_init_large_offset_foo:
2292; GFX11:       ; %bb.0:
2293; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2294; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2295; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2296; GFX11-NEXT:    s_waitcnt vmcnt(0)
2297; GFX11-NEXT:    s_mov_b32 s0, 0
2298; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2299; GFX11-NEXT:    s_mov_b32 s1, s0
2300; GFX11-NEXT:    s_mov_b32 s2, s0
2301; GFX11-NEXT:    s_mov_b32 s3, s0
2302; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2303; GFX11-NEXT:    v_mov_b32_e32 v1, s1
2304; GFX11-NEXT:    v_mov_b32_e32 v2, s2
2305; GFX11-NEXT:    v_mov_b32_e32 v3, s3
2306; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2307; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2308; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2309; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2310; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2311; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2312; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2313; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2314; GFX11-NEXT:    s_setpc_b64 s[30:31]
2315;
2316; GFX9-PAL-LABEL: zero_init_large_offset_foo:
2317; GFX9-PAL:       ; %bb.0:
2318; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2319; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2320; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2321; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2322; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2323; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2324; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2325; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2326; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2327; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2328; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2329; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2330; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2331; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2332; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2333; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2334; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2335; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2336; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2337; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2338; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2339;
2340; GFX940-LABEL: zero_init_large_offset_foo:
2341; GFX940:       ; %bb.0:
2342; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2343; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:16 sc0 sc1
2344; GFX940-NEXT:    s_waitcnt vmcnt(0)
2345; GFX940-NEXT:    s_mov_b32 s0, 0
2346; GFX940-NEXT:    s_mov_b32 s1, s0
2347; GFX940-NEXT:    s_mov_b32 s2, s0
2348; GFX940-NEXT:    s_mov_b32 s3, s0
2349; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2350; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2351; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2352; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2353; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2354; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2355; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2356; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2357; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2358; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2359; GFX940-NEXT:    s_waitcnt vmcnt(0)
2360; GFX940-NEXT:    s_setpc_b64 s[30:31]
2361;
2362; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
2363; GFX1010-PAL:       ; %bb.0:
2364; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2365; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2366; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2367; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2368; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2369; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2370; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2371; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2372; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2373; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2374; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2375; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2376; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2377; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2378; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2379; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2380; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2381; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2382; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2383; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2384; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2385; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2386; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2387; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2388; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2389;
2390; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
2391; GFX1030-PAL:       ; %bb.0:
2392; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2394; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2395; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2396; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2397; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2398; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2399; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2400; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2401; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2402; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2403; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2404; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2405; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2406; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2407; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2408; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2409; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2410; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2411; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2412; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2413; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2414;
2415; GFX11-PAL-LABEL: zero_init_large_offset_foo:
2416; GFX11-PAL:       ; %bb.0:
2417; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2418; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2419; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2420; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2421; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2422; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2423; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2424; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2425; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2426; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
2427; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
2428; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
2429; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
2430; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2431; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2432; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2433; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2434; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2435; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2436; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2437; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2438; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 = 16384 bytes of padding, declared ahead of the array under test.
2439  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes: the array that gets zeroed.
2440  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index; its result is unused.
2441  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2442  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Non-volatile memset of all 64 bytes of %alloca to zero.
2443  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2444  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2445  ret void
2446}
2447
; Kernel test: volatile store and volatile load into a 32 x float private
; (addrspace(5)) array using an index taken from the kernel argument %idx,
; with a 4096 x i32 (16384-byte) padding array declared ahead of it.
;
; IR behaviour: stores the i32 constant 15 to element %idx, then loads element
; (%idx & 15); the loaded value is not used.
;
; What the expected output below shows:
;   * %idx is fetched with a scalar load (s_load_dword / s_load_b32);
;   * the padding load is a scratch load at offset:4;
;   * both element addresses are formed with scalar instructions only:
;     `s_lshl_b32 ..., 2` to scale the index by 4, then `s_addk_i32 ..., 0x4004`
;     (4 + 16384) to add the array base, and the result is used as the SGPR
;     operand of the scratch store/load.
; The "sindex" in the name presumably refers to this scalar index.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
2448define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
2449; GFX9-LABEL: store_load_sindex_large_offset_kernel:
2450; GFX9:       ; %bb.0: ; %bb
2451; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
2452; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
2453; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2454; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2455; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2456; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2457; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
2458; GFX9-NEXT:    s_and_b32 s0, s0, 15
2459; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2460; GFX9-NEXT:    s_addk_i32 s1, 0x4004
2461; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2462; GFX9-NEXT:    scratch_store_dword off, v0, s1
2463; GFX9-NEXT:    s_waitcnt vmcnt(0)
2464; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2465; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2466; GFX9-NEXT:    s_waitcnt vmcnt(0)
2467; GFX9-NEXT:    s_endpgm
2468;
2469; GFX10-LABEL: store_load_sindex_large_offset_kernel:
2470; GFX10:       ; %bb.0: ; %bb
2471; GFX10-NEXT:    s_add_u32 s2, s2, s5
2472; GFX10-NEXT:    s_addc_u32 s3, s3, 0
2473; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2474; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2475; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
2476; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2477; GFX10-NEXT:    s_waitcnt vmcnt(0)
2478; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2479; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2480; GFX10-NEXT:    s_and_b32 s1, s0, 15
2481; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2482; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
2483; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2484; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2485; GFX10-NEXT:    scratch_store_dword off, v0, s0
2486; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2487; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2488; GFX10-NEXT:    s_waitcnt vmcnt(0)
2489; GFX10-NEXT:    s_endpgm
2490;
2491; GFX11-LABEL: store_load_sindex_large_offset_kernel:
2492; GFX11:       ; %bb.0: ; %bb
2493; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
2494; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2495; GFX11-NEXT:    s_waitcnt vmcnt(0)
2496; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2497; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX11-NEXT:    s_and_b32 s1, s0, 15
2499; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2500; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2501; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2502; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2503; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2504; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2505; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2506; GFX11-NEXT:    s_waitcnt vmcnt(0)
2507; GFX11-NEXT:    s_endpgm
2508;
2509; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
2510; GFX9-PAL:       ; %bb.0: ; %bb
2511; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
2512; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2513; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2514; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2515; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2516; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2517; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2518; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2519; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2520; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2521; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2522; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2523; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2524; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2525; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2526; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2527; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2528; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2529; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2530; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2531; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2532; GFX9-PAL-NEXT:    s_endpgm
2533;
2534; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2535; GFX940:       ; %bb.0: ; %bb
2536; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
2537; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2538; GFX940-NEXT:    s_waitcnt vmcnt(0)
2539; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2540; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2541; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2542; GFX940-NEXT:    s_and_b32 s0, s0, 15
2543; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2544; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2545; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2546; GFX940-NEXT:    s_waitcnt vmcnt(0)
2547; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2548; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2549; GFX940-NEXT:    s_waitcnt vmcnt(0)
2550; GFX940-NEXT:    s_endpgm
2551;
2552; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2553; GFX1010-PAL:       ; %bb.0: ; %bb
2554; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
2555; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
2556; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2557; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2558; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2559; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
2560; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
2561; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2562; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2563; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2564; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2565; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2566; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2567; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2568; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2569; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2570; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2571; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2572; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2573; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2574; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2575; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2576; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2577; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2578; GFX1010-PAL-NEXT:    s_endpgm
2579;
2580; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2581; GFX1030-PAL:       ; %bb.0: ; %bb
2582; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
2583; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
2584; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2585; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2586; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2587; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
2588; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
2589; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2590; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2591; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2592; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2593; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2594; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2595; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2596; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2597; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2598; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2599; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2600; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2601; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2602; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2603; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2604; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2605; GFX1030-PAL-NEXT:    s_endpgm
2606;
2607; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
2608; GFX11-PAL:       ; %bb.0: ; %bb
2609; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
2610; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2611; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2612; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2613; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2614; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2615; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2616; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2617; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2618; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2619; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2620; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2621; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2622; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2623; GFX11-PAL-NEXT:    s_endpgm
2624bb:
  ; 4096 x i32 = 16384 bytes of padding, declared ahead of the array under test.
2625  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x float = 128 bytes: the array that is indexed by %idx.
2626  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index; its result is unused.
2627  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2628  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no users anywhere in this function.
2629  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the i32 value 15 to element %idx (no masking of %idx).
2630  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2631  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2632  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15); the loaded value is unused.
2633  %i9 = and i32 %idx, 15
2634  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2635  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2636  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2637  ret void
2638}
2639
2640define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
2641; GFX9-LABEL: store_load_sindex_large_offset_foo:
2642; GFX9:       ; %bb.0: ; %bb
2643; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2644; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2645; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2646; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2647; GFX9-NEXT:    s_waitcnt vmcnt(0)
2648; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
2649; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2650; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2651; GFX9-NEXT:    scratch_store_dword off, v0, s0
2652; GFX9-NEXT:    s_waitcnt vmcnt(0)
2653; GFX9-NEXT:    s_and_b32 s0, s2, 15
2654; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2655; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2656; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2657; GFX9-NEXT:    s_waitcnt vmcnt(0)
2658; GFX9-NEXT:    s_endpgm
2659;
2660; GFX10-LABEL: store_load_sindex_large_offset_foo:
2661; GFX10:       ; %bb.0: ; %bb
2662; GFX10-NEXT:    s_add_u32 s0, s0, s3
2663; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2664; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2665; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2666; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2667; GFX10-NEXT:    s_waitcnt vmcnt(0)
2668; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2669; GFX10-NEXT:    s_and_b32 s0, s2, 15
2670; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
2671; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2672; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2673; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2674; GFX10-NEXT:    scratch_store_dword off, v0, s1
2675; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2676; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
2677; GFX10-NEXT:    s_waitcnt vmcnt(0)
2678; GFX10-NEXT:    s_endpgm
2679;
2680; GFX11-LABEL: store_load_sindex_large_offset_foo:
2681; GFX11:       ; %bb.0: ; %bb
2682; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2683; GFX11-NEXT:    s_waitcnt vmcnt(0)
2684; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2685; GFX11-NEXT:    s_and_b32 s1, s0, 15
2686; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2687; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2688; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2689; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2690; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2691; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2692; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2693; GFX11-NEXT:    s_waitcnt vmcnt(0)
2694; GFX11-NEXT:    s_endpgm
2695;
2696; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
2697; GFX9-PAL:       ; %bb.0: ; %bb
2698; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2699; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2700; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2701; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2702; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2703; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2704; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2705; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2706; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2707; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2708; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2709; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2710; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2711; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2712; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2713; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2714; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2715; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2716; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2717; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2718; GFX9-PAL-NEXT:    s_endpgm
2719;
2720; GFX940-LABEL: store_load_sindex_large_offset_foo:
2721; GFX940:       ; %bb.0: ; %bb
2722; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2723; GFX940-NEXT:    s_waitcnt vmcnt(0)
2724; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2725; GFX940-NEXT:    s_and_b32 s0, s0, 15
2726; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2727; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2728; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2729; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2730; GFX940-NEXT:    s_waitcnt vmcnt(0)
2731; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2732; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2733; GFX940-NEXT:    s_waitcnt vmcnt(0)
2734; GFX940-NEXT:    s_endpgm
2735;
2736; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
2737; GFX1010-PAL:       ; %bb.0: ; %bb
2738; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2739; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2740; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2741; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2742; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2743; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2744; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2745; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2746; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2747; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2748; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2749; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2750; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2751; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2752; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2753; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2754; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2755; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2756; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2757; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2758; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2759; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2760; GFX1010-PAL-NEXT:    s_endpgm
2761;
2762; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
2763; GFX1030-PAL:       ; %bb.0: ; %bb
2764; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2765; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2766; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2767; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2768; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2769; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2770; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2771; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2772; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2773; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2774; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2775; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2776; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2777; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2778; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2779; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2780; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2781; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2782; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2783; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2784; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2785; GFX1030-PAL-NEXT:    s_endpgm
2786;
2787; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
2788; GFX11-PAL:       ; %bb.0: ; %bb
2789; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2790; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2791; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2792; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2793; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2794; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2795; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2796; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2797; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2798; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2799; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2800; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2801; GFX11-PAL-NEXT:    s_endpgm
2802bb:
2803  %padding = alloca [4096 x i32], align 4, addrspace(5)
2804  %i = alloca [32 x float], align 4, addrspace(5)
2805  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2806  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
2807  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2808  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2809  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2810  store volatile i32 15, i32 addrspace(5)* %i8, align 4
2811  %i9 = and i32 %idx, 15
2812  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2813  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2814  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2815  ret void
2816}
2817
; Kernel test: scratch store and load indexed by the workitem id, with the
; accessed array placed behind a large padding alloca.
;
; %padding is [4096 x i32] (16384 bytes) and is read once with a volatile
; load; the expected code below addresses %i starting at 0x4004. The store
; targets element workitem.id.x and the load targets element
; (31 - workitem.id.x). In the expected code the latter appears as a subtract
; from 0x4004 combined with an instruction offset of 124 (31 * 4).
2818define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
2819; GFX9-LABEL: store_load_vindex_large_offset_kernel:
2820; GFX9:       ; %bb.0: ; %bb
2821; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2822; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2823; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2824; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2825; GFX9-NEXT:    s_waitcnt vmcnt(0)
2826; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2827; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
2828; GFX9-NEXT:    v_mov_b32_e32 v2, 15
2829; GFX9-NEXT:    scratch_store_dword v1, v2, off
2830; GFX9-NEXT:    s_waitcnt vmcnt(0)
2831; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2832; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2833; GFX9-NEXT:    s_waitcnt vmcnt(0)
2834; GFX9-NEXT:    s_endpgm
2835;
2836; GFX10-LABEL: store_load_vindex_large_offset_kernel:
2837; GFX10:       ; %bb.0: ; %bb
2838; GFX10-NEXT:    s_add_u32 s0, s0, s3
2839; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2840; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2841; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2842; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2843; GFX10-NEXT:    v_mov_b32_e32 v2, 15
2844; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
2845; GFX10-NEXT:    s_waitcnt vmcnt(0)
2846; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2847; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2848; GFX10-NEXT:    scratch_store_dword v1, v2, off
2849; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2850; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2851; GFX10-NEXT:    s_waitcnt vmcnt(0)
2852; GFX10-NEXT:    s_endpgm
2853;
2854; GFX11-LABEL: store_load_vindex_large_offset_kernel:
2855; GFX11:       ; %bb.0: ; %bb
2856; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2857; GFX11-NEXT:    v_mov_b32_e32 v1, 15
2858; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
2859; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
2860; GFX11-NEXT:    s_waitcnt vmcnt(0)
2861; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
2862; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
2863; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2864; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
2865; GFX11-NEXT:    s_waitcnt vmcnt(0)
2866; GFX11-NEXT:    s_endpgm
2867;
2868; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
2869; GFX9-PAL:       ; %bb.0: ; %bb
2870; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2871; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2872; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2873; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2874; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2875; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
2876; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2877; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2878; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2879; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2880; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2881; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2882; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
2883; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
2884; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2885; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2886; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2887; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2888; GFX9-PAL-NEXT:    s_endpgm
2889;
2890; GFX940-LABEL: store_load_vindex_large_offset_kernel:
2891; GFX940:       ; %bb.0: ; %bb
2892; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
2893; GFX940-NEXT:    s_waitcnt vmcnt(0)
2894; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2895; GFX940-NEXT:    v_mov_b32_e32 v1, 15
2896; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
2897; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
2898; GFX940-NEXT:    s_waitcnt vmcnt(0)
2899; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2900; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
2901; GFX940-NEXT:    s_waitcnt vmcnt(0)
2902; GFX940-NEXT:    s_endpgm
2903;
2904; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
2905; GFX1010-PAL:       ; %bb.0: ; %bb
2906; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2907; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2908; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2909; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2911; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2912; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2913; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2914; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2915; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2916; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
2917; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2918; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
2919; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2920; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2921; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2922; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
2923; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2924; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2925; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2926; GFX1010-PAL-NEXT:    s_endpgm
2927;
2928; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
2929; GFX1030-PAL:       ; %bb.0: ; %bb
2930; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2931; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2932; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2933; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2934; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2935; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2936; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2937; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2938; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2939; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2940; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
2941; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
2942; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2943; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2944; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2945; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
2946; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2947; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2948; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2949; GFX1030-PAL-NEXT:    s_endpgm
2950;
2951; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
2952; GFX11-PAL:       ; %bb.0: ; %bb
2953; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2954; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
2955; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4004
2956; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
2957; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2958; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
2959; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
2960; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2961; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
2962; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2963; GFX11-PAL-NEXT:    s_endpgm
2964bb:
  ; Two private (addrspace(5)) allocas: a 16384-byte pad followed by the
  ; 32-element array that is actually indexed.
2965  %padding = alloca [4096 x i32], align 4, addrspace(5)
2966  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of %padding at an undef index.
2967  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2968  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE: %i1 and %i3 below have no users in this function.
2969  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2970  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
2971  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[workitem.id.x].
2972  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
2973  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2974  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - workitem.id.x].
2975  %i9 = sub nsw i32 31, %i2
2976  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2977  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2978  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2979  ret void
2980}
2981
; Non-kernel function test: scratch store and load indexed by the i32 argument
; %idx, with the accessed array placed behind a large padding alloca.
;
; %padding is [4096 x i32] (16384 bytes) and is read once with a volatile
; load; the expected code below addresses %i at s32 + 0x4004. The store
; targets element %idx and the load targets element (%idx & 15).
;
; Only check prefixes that are enabled by a RUN line at the top of the file
; are kept here; check lines under any other prefix are never verified.
2982define void @store_load_vindex_large_offset_foo(i32 %idx) {
2983; GFX9-LABEL: store_load_vindex_large_offset_foo:
2984; GFX9:       ; %bb.0: ; %bb
2985; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2986; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
2987; GFX9-NEXT:    s_waitcnt vmcnt(0)
2988; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2989; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
2990; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2991; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2992; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
2993; GFX9-NEXT:    scratch_store_dword v2, v3, off
2994; GFX9-NEXT:    s_waitcnt vmcnt(0)
2995; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2996; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
2997; GFX9-NEXT:    s_waitcnt vmcnt(0)
2998; GFX9-NEXT:    s_setpc_b64 s[30:31]
2999;
3000; GFX10-LABEL: store_load_vindex_large_offset_foo:
3001; GFX10:       ; %bb.0: ; %bb
3002; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3003; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3004; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
3005; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3006; GFX10-NEXT:    v_mov_b32_e32 v2, 15
3007; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
3008; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3009; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
3010; GFX10-NEXT:    s_waitcnt vmcnt(0)
3011; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
3012; GFX10-NEXT:    scratch_store_dword v0, v2, off
3013; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3014; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
3015; GFX10-NEXT:    s_waitcnt vmcnt(0)
3016; GFX10-NEXT:    s_setpc_b64 s[30:31]
3017;
3018; GFX11-LABEL: store_load_vindex_large_offset_foo:
3019; GFX11:       ; %bb.0: ; %bb
3020; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3021; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3022; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
3023; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3024; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3025; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3026; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3027; GFX11-NEXT:    s_waitcnt vmcnt(0)
3028; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3029; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
3030; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3031; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3032; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
3033; GFX11-NEXT:    s_waitcnt vmcnt(0)
3034; GFX11-NEXT:    s_setpc_b64 s[30:31]
3035;
3036; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
3037; GFX9-PAL:       ; %bb.0: ; %bb
3038; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3039; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
3040; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3041; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3042; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
3043; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
3044; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
3045; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
3046; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
3047; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3048; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3049; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
3050; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3051; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3052;
3053; GFX940-LABEL: store_load_vindex_large_offset_foo:
3054; GFX940:       ; %bb.0: ; %bb
3055; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3056; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
3057; GFX940-NEXT:    s_waitcnt vmcnt(0)
3058; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
3059; GFX940-NEXT:    v_mov_b32_e32 v2, 15
3060; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3061; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
3062; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
3063; GFX940-NEXT:    s_waitcnt vmcnt(0)
3064; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3065; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3066; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
3067; GFX940-NEXT:    s_waitcnt vmcnt(0)
3068; GFX940-NEXT:    s_setpc_b64 s[30:31]
3069;
3070; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
3071; GFX10-PAL:       ; %bb.0: ; %bb
3072; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3073; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3074; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
3075; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3076; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
3077; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
3078; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3079; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
3080; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3081; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
3082; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
3083; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3084; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
3085; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3086; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3087;
3088; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
3089; GFX11-PAL:       ; %bb.0: ; %bb
3090; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3091; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3092; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
3093; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3094; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3095; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3096; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3097; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3098; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3099; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
3100; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3101; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3102; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
3103; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3104; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3121bb:
  ; Two private (addrspace(5)) allocas: a 16384-byte pad followed by the
  ; 32-element array that is actually indexed.
3122  %padding = alloca [4096 x i32], align 4, addrspace(5)
3123  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of %padding at an undef index.
3124  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
3125  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE: %i1 below has no users in this function.
3126  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
3127  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
3128  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
3129  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
3130  %i9 = and i32 %idx, 15
3131  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
3132  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
3133  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
3134  ret void
3135}
3136
; Kernel test: scratch store and load at a large constant element index.
;
; A single [4096 x i32] private alloca is written with volatile stores at an
; undef index (value 13) and at element 4000 (value 15), then element 4000 is
; read back with a volatile load. Element 4000 is byte offset 16000 within the
; array; the expected code below forms that address as a materialized base
; constant plus an instruction offset field, and the split differs between
; targets (for example 0x3000 with offset 3712, or 0x3800 with offset 1664).
3137define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
3138; GFX9-LABEL: store_load_large_imm_offset_kernel:
3139; GFX9:       ; %bb.0: ; %bb
3140; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
3141; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
3142; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3143; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3144; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3145; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
3146; GFX9-NEXT:    s_waitcnt vmcnt(0)
3147; GFX9-NEXT:    s_add_i32 s0, s0, 4
3148; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3149; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3150; GFX9-NEXT:    s_waitcnt vmcnt(0)
3151; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3152; GFX9-NEXT:    s_waitcnt vmcnt(0)
3153; GFX9-NEXT:    s_endpgm
3154;
3155; GFX10-LABEL: store_load_large_imm_offset_kernel:
3156; GFX10:       ; %bb.0: ; %bb
3157; GFX10-NEXT:    s_add_u32 s0, s0, s3
3158; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3159; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
3160; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
3161; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3162; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3163; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3164; GFX10-NEXT:    s_add_i32 s0, s0, 4
3165; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
3166; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3167; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3168; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3169; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3170; GFX10-NEXT:    s_waitcnt vmcnt(0)
3171; GFX10-NEXT:    s_endpgm
3172;
3173; GFX11-LABEL: store_load_large_imm_offset_kernel:
3174; GFX11:       ; %bb.0: ; %bb
3175; GFX11-NEXT:    v_mov_b32_e32 v0, 13
3176; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3000
3177; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3178; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3179; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3180; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3181; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3182; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3183; GFX11-NEXT:    s_waitcnt vmcnt(0)
3184; GFX11-NEXT:    s_endpgm
3185;
3186; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
3187; GFX9-PAL:       ; %bb.0: ; %bb
3188; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
3189; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
3190; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3191; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3192; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3193; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3194; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3195; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3196; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
3197; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3198; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
3199; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3200; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
3201; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3202; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3203; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3204; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3205; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3206; GFX9-PAL-NEXT:    s_endpgm
3207;
3208; GFX940-LABEL: store_load_large_imm_offset_kernel:
3209; GFX940:       ; %bb.0: ; %bb
3210; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3211; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
3212; GFX940-NEXT:    s_waitcnt vmcnt(0)
3213; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3214; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3215; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
3216; GFX940-NEXT:    s_waitcnt vmcnt(0)
3217; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
3218; GFX940-NEXT:    s_waitcnt vmcnt(0)
3219; GFX940-NEXT:    s_endpgm
3220;
3221; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
3222; GFX1010-PAL:       ; %bb.0: ; %bb
3223; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
3224; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
3225; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3226; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3227; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3228; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
3229; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
3230; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3231; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3232; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
3233; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
3234; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
3235; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
3236; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
3237; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
3238; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3239; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3240; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3241; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3242; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3243; GFX1010-PAL-NEXT:    s_endpgm
3244;
3245; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
3246; GFX1030-PAL:       ; %bb.0: ; %bb
3247; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
3248; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
3249; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3250; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3251; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3252; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
3253; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
3254; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3255; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3256; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
3257; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
3258; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
3259; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
3260; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
3261; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3262; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3263; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3264; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3265; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3266; GFX1030-PAL-NEXT:    s_endpgm
3267;
3268; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
3269; GFX11-PAL:       ; %bb.0: ; %bb
3270; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 13
3271; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x3000
3272; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3273; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3274; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3275; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3276; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3277; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3278; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3279; GFX11-PAL-NEXT:    s_endpgm
3280bb:
  ; One private (addrspace(5)) array of 4096 i32 elements.
3281  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index.
3282  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
3283  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000 (byte offset 16000).
3284  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3285  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element 4000.
3286  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3287  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
3288  ret void
3289}
3290
3291define void @store_load_large_imm_offset_foo() {
3292; GFX9-LABEL: store_load_large_imm_offset_foo:
3293; GFX9:       ; %bb.0: ; %bb
3294; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3295; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3296; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3297; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
3298; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
3299; GFX9-NEXT:    s_waitcnt vmcnt(0)
3300; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
3301; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3302; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3303; GFX9-NEXT:    s_waitcnt vmcnt(0)
3304; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3305; GFX9-NEXT:    s_waitcnt vmcnt(0)
3306; GFX9-NEXT:    s_setpc_b64 s[30:31]
3307;
3308; GFX10-LABEL: store_load_large_imm_offset_foo:
3309; GFX10:       ; %bb.0: ; %bb
3310; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3311; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3312; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3313; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3314; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3315; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
3316; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
3317; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
3318; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3319; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3320; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3321; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3322; GFX10-NEXT:    s_waitcnt vmcnt(0)
3323; GFX10-NEXT:    s_setpc_b64 s[30:31]
3324;
3325; GFX11-LABEL: store_load_large_imm_offset_foo:
3326; GFX11:       ; %bb.0: ; %bb
3327; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3328; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3329; GFX11-NEXT:    v_mov_b32_e32 v0, 13
3330; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3000
3331; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3332; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3333; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3334; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3335; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3336; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3337; GFX11-NEXT:    s_waitcnt vmcnt(0)
3338; GFX11-NEXT:    s_setpc_b64 s[30:31]
3339;
3340; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
3341; GFX9-PAL:       ; %bb.0: ; %bb
3342; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3343; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3344; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3345; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
3346; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3347; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3348; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
3349; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3350; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3351; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3352; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3353; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3354; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3355;
3356; GFX940-LABEL: store_load_large_imm_offset_foo:
3357; GFX940:       ; %bb.0: ; %bb
3358; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3359; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3360; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
3361; GFX940-NEXT:    s_waitcnt vmcnt(0)
3362; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3363; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3364; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
3365; GFX940-NEXT:    s_waitcnt vmcnt(0)
3366; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
3367; GFX940-NEXT:    s_waitcnt vmcnt(0)
3368; GFX940-NEXT:    s_setpc_b64 s[30:31]
3369;
3370; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
3371; GFX10-PAL:       ; %bb.0: ; %bb
3372; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3373; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3374; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
3375; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3376; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
3377; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
3378; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
3379; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3380; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3381; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3382; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3383; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3384; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3385; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3386;
3387; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
3388; GFX11-PAL:       ; %bb.0: ; %bb
3389; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3391; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 13
3392; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x3000
3393; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3394; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3395; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3396; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3397; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3398; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3399; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3400; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3401; GCN-LABEL: store_load_large_imm_offset_foo:
3402; GCN:       ; %bb.0: ; %bb
3403; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3404; GCN-NEXT:    v_mov_b32_e32 v0, 13
3405; GCN-NEXT:    scratch_store_dword off, v0, s32 sc0 sc1
3406; GCN-NEXT:    s_waitcnt vmcnt(0)
3407; GCN-NEXT:    v_mov_b32_e32 v0, 0x3000
3408; GCN-NEXT:    v_mov_b32_e32 v1, 15
3409; GCN-NEXT:    scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1
3410; GCN-NEXT:    s_waitcnt vmcnt(0)
3411; GCN-NEXT:    scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1
3412; GCN-NEXT:    s_waitcnt vmcnt(0)
3413; GCN-NEXT:    s_setpc_b64 s[30:31]
3414bb:
3415  %i = alloca [4096 x i32], align 4, addrspace(5)
3416  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
3417  store volatile i32 13, i32 addrspace(5)* %i1, align 4
3418  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3419  store volatile i32 15, i32 addrspace(5)* %i7, align 4
3420  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3421  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
3422  ret void
3423}
3424
; Kernel test: indexes a private (addrspace 5) [32 x i32] alloca with
; %sidx + workitem.id.x + 256, then performs a volatile store of 15 and a
; volatile load at that element. The expected output keeps the constant part
; of the index in the immediate offset field of the scratch instructions
; (1024 bytes, or 1028 where the base of the alloca is folded in as well).
3425define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
3426; GFX9-LABEL: store_load_vidx_sidx_offset:
3427; GFX9:       ; %bb.0: ; %bb
3428; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
3429; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
3430; GFX9-NEXT:    v_mov_b32_e32 v1, 4
3431; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3433; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
3434; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3435; GFX9-NEXT:    v_mov_b32_e32 v1, 15
3436; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
3437; GFX9-NEXT:    s_waitcnt vmcnt(0)
3438; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
3439; GFX9-NEXT:    s_waitcnt vmcnt(0)
3440; GFX9-NEXT:    s_endpgm
3441;
3442; GFX10-LABEL: store_load_vidx_sidx_offset:
3443; GFX10:       ; %bb.0: ; %bb
3444; GFX10-NEXT:    s_add_u32 s2, s2, s5
3445; GFX10-NEXT:    s_addc_u32 s3, s3, 0
3446; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3447; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3448; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
3449; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3450; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3451; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3452; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
3453; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
3454; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3455; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
3456; GFX10-NEXT:    s_waitcnt vmcnt(0)
3457; GFX10-NEXT:    s_endpgm
3458;
3459; GFX11-LABEL: store_load_vidx_sidx_offset:
3460; GFX11:       ; %bb.0: ; %bb
3461; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
3462; GFX11-NEXT:    v_mov_b32_e32 v1, 15
3463; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3464; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
3465; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
3466; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3467; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
3468; GFX11-NEXT:    s_waitcnt vmcnt(0)
3469; GFX11-NEXT:    s_endpgm
3470;
3471; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
3472; GFX9-PAL:       ; %bb.0: ; %bb
3473; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
3474; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
3475; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
3476; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
3477; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
3478; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3479; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
3480; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
3481; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
3482; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
3483; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3484; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
3485; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
3486; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3487; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
3488; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3489; GFX9-PAL-NEXT:    s_endpgm
3490;
3491; GFX940-LABEL: store_load_vidx_sidx_offset:
3492; GFX940:       ; %bb.0: ; %bb
3493; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
3494; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3495; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
3496; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
3497; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
3498; GFX940-NEXT:    s_waitcnt vmcnt(0)
3499; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
3500; GFX940-NEXT:    s_waitcnt vmcnt(0)
3501; GFX940-NEXT:    s_endpgm
3502;
3503; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
3504; GFX10-PAL:       ; %bb.0: ; %bb
3505; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
3506; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
3507; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
3508; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3509; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
3510; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
3511; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
3512; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
3513; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
3514; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
3515; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3516; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3517; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3518; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
3519; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
3520; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3521; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
3522; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3523; GFX10-PAL-NEXT:    s_endpgm
3524;
3525; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
3526; GFX11-PAL:       ; %bb.0: ; %bb
3527; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
3528; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
3529; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3530; GFX11-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
3531; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
3532; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3533; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
3534; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3535; GFX11-PAL-NEXT:    s_endpgm
3547bb:
3548  %alloca = alloca [32 x i32], align 4, addrspace(5)
3549  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
3550  %add1 = add nsw i32 %sidx, %vidx
3551  %add2 = add nsw i32 %add1, 256
3552  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
3553  store volatile i32 15, i32 addrspace(5)* %gep, align 4
3554  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
3555  ret void
3556}
3557
; Stores the i64 constant 15 through the private (addrspace 5) pointer %arg
; with a volatile, 8-byte-aligned store and reads it back with a volatile
; load. Every checked target is expected to use one 64-bit scratch store and
; one 64-bit scratch load.
3558define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
3559; GFX9-LABEL: store_load_i64_aligned:
3560; GFX9:       ; %bb.0: ; %bb
3561; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3562; GFX9-NEXT:    v_mov_b32_e32 v1, 15
3563; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3564; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3565; GFX9-NEXT:    s_waitcnt vmcnt(0)
3566; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
3567; GFX9-NEXT:    s_waitcnt vmcnt(0)
3568; GFX9-NEXT:    s_setpc_b64 s[30:31]
3569;
3570; GFX10-LABEL: store_load_i64_aligned:
3571; GFX10:       ; %bb.0: ; %bb
3572; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3573; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3574; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3575; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3576; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3577; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3578; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
3579; GFX10-NEXT:    s_waitcnt vmcnt(0)
3580; GFX10-NEXT:    s_setpc_b64 s[30:31]
3581;
3582; GFX11-LABEL: store_load_i64_aligned:
3583; GFX11:       ; %bb.0: ; %bb
3584; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3585; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3586; GFX11-NEXT:    v_mov_b32_e32 v1, 15
3587; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3588; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
3589; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3590; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
3591; GFX11-NEXT:    s_waitcnt vmcnt(0)
3592; GFX11-NEXT:    s_setpc_b64 s[30:31]
3593;
3594; GFX9-PAL-LABEL: store_load_i64_aligned:
3595; GFX9-PAL:       ; %bb.0: ; %bb
3596; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3597; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
3598; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
3599; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3600; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3601; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
3602; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3603; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3604;
3605; GFX940-LABEL: store_load_i64_aligned:
3606; GFX940:       ; %bb.0: ; %bb
3607; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3608; GFX940-NEXT:    v_mov_b32_e32 v2, 15
3609; GFX940-NEXT:    v_mov_b32_e32 v3, 0
3610; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
3611; GFX940-NEXT:    s_waitcnt vmcnt(0)
3612; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
3613; GFX940-NEXT:    s_waitcnt vmcnt(0)
3614; GFX940-NEXT:    s_setpc_b64 s[30:31]
3615;
3616; GFX10-PAL-LABEL: store_load_i64_aligned:
3617; GFX10-PAL:       ; %bb.0: ; %bb
3618; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3619; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3620; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3621; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
3622; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3623; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3624; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
3625; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3626; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3627;
3628; GFX11-PAL-LABEL: store_load_i64_aligned:
3629; GFX11-PAL:       ; %bb.0: ; %bb
3630; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3631; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3632; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
3633; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 0
3634; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
3635; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3636; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
3637; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3638; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3649bb:
3650  store volatile i64 15, i64 addrspace(5)* %arg, align 8
3651  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
3652  ret void
3653}
3654
; Same access pattern as the aligned i64 test, but the volatile store and
; load through the private (addrspace 5) pointer %arg are only 1-byte
; aligned. The expected output still uses one 64-bit scratch store and one
; 64-bit scratch load on every checked target.
3655define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
3656; GFX9-LABEL: store_load_i64_unaligned:
3657; GFX9:       ; %bb.0: ; %bb
3658; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3659; GFX9-NEXT:    v_mov_b32_e32 v1, 15
3660; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3661; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3662; GFX9-NEXT:    s_waitcnt vmcnt(0)
3663; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
3664; GFX9-NEXT:    s_waitcnt vmcnt(0)
3665; GFX9-NEXT:    s_setpc_b64 s[30:31]
3666;
3667; GFX10-LABEL: store_load_i64_unaligned:
3668; GFX10:       ; %bb.0: ; %bb
3669; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3670; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3671; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3672; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3673; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3674; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3675; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
3676; GFX10-NEXT:    s_waitcnt vmcnt(0)
3677; GFX10-NEXT:    s_setpc_b64 s[30:31]
3678;
3679; GFX11-LABEL: store_load_i64_unaligned:
3680; GFX11:       ; %bb.0: ; %bb
3681; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3682; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3683; GFX11-NEXT:    v_mov_b32_e32 v1, 15
3684; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3685; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
3686; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3687; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
3688; GFX11-NEXT:    s_waitcnt vmcnt(0)
3689; GFX11-NEXT:    s_setpc_b64 s[30:31]
3690;
3691; GFX9-PAL-LABEL: store_load_i64_unaligned:
3692; GFX9-PAL:       ; %bb.0: ; %bb
3693; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3694; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
3695; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
3696; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3697; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3698; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
3699; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3700; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3701;
3702; GFX940-LABEL: store_load_i64_unaligned:
3703; GFX940:       ; %bb.0: ; %bb
3704; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3705; GFX940-NEXT:    v_mov_b32_e32 v2, 15
3706; GFX940-NEXT:    v_mov_b32_e32 v3, 0
3707; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
3708; GFX940-NEXT:    s_waitcnt vmcnt(0)
3709; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
3710; GFX940-NEXT:    s_waitcnt vmcnt(0)
3711; GFX940-NEXT:    s_setpc_b64 s[30:31]
3712;
3713; GFX10-PAL-LABEL: store_load_i64_unaligned:
3714; GFX10-PAL:       ; %bb.0: ; %bb
3715; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3716; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3717; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3718; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
3719; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3720; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3721; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
3722; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3723; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3724;
3725; GFX11-PAL-LABEL: store_load_i64_unaligned:
3726; GFX11-PAL:       ; %bb.0: ; %bb
3727; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3728; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3729; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
3730; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 0
3731; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
3732; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3733; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
3734; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3735; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3746bb:
3747  store volatile i64 15, i64 addrspace(5)* %arg, align 1
3748  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
3749  ret void
3750}
3751
; Volatile store of the vector <1, 2, 3> and a volatile load of the same
; <3 x i32> through the private (addrspace 5) pointer %arg, both with 1-byte
; alignment. The expected output uses one 96-bit scratch store and one 96-bit
; scratch load on every checked target.
3752define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
3753; GFX9-LABEL: store_load_v3i32_unaligned:
3754; GFX9:       ; %bb.0: ; %bb
3755; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3756; GFX9-NEXT:    v_mov_b32_e32 v1, 1
3757; GFX9-NEXT:    v_mov_b32_e32 v2, 2
3758; GFX9-NEXT:    v_mov_b32_e32 v3, 3
3759; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
3760; GFX9-NEXT:    s_waitcnt vmcnt(0)
3761; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
3762; GFX9-NEXT:    s_waitcnt vmcnt(0)
3763; GFX9-NEXT:    s_setpc_b64 s[30:31]
3764;
3765; GFX10-LABEL: store_load_v3i32_unaligned:
3766; GFX10:       ; %bb.0: ; %bb
3767; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3768; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3769; GFX10-NEXT:    v_mov_b32_e32 v1, 1
3770; GFX10-NEXT:    v_mov_b32_e32 v2, 2
3771; GFX10-NEXT:    v_mov_b32_e32 v3, 3
3772; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
3773; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3774; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
3775; GFX10-NEXT:    s_waitcnt vmcnt(0)
3776; GFX10-NEXT:    s_setpc_b64 s[30:31]
3777;
3778; GFX11-LABEL: store_load_v3i32_unaligned:
3779; GFX11:       ; %bb.0: ; %bb
3780; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3781; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3782; GFX11-NEXT:    v_mov_b32_e32 v1, 1
3783; GFX11-NEXT:    v_mov_b32_e32 v2, 2
3784; GFX11-NEXT:    v_mov_b32_e32 v3, 3
3785; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
3786; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3787; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
3788; GFX11-NEXT:    s_waitcnt vmcnt(0)
3789; GFX11-NEXT:    s_setpc_b64 s[30:31]
3790;
3791; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
3792; GFX9-PAL:       ; %bb.0: ; %bb
3793; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3794; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
3795; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
3796; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
3797; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
3798; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3799; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
3800; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3801; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3802;
3803; GFX940-LABEL: store_load_v3i32_unaligned:
3804; GFX940:       ; %bb.0: ; %bb
3805; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3806; GFX940-NEXT:    v_mov_b32_e32 v2, 1
3807; GFX940-NEXT:    v_mov_b32_e32 v3, 2
3808; GFX940-NEXT:    v_mov_b32_e32 v4, 3
3809; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
3810; GFX940-NEXT:    s_waitcnt vmcnt(0)
3811; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
3812; GFX940-NEXT:    s_waitcnt vmcnt(0)
3813; GFX940-NEXT:    s_setpc_b64 s[30:31]
3814;
3815; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
3816; GFX10-PAL:       ; %bb.0: ; %bb
3817; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3818; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3819; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
3820; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
3821; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
3822; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
3823; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3824; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
3825; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3826; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3827;
3828; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
3829; GFX11-PAL:       ; %bb.0: ; %bb
3830; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3831; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3832; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
3833; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 2
3834; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
3835; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
3836; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3837; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
3838; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3839; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3851bb:
3852  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
3853  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
3854  ret void
3855}
3856
; Volatile store of the vector <1, 2, 3, 4> and a volatile load of the same
; <4 x i32> through the private (addrspace 5) pointer %arg, both with 1-byte
; alignment. The expected output uses one 128-bit scratch store and one
; 128-bit scratch load on every checked target.
3857define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
3858; GFX9-LABEL: store_load_v4i32_unaligned:
3859; GFX9:       ; %bb.0: ; %bb
3860; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3861; GFX9-NEXT:    v_mov_b32_e32 v1, 1
3862; GFX9-NEXT:    v_mov_b32_e32 v2, 2
3863; GFX9-NEXT:    v_mov_b32_e32 v3, 3
3864; GFX9-NEXT:    v_mov_b32_e32 v4, 4
3865; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
3866; GFX9-NEXT:    s_waitcnt vmcnt(0)
3867; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
3868; GFX9-NEXT:    s_waitcnt vmcnt(0)
3869; GFX9-NEXT:    s_setpc_b64 s[30:31]
3870;
3871; GFX10-LABEL: store_load_v4i32_unaligned:
3872; GFX10:       ; %bb.0: ; %bb
3873; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3874; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3875; GFX10-NEXT:    v_mov_b32_e32 v1, 1
3876; GFX10-NEXT:    v_mov_b32_e32 v2, 2
3877; GFX10-NEXT:    v_mov_b32_e32 v3, 3
3878; GFX10-NEXT:    v_mov_b32_e32 v4, 4
3879; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
3880; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3881; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
3882; GFX10-NEXT:    s_waitcnt vmcnt(0)
3883; GFX10-NEXT:    s_setpc_b64 s[30:31]
3884;
3885; GFX11-LABEL: store_load_v4i32_unaligned:
3886; GFX11:       ; %bb.0: ; %bb
3887; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3888; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3889; GFX11-NEXT:    v_mov_b32_e32 v1, 1
3890; GFX11-NEXT:    v_mov_b32_e32 v2, 2
3891; GFX11-NEXT:    v_mov_b32_e32 v3, 3
3892; GFX11-NEXT:    v_mov_b32_e32 v4, 4
3893; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
3894; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3895; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
3896; GFX11-NEXT:    s_waitcnt vmcnt(0)
3897; GFX11-NEXT:    s_setpc_b64 s[30:31]
3898;
3899; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
3900; GFX9-PAL:       ; %bb.0: ; %bb
3901; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3902; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
3903; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
3904; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
3905; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
3906; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
3907; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3908; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
3909; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3910; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3911;
3912; GFX940-LABEL: store_load_v4i32_unaligned:
3913; GFX940:       ; %bb.0: ; %bb
3914; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3915; GFX940-NEXT:    v_mov_b32_e32 v2, 1
3916; GFX940-NEXT:    v_mov_b32_e32 v3, 2
3917; GFX940-NEXT:    v_mov_b32_e32 v4, 3
3918; GFX940-NEXT:    v_mov_b32_e32 v5, 4
3919; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
3920; GFX940-NEXT:    s_waitcnt vmcnt(0)
3921; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
3922; GFX940-NEXT:    s_waitcnt vmcnt(0)
3923; GFX940-NEXT:    s_setpc_b64 s[30:31]
3924;
3925; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
3926; GFX10-PAL:       ; %bb.0: ; %bb
3927; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3928; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3929; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
3930; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
3931; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
3932; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
3933; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
3934; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3935; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
3936; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3937; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3938;
3939; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
3940; GFX11-PAL:       ; %bb.0: ; %bb
3941; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3942; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3943; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
3944; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 2
3945; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
3946; GFX11-PAL-NEXT:    v_mov_b32_e32 v4, 4
3947; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
3948; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3949; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
3950; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3951; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3964bb:
3965  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
3966  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
3967  ret void
3968}
3969
; Volatile byte store of 1 followed by a volatile byte load at %arg - 1 in
; private (addrspace 5) memory. Despite the "i32" in the name, the accessed
; type is i8. The expected output differs per target: the gfx1030 and gfx1100
; runs fold the -1 into the instruction's immediate offset, while the gfx900,
; gfx940 and gfx1010 runs apply it with a separate vector add beforehand.
; That difference is why the two PAL gfx10 runs are checked under their own
; per-CPU prefixes here instead of the shared one.
3970define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
3971; GFX9-LABEL: store_load_i32_negative_unaligned:
3972; GFX9:       ; %bb.0: ; %bb
3973; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3974; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
3975; GFX9-NEXT:    v_mov_b32_e32 v1, 1
3976; GFX9-NEXT:    scratch_store_byte v0, v1, off
3977; GFX9-NEXT:    s_waitcnt vmcnt(0)
3978; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
3979; GFX9-NEXT:    s_waitcnt vmcnt(0)
3980; GFX9-NEXT:    s_setpc_b64 s[30:31]
3981;
3982; GFX10-LABEL: store_load_i32_negative_unaligned:
3983; GFX10:       ; %bb.0: ; %bb
3984; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3985; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3986; GFX10-NEXT:    v_mov_b32_e32 v1, 1
3987; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
3988; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3989; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
3990; GFX10-NEXT:    s_waitcnt vmcnt(0)
3991; GFX10-NEXT:    s_setpc_b64 s[30:31]
3992;
3993; GFX11-LABEL: store_load_i32_negative_unaligned:
3994; GFX11:       ; %bb.0: ; %bb
3995; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3996; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3997; GFX11-NEXT:    v_mov_b32_e32 v1, 1
3998; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
3999; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4000; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4001; GFX11-NEXT:    s_waitcnt vmcnt(0)
4002; GFX11-NEXT:    s_setpc_b64 s[30:31]
4003;
4004; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
4005; GFX9-PAL:       ; %bb.0: ; %bb
4006; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4007; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
4008; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4009; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4010; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4011; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4012; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4013; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4014;
4015; GFX940-LABEL: store_load_i32_negative_unaligned:
4016; GFX940:       ; %bb.0: ; %bb
4017; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4018; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
4019; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4020; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
4021; GFX940-NEXT:    s_waitcnt vmcnt(0)
4022; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
4023; GFX940-NEXT:    s_waitcnt vmcnt(0)
4024; GFX940-NEXT:    s_setpc_b64 s[30:31]
4025;
4026; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
4027; GFX1010-PAL:       ; %bb.0: ; %bb
4028; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4029; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4030; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
4031; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4032; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
4033; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4034; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
4035; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4036; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4037;
4038; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
4039; GFX1030-PAL:       ; %bb.0: ; %bb
4040; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4041; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4042; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4043; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
4044; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4045; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
4046; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4047; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4048;
4049; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
4050; GFX11-PAL:       ; %bb.0: ; %bb
4051; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4052; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4053; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
4054; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
4055; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4056; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4057; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4058; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4059bb:
4060  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
4061  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
4062  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
4063  ret void
4064}
4065
; Stores the byte value 1 and then reloads it, both as volatile accesses,
; through %arg displaced by -4225 bytes (an i8 getelementptr with index -4225).
; The expectations below record how each target splits that displacement
; between an address computation and the scratch instruction's offset field
;   gfx900 (both triples)        v_add of 0xffffef7f (-4225), no offset field
;   gfx1030, gfx1100 (both)      v_add of 0xfffff000 (-4096) plus offset -129
;   gfx1010 (amdpal)             v_add of 0xffffefff (-4097) plus offset -128
;   gfx940                       s_movk_i32 s0, 0xef7f with s0 as the last
;                                address operand of the store and load
; NOTE(review) despite "i32" in the function name, the accesses are i8.
4066define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
4067; GFX9-LABEL: store_load_i32_large_negative_unaligned:
4068; GFX9:       ; %bb.0: ; %bb
4069; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4070; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4071; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4072; GFX9-NEXT:    scratch_store_byte v0, v1, off
4073; GFX9-NEXT:    s_waitcnt vmcnt(0)
4074; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
4075; GFX9-NEXT:    s_waitcnt vmcnt(0)
4076; GFX9-NEXT:    s_setpc_b64 s[30:31]
4077;
4078; GFX10-LABEL: store_load_i32_large_negative_unaligned:
4079; GFX10:       ; %bb.0: ; %bb
4080; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4081; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4082; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4083; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4084; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
4085; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4086; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4087; GFX10-NEXT:    s_waitcnt vmcnt(0)
4088; GFX10-NEXT:    s_setpc_b64 s[30:31]
4089;
4090; GFX11-LABEL: store_load_i32_large_negative_unaligned:
4091; GFX11:       ; %bb.0: ; %bb
4092; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4093; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4094; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4095; GFX11-NEXT:    v_mov_b32_e32 v1, 1
4096; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4097; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4098; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4099; GFX11-NEXT:    s_waitcnt vmcnt(0)
4100; GFX11-NEXT:    s_setpc_b64 s[30:31]
4101;
4102; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
4103; GFX9-PAL:       ; %bb.0: ; %bb
4104; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4105; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4106; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4107; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4108; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4109; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4110; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4111; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4112;
4113; GFX940-LABEL: store_load_i32_large_negative_unaligned:
4114; GFX940:       ; %bb.0: ; %bb
4115; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4116; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
4117; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4118; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
4119; GFX940-NEXT:    s_waitcnt vmcnt(0)
4120; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
4121; GFX940-NEXT:    s_waitcnt vmcnt(0)
4122; GFX940-NEXT:    s_setpc_b64 s[30:31]
4123;
4124; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
4125; GFX1010-PAL:       ; %bb.0: ; %bb
4126; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4127; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4128; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
4129; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4130; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
4131; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4132; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
4133; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4134; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4135;
4136; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
4137; GFX1030-PAL:       ; %bb.0: ; %bb
4138; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4139; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4140; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4141; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4142; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
4143; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4144; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4145; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4146; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4147;
4148; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
4149; GFX11-PAL:       ; %bb.0: ; %bb
4150; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4151; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4152; GFX11-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4153; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
4154; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4155; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4156; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4157; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4158; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4159bb:
  ; Byte address 4225 below %arg; this constant is what the expectations above
  ; decompose into an add plus an optional instruction offset.
4160  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  ; Volatile byte store followed by a volatile reload of the same address.
  ; %load has no users, yet every expectation block still contains the
  ; scratch load, so both accesses are checked.
4161  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
4162  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
4163  ret void
4164}
4165
; amdgpu_ps entry point with two addrspace(5) allocas of [128 x <4 x i32>]
; (128 * 16 = 2048 bytes each, 16-byte aligned). A volatile <4 x i32> zero
; store and reload target element 60 of %alloca2, i.e. 60 * 16 = 960 = 0x3c0
; bytes into it, and the address of each alloca is passed to an inline asm.
; In the expectations the asm operands are materialized as 16 (%alloca) and
; 0x810 (%alloca2, 16 + 2048), and the store/load displacement is
; 0x810 + 0x3c0 = 3024. That displacement appears as an immediate offset of
; 3024 for gfx900, gfx940 and gfx1100, while the gfx1010/gfx1030 output builds
; it in s0 with s_movk_i32 0x810 followed by s_addk_i32 0x3c0.
; NOTE(review) presumably 3024 exceeds the immediate offset range on
; gfx1010/gfx1030; confirm against the ISA documentation.
4166define amdgpu_ps void @large_offset() {
4167; GFX9-LABEL: large_offset:
4168; GFX9:       ; %bb.0: ; %bb
4169; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
4170; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4171; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
4172; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4173; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4174; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4175; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4176; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4177; GFX9-NEXT:    s_waitcnt vmcnt(0)
4178; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4179; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4180; GFX9-NEXT:    s_waitcnt vmcnt(0)
4181; GFX9-NEXT:    v_mov_b32_e32 v0, 16
4182; GFX9-NEXT:    ;;#ASMSTART
4183; GFX9-NEXT:    ; use v0
4184; GFX9-NEXT:    ;;#ASMEND
4185; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
4186; GFX9-NEXT:    ;;#ASMSTART
4187; GFX9-NEXT:    ; use v0
4188; GFX9-NEXT:    ;;#ASMEND
4189; GFX9-NEXT:    s_endpgm
4190;
4191; GFX10-LABEL: large_offset:
4192; GFX10:       ; %bb.0: ; %bb
4193; GFX10-NEXT:    s_add_u32 s0, s0, s2
4194; GFX10-NEXT:    s_addc_u32 s1, s1, 0
4195; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
4196; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
4197; GFX10-NEXT:    v_mov_b32_e32 v0, 0
4198; GFX10-NEXT:    s_movk_i32 s0, 0x810
4199; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
4200; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4201; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4202; GFX10-NEXT:    v_mov_b32_e32 v3, v0
4203; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4204; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4205; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4206; GFX10-NEXT:    s_waitcnt vmcnt(0)
4207; GFX10-NEXT:    v_mov_b32_e32 v0, 16
4208; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
4209; GFX10-NEXT:    ;;#ASMSTART
4210; GFX10-NEXT:    ; use v0
4211; GFX10-NEXT:    ;;#ASMEND
4212; GFX10-NEXT:    ;;#ASMSTART
4213; GFX10-NEXT:    ; use v1
4214; GFX10-NEXT:    ;;#ASMEND
4215; GFX10-NEXT:    s_endpgm
4216;
4217; GFX11-LABEL: large_offset:
4218; GFX11:       ; %bb.0: ; %bb
4219; GFX11-NEXT:    v_mov_b32_e32 v0, 0
4220; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4221; GFX11-NEXT:    v_mov_b32_e32 v2, v0
4222; GFX11-NEXT:    v_mov_b32_e32 v3, v0
4223; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4224; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4225; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4226; GFX11-NEXT:    s_waitcnt vmcnt(0)
4227; GFX11-NEXT:    v_mov_b32_e32 v0, 16
4228; GFX11-NEXT:    v_mov_b32_e32 v1, 0x810
4229; GFX11-NEXT:    ;;#ASMSTART
4230; GFX11-NEXT:    ; use v0
4231; GFX11-NEXT:    ;;#ASMEND
4232; GFX11-NEXT:    ;;#ASMSTART
4233; GFX11-NEXT:    ; use v1
4234; GFX11-NEXT:    ;;#ASMEND
4235; GFX11-NEXT:    s_endpgm
4236;
4237; GFX9-PAL-LABEL: large_offset:
4238; GFX9-PAL:       ; %bb.0: ; %bb
4239; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
4240; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
4241; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4242; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
4243; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
4244; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
4245; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
4246; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4247; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4248; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
4249; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
4250; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4251; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4252; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4253; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4254; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4255; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4256; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
4257; GFX9-PAL-NEXT:    ;;#ASMSTART
4258; GFX9-PAL-NEXT:    ; use v0
4259; GFX9-PAL-NEXT:    ;;#ASMEND
4260; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
4261; GFX9-PAL-NEXT:    ;;#ASMSTART
4262; GFX9-PAL-NEXT:    ; use v0
4263; GFX9-PAL-NEXT:    ;;#ASMEND
4264; GFX9-PAL-NEXT:    s_endpgm
4265;
4266; GFX940-LABEL: large_offset:
4267; GFX940:       ; %bb.0: ; %bb
4268; GFX940-NEXT:    v_mov_b32_e32 v0, 0
4269; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4270; GFX940-NEXT:    v_mov_b32_e32 v2, v0
4271; GFX940-NEXT:    v_mov_b32_e32 v3, v0
4272; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
4273; GFX940-NEXT:    s_waitcnt vmcnt(0)
4274; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
4275; GFX940-NEXT:    s_waitcnt vmcnt(0)
4276; GFX940-NEXT:    v_mov_b32_e32 v0, 16
4277; GFX940-NEXT:    ;;#ASMSTART
4278; GFX940-NEXT:    ; use v0
4279; GFX940-NEXT:    ;;#ASMEND
4280; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
4281; GFX940-NEXT:    ;;#ASMSTART
4282; GFX940-NEXT:    ; use v0
4283; GFX940-NEXT:    ;;#ASMEND
4284; GFX940-NEXT:    s_endpgm
4285;
4286; GFX10-PAL-LABEL: large_offset:
4287; GFX10-PAL:       ; %bb.0: ; %bb
4288; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
4289; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
4290; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4291; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4292; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4293; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
4294; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
4295; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4296; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4297; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
4298; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
4299; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
4300; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
4301; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
4302; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
4303; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4304; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4305; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4306; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4307; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
4308; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
4309; GFX10-PAL-NEXT:    ;;#ASMSTART
4310; GFX10-PAL-NEXT:    ; use v0
4311; GFX10-PAL-NEXT:    ;;#ASMEND
4312; GFX10-PAL-NEXT:    ;;#ASMSTART
4313; GFX10-PAL-NEXT:    ; use v1
4314; GFX10-PAL-NEXT:    ;;#ASMEND
4315; GFX10-PAL-NEXT:    s_endpgm
4316;
4317; GFX11-PAL-LABEL: large_offset:
4318; GFX11-PAL:       ; %bb.0: ; %bb
4319; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
4320; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
4321; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
4322; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
4323; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4324; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4325; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4326; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4327; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 16
4328; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
4329; GFX11-PAL-NEXT:    ;;#ASMSTART
4330; GFX11-PAL-NEXT:    ; use v0
4331; GFX11-PAL-NEXT:    ;;#ASMEND
4332; GFX11-PAL-NEXT:    ;;#ASMSTART
4333; GFX11-PAL-NEXT:    ; use v1
4334; GFX11-PAL-NEXT:    ;;#ASMEND
4335; GFX11-PAL-NEXT:    s_endpgm
4336bb:
  ; Two 2048-byte arrays; only %alloca2 is accessed by the store and load
  ; below, %alloca is referenced solely by the first inline asm.
4337  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
4338  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  ; Element 60 of %alloca2, 960 (0x3c0) bytes past its start.
4339  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
  ; Volatile 16-byte zero store and volatile reload of the same element;
  ; %load has no users.
4340  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
4341  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
  ; Side-effecting inline asm whose template is only the comment "; use $0";
  ; each call takes one alloca's address as its operand, which is where the
  ; 16 and 0x810 values in the expectations come from. The attribute group #0
  ; is defined outside this function.
4342  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
4343  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
4344  ret void
4345}
4346
; Intrinsic declarations. The memset variant takes an i8 addrspace(5)*
; destination and an i64 length; the second intrinsic takes no arguments and
; returns an i32. Presumably both are called by tests earlier in this file
; (TODO confirm).
4347declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
4348declare i32 @llvm.amdgcn.workitem.id.x()
4349