1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
6; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
7; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
8; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
9; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
10
; zero_init_kernel: amdgpu_kernel entry point that allocates a [32 x i16]
; object in addrspace(5) and zero-fills all 64 bytes of it with llvm.memset.
; For every RUN configuration the checks below expect the fill to show up as
; four 16-byte scratch stores at offsets 64, 48, 32 and 16.
11define amdgpu_kernel void @zero_init_kernel() {
12; GFX9-LABEL: zero_init_kernel:
13; GFX9:       ; %bb.0:
14; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
15; GFX9-NEXT:    s_mov_b32 s0, 0
16; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
17; GFX9-NEXT:    s_mov_b32 s1, s0
18; GFX9-NEXT:    s_mov_b32 s2, s0
19; GFX9-NEXT:    s_mov_b32 s3, s0
20; GFX9-NEXT:    v_mov_b32_e32 v0, s0
21; GFX9-NEXT:    v_mov_b32_e32 v1, s1
22; GFX9-NEXT:    v_mov_b32_e32 v2, s2
23; GFX9-NEXT:    v_mov_b32_e32 v3, s3
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
28; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
29; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
30; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
31; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
32; GFX9-NEXT:    s_endpgm
33;
34; GFX10-LABEL: zero_init_kernel:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    s_add_u32 s0, s0, s3
37; GFX10-NEXT:    s_addc_u32 s1, s1, 0
38; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
39; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
40; GFX10-NEXT:    s_mov_b32 s0, 0
41; GFX10-NEXT:    s_mov_b32 s1, s0
42; GFX10-NEXT:    s_mov_b32 s2, s0
43; GFX10-NEXT:    s_mov_b32 s3, s0
44; GFX10-NEXT:    v_mov_b32_e32 v0, s0
45; GFX10-NEXT:    v_mov_b32_e32 v1, s1
46; GFX10-NEXT:    v_mov_b32_e32 v2, s2
47; GFX10-NEXT:    v_mov_b32_e32 v3, s3
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
49; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
50; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
51; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
52; GFX10-NEXT:    s_endpgm
53;
54; GFX11-LABEL: zero_init_kernel:
55; GFX11:       ; %bb.0:
56; GFX11-NEXT:    s_mov_b32 s0, 0
57; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
58; GFX11-NEXT:    s_mov_b32 s1, s0
59; GFX11-NEXT:    s_mov_b32 s2, s0
60; GFX11-NEXT:    s_mov_b32 s3, s0
61; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
62; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
63; GFX11-NEXT:    s_clause 0x3
64; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
65; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
66; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
67; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
68; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
69; GFX11-NEXT:    s_endpgm
70;
71; GFX9-PAL-LABEL: zero_init_kernel:
72; GFX9-PAL:       ; %bb.0:
73; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
74; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
75; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
76; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
77; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
78; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
80; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
81; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
82; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
83; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
84; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
85; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
86; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
87; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
88; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
89; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
90; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
91; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
92; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
93; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
94; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
95; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
96; GFX9-PAL-NEXT:    s_endpgm
97;
98; GFX940-LABEL: zero_init_kernel:
99; GFX940:       ; %bb.0:
100; GFX940-NEXT:    s_mov_b32 s0, 0
101; GFX940-NEXT:    s_mov_b32 s1, s0
102; GFX940-NEXT:    s_mov_b32 s2, s0
103; GFX940-NEXT:    s_mov_b32 s3, s0
104; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
105; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
106; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
107; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
108; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
109; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
110; GFX940-NEXT:    s_endpgm
111;
112; GFX1010-PAL-LABEL: zero_init_kernel:
113; GFX1010-PAL:       ; %bb.0:
114; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
115; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
116; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
117; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
119; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
120; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
121; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
122; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
123; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
124; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
125; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
126; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
127; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
128; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
129; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
130; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
131; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
132; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
133; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
134; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
135; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
136; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
137; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
138; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
139; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
140; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
141; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
142; GFX1010-PAL-NEXT:    s_endpgm
143;
144; GFX1030-PAL-LABEL: zero_init_kernel:
145; GFX1030-PAL:       ; %bb.0:
146; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
147; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
148; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
149; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
151; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
152; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
153; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
154; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
155; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
156; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
157; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
158; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
159; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
160; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
161; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
162; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
163; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
164; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
165; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
166; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
167; GFX1030-PAL-NEXT:    s_endpgm
168;
169; GFX11-PAL-LABEL: zero_init_kernel:
170; GFX11-PAL:       ; %bb.0:
171; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
172; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
173; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
174; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
175; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
176; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
177; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
178; GFX11-PAL-NEXT:    s_clause 0x3
179; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
180; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
181; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
182; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
183; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
184; GFX11-PAL-NEXT:    s_endpgm
; 32 x i16 = 64 bytes, allocated in addrspace(5) with 2-byte alignment.
185  %alloca = alloca [32 x i16], align 2, addrspace(5)
; View the array as i8* so it can be passed to the memset intrinsic.
186  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Non-volatile (i1 false) fill of the whole 64-byte object with zero bytes.
187  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
188  ret void
189}
190
; zero_init_foo: same body as the kernel variant (64-byte addrspace(5) alloca
; zero-filled by llvm.memset), but defined as a plain `define void` function
; with no calling-convention keyword. The checks below expect four 16-byte
; scratch stores addressed from s32 at offsets 48, 32, 16 and 0, followed by
; a return through s_setpc_b64 s[30:31].
191define void @zero_init_foo() {
192; GFX9-LABEL: zero_init_foo:
193; GFX9:       ; %bb.0:
194; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX9-NEXT:    s_mov_b32 s0, 0
196; GFX9-NEXT:    s_mov_b32 s1, s0
197; GFX9-NEXT:    s_mov_b32 s2, s0
198; GFX9-NEXT:    s_mov_b32 s3, s0
199; GFX9-NEXT:    v_mov_b32_e32 v0, s0
200; GFX9-NEXT:    v_mov_b32_e32 v1, s1
201; GFX9-NEXT:    v_mov_b32_e32 v2, s2
202; GFX9-NEXT:    v_mov_b32_e32 v3, s3
203; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
204; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
205; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
206; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
207; GFX9-NEXT:    s_waitcnt vmcnt(0)
208; GFX9-NEXT:    s_setpc_b64 s[30:31]
209;
210; GFX10-LABEL: zero_init_foo:
211; GFX10:       ; %bb.0:
212; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
214; GFX10-NEXT:    s_mov_b32 s0, 0
215; GFX10-NEXT:    s_mov_b32 s1, s0
216; GFX10-NEXT:    s_mov_b32 s2, s0
217; GFX10-NEXT:    s_mov_b32 s3, s0
218; GFX10-NEXT:    v_mov_b32_e32 v0, s0
219; GFX10-NEXT:    v_mov_b32_e32 v1, s1
220; GFX10-NEXT:    v_mov_b32_e32 v2, s2
221; GFX10-NEXT:    v_mov_b32_e32 v3, s3
222; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
223; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
224; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
225; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
226; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
227; GFX10-NEXT:    s_setpc_b64 s[30:31]
228;
229; GFX11-LABEL: zero_init_foo:
230; GFX11:       ; %bb.0:
231; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
232; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
233; GFX11-NEXT:    s_mov_b32 s0, 0
234; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
235; GFX11-NEXT:    s_mov_b32 s1, s0
236; GFX11-NEXT:    s_mov_b32 s2, s0
237; GFX11-NEXT:    s_mov_b32 s3, s0
238; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
239; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
240; GFX11-NEXT:    s_clause 0x3
241; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
242; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
243; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
244; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
245; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
246; GFX11-NEXT:    s_setpc_b64 s[30:31]
247;
248; GFX9-PAL-LABEL: zero_init_foo:
249; GFX9-PAL:       ; %bb.0:
250; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
252; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
253; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
254; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
255; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
256; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
257; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
258; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
259; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
260; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
261; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
262; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
263; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
264; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX940-LABEL: zero_init_foo:
267; GFX940:       ; %bb.0:
268; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX940-NEXT:    s_mov_b32 s0, 0
270; GFX940-NEXT:    s_mov_b32 s1, s0
271; GFX940-NEXT:    s_mov_b32 s2, s0
272; GFX940-NEXT:    s_mov_b32 s3, s0
273; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
274; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
275; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
276; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
277; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
278; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
279; GFX940-NEXT:    s_waitcnt vmcnt(0)
280; GFX940-NEXT:    s_setpc_b64 s[30:31]
281;
282; GFX10-PAL-LABEL: zero_init_foo:
283; GFX10-PAL:       ; %bb.0:
284; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
286; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
287; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
288; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
289; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
290; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
291; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
292; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
293; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
294; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
295; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
296; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
297; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
298; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
299; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX11-PAL-LABEL: zero_init_foo:
302; GFX11-PAL:       ; %bb.0:
303; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
305; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
306; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
307; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
308; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
309; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
310; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
311; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
312; GFX11-PAL-NEXT:    s_clause 0x3
313; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
314; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
315; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
316; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
317; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
318; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; NOTE(review) - none of the RUN lines at the top of this file enables a
; "GCN" check prefix, so FileCheck never evaluates the GCN-prefixed lines
; below. They list the same instruction sequence as the GFX940 block of this
; function; presumably they are stale leftovers from an earlier revision of
; the test. TODO confirm, then delete them or regenerate the assertions.
319; GCN-LABEL: zero_init_foo:
320; GCN:       ; %bb.0:
321; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GCN-NEXT:    s_mov_b32 s0, 0
323; GCN-NEXT:    s_mov_b32 s1, s0
324; GCN-NEXT:    s_mov_b32 s2, s0
325; GCN-NEXT:    s_mov_b32 s3, s0
326; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
327; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
328; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
329; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
330; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
331; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
332; GCN-NEXT:    s_waitcnt vmcnt(0)
333; GCN-NEXT:    s_setpc_b64 s[30:31]
; 32 x i16 = 64 bytes, allocated in addrspace(5) with 2-byte alignment.
334  %alloca = alloca [32 x i16], align 2, addrspace(5)
; View the array as i8* so it can be passed to the memset intrinsic.
335  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Non-volatile (i1 false) fill of the whole 64-byte object with zero bytes.
336  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
337  ret void
338}
339
; store_load_sindex_kernel: amdgpu_kernel taking an i32 %idx. It performs a
; volatile store of 15 to element %idx of a [32 x float] addrspace(5) array,
; then a volatile load from element (%idx & 15) of the same array. In the
; checks below both scratch accesses take their address from an SGPR (the
; VGPR address operand is `off`), with the index fetched by a scalar load.
340define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
341; GFX9-LABEL: store_load_sindex_kernel:
342; GFX9:       ; %bb.0: ; %bb
343; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
344; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
345; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
346; GFX9-NEXT:    v_mov_b32_e32 v0, 15
347; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
349; GFX9-NEXT:    s_and_b32 s0, s0, 15
350; GFX9-NEXT:    s_add_i32 s1, s1, 4
351; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
352; GFX9-NEXT:    scratch_store_dword off, v0, s1
353; GFX9-NEXT:    s_waitcnt vmcnt(0)
354; GFX9-NEXT:    s_add_i32 s0, s0, 4
355; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
356; GFX9-NEXT:    s_waitcnt vmcnt(0)
357; GFX9-NEXT:    s_endpgm
358;
359; GFX10-LABEL: store_load_sindex_kernel:
360; GFX10:       ; %bb.0: ; %bb
361; GFX10-NEXT:    s_add_u32 s2, s2, s5
362; GFX10-NEXT:    s_addc_u32 s3, s3, 0
363; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
364; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
365; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
366; GFX10-NEXT:    v_mov_b32_e32 v0, 15
367; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX10-NEXT:    s_and_b32 s1, s0, 15
369; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
370; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
371; GFX10-NEXT:    s_add_i32 s0, s0, 4
372; GFX10-NEXT:    s_add_i32 s1, s1, 4
373; GFX10-NEXT:    scratch_store_dword off, v0, s0
374; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
375; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
376; GFX10-NEXT:    s_waitcnt vmcnt(0)
377; GFX10-NEXT:    s_endpgm
378;
379; GFX11-LABEL: store_load_sindex_kernel:
380; GFX11:       ; %bb.0: ; %bb
381; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
382; GFX11-NEXT:    v_mov_b32_e32 v0, 15
383; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX11-NEXT:    s_and_b32 s1, s0, 15
385; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
386; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
387; GFX11-NEXT:    s_add_i32 s0, s0, 4
388; GFX11-NEXT:    s_add_i32 s1, s1, 4
389; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
390; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
391; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
392; GFX11-NEXT:    s_waitcnt vmcnt(0)
393; GFX11-NEXT:    s_endpgm
394;
395; GFX9-PAL-LABEL: store_load_sindex_kernel:
396; GFX9-PAL:       ; %bb.0: ; %bb
397; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
398; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
399; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
400; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
401; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
402; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
404; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
405; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
406; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
407; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
408; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
409; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
410; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
411; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
412; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
413; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
414; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
415; GFX9-PAL-NEXT:    s_endpgm
416;
417; GFX940-LABEL: store_load_sindex_kernel:
418; GFX940:       ; %bb.0: ; %bb
419; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
420; GFX940-NEXT:    v_mov_b32_e32 v0, 15
421; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
423; GFX940-NEXT:    s_and_b32 s0, s0, 15
424; GFX940-NEXT:    s_add_i32 s1, s1, 4
425; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
426; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
427; GFX940-NEXT:    s_waitcnt vmcnt(0)
428; GFX940-NEXT:    s_add_i32 s0, s0, 4
429; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
430; GFX940-NEXT:    s_waitcnt vmcnt(0)
431; GFX940-NEXT:    s_endpgm
432;
433; GFX10-PAL-LABEL: store_load_sindex_kernel:
434; GFX10-PAL:       ; %bb.0: ; %bb
435; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
436; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
437; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
438; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
440; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
441; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
442; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
443; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
444; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
445; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
446; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
448; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
449; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
450; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
451; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
452; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
453; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
454; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
455; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
456; GFX10-PAL-NEXT:    s_endpgm
457;
458; GFX11-PAL-LABEL: store_load_sindex_kernel:
459; GFX11-PAL:       ; %bb.0: ; %bb
460; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
461; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
462; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
464; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
465; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
466; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
467; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
468; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
469; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
470; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
471; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
472; GFX11-PAL-NEXT:    s_endpgm
;
; NOTE(review) - none of the RUN lines at the top of this file enables a
; "GCN" check prefix, so FileCheck never evaluates the GCN-prefixed lines
; below. They also disagree with the GFX940 block of this function (s_add_u32
; with the constant first and a different instruction order, versus
; s_add_i32), so presumably they are stale leftovers from an earlier revision
; of the test. TODO confirm, then delete them or regenerate the assertions.
473; GCN-LABEL: store_load_sindex_kernel:
474; GCN:       ; %bb.0: ; %bb
475; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
476; GCN-NEXT:    v_mov_b32_e32 v0, 15
477; GCN-NEXT:    s_waitcnt lgkmcnt(0)
478; GCN-NEXT:    s_lshl_b32 s1, s0, 2
479; GCN-NEXT:    s_and_b32 s0, s0, 15
480; GCN-NEXT:    s_lshl_b32 s0, s0, 2
481; GCN-NEXT:    s_add_u32 s1, 4, s1
482; GCN-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
483; GCN-NEXT:    s_waitcnt vmcnt(0)
484; GCN-NEXT:    s_add_u32 s0, 4, s0
485; GCN-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
486; GCN-NEXT:    s_waitcnt vmcnt(0)
487; GCN-NEXT:    s_endpgm
488bb:
; 32 x float = 128-byte array in addrspace(5).
489  %i = alloca [32 x float], align 4, addrspace(5)
; %i1 is not referenced by any later instruction in this function.
490  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store side: element %idx, unmasked. Volatile so the access must be emitted.
491  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
492  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
493  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (%idx & 15). The loaded value %i12 is otherwise unused;
; the load is volatile so it is still emitted.
494  %i9 = and i32 %idx, 15
495  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
496  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
497  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
498  ret void
499}
500
; store_load_sindex_foo: amdgpu_ps variant of the scalar-index test. %idx is
; passed as an `inreg` i32 argument instead of a kernel argument. The body is
; the same: a volatile store of 15 to element %idx of a [32 x float]
; addrspace(5) array, then a volatile load from element (%idx & 15). In the
; checks below both scratch accesses take their address from an SGPR (the
; VGPR address operand is `off`) and no scalar load of the index is expected.
501define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
502; GFX9-LABEL: store_load_sindex_foo:
503; GFX9:       ; %bb.0: ; %bb
504; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
505; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
506; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
507; GFX9-NEXT:    s_add_i32 s0, s0, 4
508; GFX9-NEXT:    v_mov_b32_e32 v0, 15
509; GFX9-NEXT:    scratch_store_dword off, v0, s0
510; GFX9-NEXT:    s_waitcnt vmcnt(0)
511; GFX9-NEXT:    s_and_b32 s0, s2, 15
512; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
513; GFX9-NEXT:    s_add_i32 s0, s0, 4
514; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
515; GFX9-NEXT:    s_waitcnt vmcnt(0)
516; GFX9-NEXT:    s_endpgm
517;
518; GFX10-LABEL: store_load_sindex_foo:
519; GFX10:       ; %bb.0: ; %bb
520; GFX10-NEXT:    s_add_u32 s0, s0, s3
521; GFX10-NEXT:    s_addc_u32 s1, s1, 0
522; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
523; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
524; GFX10-NEXT:    v_mov_b32_e32 v0, 15
525; GFX10-NEXT:    s_and_b32 s0, s2, 15
526; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
527; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
528; GFX10-NEXT:    s_add_i32 s1, s1, 4
529; GFX10-NEXT:    s_add_i32 s0, s0, 4
530; GFX10-NEXT:    scratch_store_dword off, v0, s1
531; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
532; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
533; GFX10-NEXT:    s_waitcnt vmcnt(0)
534; GFX10-NEXT:    s_endpgm
535;
536; GFX11-LABEL: store_load_sindex_foo:
537; GFX11:       ; %bb.0: ; %bb
538; GFX11-NEXT:    v_mov_b32_e32 v0, 15
539; GFX11-NEXT:    s_and_b32 s1, s0, 15
540; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
541; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
542; GFX11-NEXT:    s_add_i32 s0, s0, 4
543; GFX11-NEXT:    s_add_i32 s1, s1, 4
544; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
545; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
546; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
547; GFX11-NEXT:    s_waitcnt vmcnt(0)
548; GFX11-NEXT:    s_endpgm
549;
550; GFX9-PAL-LABEL: store_load_sindex_foo:
551; GFX9-PAL:       ; %bb.0: ; %bb
552; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
553; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
554; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
555; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
556; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
558; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
559; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
560; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
561; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
562; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
563; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
564; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
565; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
566; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
567; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
568; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
569; GFX9-PAL-NEXT:    s_endpgm
570;
571; GFX940-LABEL: store_load_sindex_foo:
572; GFX940:       ; %bb.0: ; %bb
573; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
574; GFX940-NEXT:    s_and_b32 s0, s0, 15
575; GFX940-NEXT:    s_add_i32 s1, s1, 4
576; GFX940-NEXT:    v_mov_b32_e32 v0, 15
577; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
578; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
579; GFX940-NEXT:    s_waitcnt vmcnt(0)
580; GFX940-NEXT:    s_add_i32 s0, s0, 4
581; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
582; GFX940-NEXT:    s_waitcnt vmcnt(0)
583; GFX940-NEXT:    s_endpgm
584;
585; GFX10-PAL-LABEL: store_load_sindex_foo:
586; GFX10-PAL:       ; %bb.0: ; %bb
587; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
588; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
589; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
590; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
592; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
593; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
594; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
595; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
596; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
597; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
598; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
599; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
600; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
601; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
602; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
603; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
604; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
605; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
606; GFX10-PAL-NEXT:    s_endpgm
607;
608; GFX11-PAL-LABEL: store_load_sindex_foo:
609; GFX11-PAL:       ; %bb.0: ; %bb
610; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
611; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
612; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
613; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
614; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
615; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
616; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
617; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
618; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
619; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
620; GFX11-PAL-NEXT:    s_endpgm
;
; NOTE(review) - none of the RUN lines at the top of this file enables a
; "GCN" check prefix, so FileCheck never evaluates the GCN-prefixed lines
; below. They also disagree with the GFX940 block of this function (s_add_u32
; with the constant first and a different instruction order, versus
; s_add_i32), so presumably they are stale leftovers from an earlier revision
; of the test. TODO confirm, then delete them or regenerate the assertions.
621; GCN-LABEL: store_load_sindex_foo:
622; GCN:       ; %bb.0: ; %bb
623; GCN-NEXT:    s_lshl_b32 s1, s0, 2
624; GCN-NEXT:    s_and_b32 s0, s0, 15
625; GCN-NEXT:    s_lshl_b32 s0, s0, 2
626; GCN-NEXT:    s_add_u32 s1, 4, s1
627; GCN-NEXT:    v_mov_b32_e32 v0, 15
628; GCN-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
629; GCN-NEXT:    s_waitcnt vmcnt(0)
630; GCN-NEXT:    s_add_u32 s0, 4, s0
631; GCN-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
632; GCN-NEXT:    s_waitcnt vmcnt(0)
633; GCN-NEXT:    s_endpgm
634bb:
; 32 x float = 128-byte array in addrspace(5).
635  %i = alloca [32 x float], align 4, addrspace(5)
; %i1 is not referenced by any later instruction in this function.
636  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store side: element %idx, unmasked. Volatile so the access must be emitted.
637  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
638  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
639  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (%idx & 15). The loaded value %i12 is otherwise unused;
; the load is volatile so it is still emitted.
640  %i9 = and i32 %idx, 15
641  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
642  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
643  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
644  ret void
645}
646
647define amdgpu_kernel void @store_load_vindex_kernel() {
648; GFX9-LABEL: store_load_vindex_kernel:
649; GFX9:       ; %bb.0: ; %bb
650; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
651; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
652; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
653; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
654; GFX9-NEXT:    v_mov_b32_e32 v2, 15
655; GFX9-NEXT:    scratch_store_dword v1, v2, off
656; GFX9-NEXT:    s_waitcnt vmcnt(0)
657; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
658; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
659; GFX9-NEXT:    s_waitcnt vmcnt(0)
660; GFX9-NEXT:    s_endpgm
661;
662; GFX10-LABEL: store_load_vindex_kernel:
663; GFX10:       ; %bb.0: ; %bb
664; GFX10-NEXT:    s_add_u32 s0, s0, s3
665; GFX10-NEXT:    s_addc_u32 s1, s1, 0
666; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
667; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
668; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
669; GFX10-NEXT:    v_mov_b32_e32 v2, 15
670; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
671; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
672; GFX10-NEXT:    scratch_store_dword v1, v2, off
673; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
674; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
675; GFX10-NEXT:    s_waitcnt vmcnt(0)
676; GFX10-NEXT:    s_endpgm
677;
678; GFX11-LABEL: store_load_vindex_kernel:
679; GFX11:       ; %bb.0: ; %bb
680; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
681; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
682; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
683; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
684; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
685; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
686; GFX11-NEXT:    s_waitcnt vmcnt(0)
687; GFX11-NEXT:    s_endpgm
688;
689; GFX9-PAL-LABEL: store_load_vindex_kernel:
690; GFX9-PAL:       ; %bb.0: ; %bb
691; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
692; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
693; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
694; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
695; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
696; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
697; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
698; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
700; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
701; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
702; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
703; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
704; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
705; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
706; GFX9-PAL-NEXT:    s_endpgm
707;
708; GFX940-LABEL: store_load_vindex_kernel:
709; GFX940:       ; %bb.0: ; %bb
710; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
711; GFX940-NEXT:    v_mov_b32_e32 v1, 15
712; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
713; GFX940-NEXT:    s_waitcnt vmcnt(0)
714; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
715; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
716; GFX940-NEXT:    s_waitcnt vmcnt(0)
717; GFX940-NEXT:    s_endpgm
718;
719; GFX10-PAL-LABEL: store_load_vindex_kernel:
720; GFX10-PAL:       ; %bb.0: ; %bb
721; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
722; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
723; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
724; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
725; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
726; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
727; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
728; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
729; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
730; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
731; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
732; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
733; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
734; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
735; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
736; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
737; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
738; GFX10-PAL-NEXT:    s_endpgm
739;
740; GFX11-PAL-LABEL: store_load_vindex_kernel:
741; GFX11-PAL:       ; %bb.0: ; %bb
742; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
743; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
744; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
745; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
746; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
747; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
748; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
749; GFX11-PAL-NEXT:    s_endpgm
750; GCN-LABEL: store_load_vindex_kernel:
751; GCN:       ; %bb.0: ; %bb
752; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
753; GCN-NEXT:    v_mov_b32_e32 v1, 15
754; GCN-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
755; GCN-NEXT:    s_waitcnt vmcnt(0)
756; GCN-NEXT:    v_sub_u32_e32 v0, 4, v0
757; GCN-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
758; GCN-NEXT:    s_waitcnt vmcnt(0)
759; GCN-NEXT:    s_endpgm
760bb:
761  %i = alloca [32 x float], align 4, addrspace(5)
762  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
763  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
764  %i3 = zext i32 %i2 to i64
765  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
766  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
767  store volatile i32 15, i32 addrspace(5)* %i8, align 4
768  %i9 = sub nsw i32 31, %i2
769  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
770  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
771  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
772  ret void
773}
774
; Callable (non-kernel) function taking a runtime index in %idx.
; Performs a volatile store of 15 to element %idx of a 32 x float addrspace(5)
; alloca, followed by a volatile load from element (%idx & 15) of the same
; alloca.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store through the unmasked index.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back through the index masked to its low four bits.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
896
; Callable (non-kernel) function that receives an addrspace(5) float pointer
; and stores the constant 10.0 to the element one past it (index 1, i.e. a
; 4-byte offset from %arg).
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: private_ptr_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}
968
; Kernel with two addrspace(5) allocas: a 64 x i32 padding array declared
; first, then a 32 x i16 array. It issues a volatile load from the padding
; array at an undef index and then zero-fills the 64 bytes of the second
; array with the memset intrinsic.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT:    s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array (index is undef).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of the second alloca.
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1170
; Callable (non-kernel) function with two addrspace(5) allocas: a 64 x i32
; padding array declared first, then a 32 x i16 array. It issues a volatile
; load from the padding array at an undef index and then zero-fills the
; 64 bytes of the second array with the memset intrinsic.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_small_offset_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_small_offset_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array (index is undef).
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of the second alloca.
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1338
; Kernel entry point, scalar index taken from a kernel argument.
;
; A [64 x i32] padding alloca is declared ahead of the [32 x float] array
; under test, so the array does not sit at the start of the scratch frame
; (the checks below address it relative to 0x104). The body performs a
; volatile store of 15 to %i[%idx] followed by a volatile load of
; %i[%idx & 15]; the expected code forms both addresses with scalar ALU
; instructions and passes them as the SGPR operand of the scratch access.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x104
; GFX11-NEXT:    s_addk_i32 s1, 0x104
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; The volatile load is the only use of %padding; being volatile it is not
  ; deleted as dead.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1530
; Pixel-shader (amdgpu_ps) variant of the scalar-index test: the index is
; passed as an `inreg` argument instead of being loaded from kernel
; arguments.
;
; As in the kernel variant, a [64 x i32] padding alloca is declared ahead of
; the [32 x float] array so the array is addressed relative to 0x104 in the
; checks below. The body volatile-stores 15 to %i[%idx] and volatile-loads
; %i[%idx & 15]; the expected code computes both addresses with scalar ALU
; instructions.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x104
; GFX11-NEXT:    s_addk_i32 s1, 0x104
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; The volatile load is the only use of %padding; being volatile it is not
  ; deleted as dead.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1708
; Kernel entry point, per-lane (vector) index taken from the work-item id.
;
; A [64 x i32] padding alloca is declared ahead of the [32 x float] array
; under test, so the array is addressed relative to 0x104 in the checks
; below. The body volatile-stores 15 to %i[tid] and volatile-loads
; %i[31 - tid], where tid is llvm.amdgcn.workitem.id.x. In the expected code
; the constant part of the second address shows up as `offset:124` (31 * 4)
; on the scratch load, with the variable part produced by a vector subtract.
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; The volatile load is the only use of %padding; being volatile it is not
  ; deleted as dead.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Per-lane index.
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Store 15 to %i[tid].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[31 - tid].
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1867
; Non-kernel (default calling convention) variant of the vector-index test:
; the index arrives as an ordinary i32 argument (v0 in the checks below) and
; scratch is addressed relative to s32.
;
; A [64 x i32] padding alloca is declared ahead of the [32 x float] array
; under test, so the array is addressed at s32 + 0x100 / offset:256 in the
; checks. The body volatile-stores 15 to %i[%idx] and volatile-loads
; %i[%idx & 15].
;
; The gfx1010 and gfx1030 PAL runs produce identical code for this function
; and therefore share the common GFX10-PAL prefix.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; The volatile load is the only use of %padding; being volatile it is not
  ; deleted as dead.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to %i[%idx].
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2012
; Kernel entry point that zero-fills a 64-byte addrspace(5) (scratch) array
; declared after a 4096 x i32 padding array. The expected output below performs
; the fill as four 16-byte scratch stores at immediate offsets 0/16/32/48 from
; a base of 0x4010 that is first materialized in an SGPR with s_movk_i32.
; NOTE(review): the SGPR base is presumably needed because 0x4010 does not fit
; the instruction's immediate offset field -- confirm against the ISA docs.
2013define amdgpu_kernel void @zero_init_large_offset_kernel() {
2014; GFX9-LABEL: zero_init_large_offset_kernel:
2015; GFX9:       ; %bb.0:
2016; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2017; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2018; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2019; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2020; GFX9-NEXT:    s_waitcnt vmcnt(0)
2021; GFX9-NEXT:    s_mov_b32 s0, 0
2022; GFX9-NEXT:    s_mov_b32 s1, s0
2023; GFX9-NEXT:    s_mov_b32 s2, s0
2024; GFX9-NEXT:    s_mov_b32 s3, s0
2025; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2026; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2027; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2028; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2029; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2030; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2031; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2032; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2033; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2034; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2035; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
2036; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2037; GFX9-NEXT:    s_endpgm
2038;
2039; GFX10-LABEL: zero_init_large_offset_kernel:
2040; GFX10:       ; %bb.0:
2041; GFX10-NEXT:    s_add_u32 s0, s0, s3
2042; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2043; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2044; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2045; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2046; GFX10-NEXT:    s_waitcnt vmcnt(0)
2047; GFX10-NEXT:    s_mov_b32 s0, 0
2048; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2049; GFX10-NEXT:    s_mov_b32 s1, s0
2050; GFX10-NEXT:    s_mov_b32 s2, s0
2051; GFX10-NEXT:    s_mov_b32 s3, s0
2052; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2053; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2054; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2055; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2056; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2057; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2058; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2059; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2060; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2061; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
2062; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2063; GFX10-NEXT:    s_endpgm
2064;
2065; GFX11-LABEL: zero_init_large_offset_kernel:
2066; GFX11:       ; %bb.0:
2067; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2068; GFX11-NEXT:    s_waitcnt vmcnt(0)
2069; GFX11-NEXT:    s_mov_b32 s0, 0
2070; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2071; GFX11-NEXT:    s_mov_b32 s1, s0
2072; GFX11-NEXT:    s_mov_b32 s2, s0
2073; GFX11-NEXT:    s_mov_b32 s3, s0
2074; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2075; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2076; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2077; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2078; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2079; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2080; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2081; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4010
2082; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2083; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2084; GFX11-NEXT:    s_endpgm
2085;
2086; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
2087; GFX9-PAL:       ; %bb.0:
2088; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2089; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2090; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2091; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2092; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2093; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2094; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2095; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2096; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2097; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
2098; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2099; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2100; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2101; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2102; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2103; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2104; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2105; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2106; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2107; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2108; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2109; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2110; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2111; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2112; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
2113; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2114; GFX9-PAL-NEXT:    s_endpgm
2115;
2116; GFX940-LABEL: zero_init_large_offset_kernel:
2117; GFX940:       ; %bb.0:
2118; GFX940-NEXT:    scratch_load_dword v0, off, off offset:16 sc0 sc1
2119; GFX940-NEXT:    s_waitcnt vmcnt(0)
2120; GFX940-NEXT:    s_mov_b32 s0, 0
2121; GFX940-NEXT:    s_mov_b32 s1, s0
2122; GFX940-NEXT:    s_mov_b32 s2, s0
2123; GFX940-NEXT:    s_mov_b32 s3, s0
2124; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2125; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2126; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2127; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2128; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2129; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2130; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2131; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2132; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
2133; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2134; GFX940-NEXT:    s_endpgm
2135;
2136; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
2137; GFX1010-PAL:       ; %bb.0:
2138; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2139; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2140; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2141; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2143; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2144; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2145; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2146; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2147; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2148; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2149; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
2150; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2151; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2152; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2153; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2154; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2155; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2156; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2157; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2158; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2159; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2160; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2161; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2162; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2163; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2164; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2165; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2166; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2167; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2168; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2169; GFX1010-PAL-NEXT:    s_endpgm
2170;
2171; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
2172; GFX1030-PAL:       ; %bb.0:
2173; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2174; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2175; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2176; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2177; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2178; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2179; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2180; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2181; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2182; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
2183; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2184; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2185; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2186; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2187; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2188; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2189; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2190; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2191; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2192; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2193; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2194; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2195; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2196; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2197; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2198; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2199; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2200; GFX1030-PAL-NEXT:    s_endpgm
2201;
2202; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
2203; GFX11-PAL:       ; %bb.0:
2204; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16 glc dlc
2205; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2206; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2207; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2208; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2209; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2210; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2211; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2212; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2213; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2214; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2215; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2216; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2217; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2218; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
2219; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2220; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2221; GFX11-PAL-NEXT:    s_endpgm
  ; 4096 x i32 (16384 bytes) of padding declared ahead of %alloca. The volatile
  ; load from it (at an undef index) presumably exists only to keep the padding
  ; alive so that %alloca lands at a large scratch offset -- TODO confirm.
2222  %padding = alloca [4096 x i32], align 4, addrspace(5)
2223  %alloca = alloca [32 x i16], align 2, addrspace(5)
2224  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2225  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes (32 x i16) of %alloca with a non-volatile memset.
2226  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2227  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2228  ret void
2229}
2230
; Non-kernel (callable) function that zero-fills a 64-byte addrspace(5)
; (scratch) array declared after a 4096 x i32 padding array. The expected output
; below forms the store base as s32 + 0x4010 with s_add_i32, issues four
; 16-byte scratch stores at immediate offsets 0/16/32/48 from that base, and
; returns with s_setpc_b64.
2231define void @zero_init_large_offset_foo() {
2232; GFX9-LABEL: zero_init_large_offset_foo:
2233; GFX9:       ; %bb.0:
2234; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2235; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2236; GFX9-NEXT:    s_waitcnt vmcnt(0)
2237; GFX9-NEXT:    s_mov_b32 s0, 0
2238; GFX9-NEXT:    s_mov_b32 s1, s0
2239; GFX9-NEXT:    s_mov_b32 s2, s0
2240; GFX9-NEXT:    s_mov_b32 s3, s0
2241; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2242; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2243; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2244; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2245; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2246; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2247; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2248; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2249; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2250; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2251; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2252; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2253; GFX9-NEXT:    s_waitcnt vmcnt(0)
2254; GFX9-NEXT:    s_setpc_b64 s[30:31]
2255;
2256; GFX10-LABEL: zero_init_large_offset_foo:
2257; GFX10:       ; %bb.0:
2258; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2259; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2260; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2261; GFX10-NEXT:    s_waitcnt vmcnt(0)
2262; GFX10-NEXT:    s_mov_b32 s0, 0
2263; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2264; GFX10-NEXT:    s_mov_b32 s1, s0
2265; GFX10-NEXT:    s_mov_b32 s2, s0
2266; GFX10-NEXT:    s_mov_b32 s3, s0
2267; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2268; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2269; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2270; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2271; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2272; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2273; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2274; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2275; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2276; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2277; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2278; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2279; GFX10-NEXT:    s_setpc_b64 s[30:31]
2280;
2281; GFX11-LABEL: zero_init_large_offset_foo:
2282; GFX11:       ; %bb.0:
2283; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2284; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2285; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2286; GFX11-NEXT:    s_waitcnt vmcnt(0)
2287; GFX11-NEXT:    s_mov_b32 s0, 0
2288; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2289; GFX11-NEXT:    s_mov_b32 s1, s0
2290; GFX11-NEXT:    s_mov_b32 s2, s0
2291; GFX11-NEXT:    s_mov_b32 s3, s0
2292; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2293; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2294; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2295; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2296; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2297; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2298; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2299; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2300; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2301; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2302; GFX11-NEXT:    s_setpc_b64 s[30:31]
2303;
2304; GFX9-PAL-LABEL: zero_init_large_offset_foo:
2305; GFX9-PAL:       ; %bb.0:
2306; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2307; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
2308; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2309; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2310; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2311; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2312; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2313; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2314; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2315; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2316; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2317; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2318; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2319; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2320; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2321; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2322; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2323; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2324; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2325; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2326; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2327;
2328; GFX940-LABEL: zero_init_large_offset_foo:
2329; GFX940:       ; %bb.0:
2330; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:16 sc0 sc1
2332; GFX940-NEXT:    s_waitcnt vmcnt(0)
2333; GFX940-NEXT:    s_mov_b32 s0, 0
2334; GFX940-NEXT:    s_mov_b32 s1, s0
2335; GFX940-NEXT:    s_mov_b32 s2, s0
2336; GFX940-NEXT:    s_mov_b32 s3, s0
2337; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2338; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2339; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2340; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
2341; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2342; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
2343; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2344; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
2345; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
2346; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
2347; GFX940-NEXT:    s_waitcnt vmcnt(0)
2348; GFX940-NEXT:    s_setpc_b64 s[30:31]
2349;
2350; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
2351; GFX1010-PAL:       ; %bb.0:
2352; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2353; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2354; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2355; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2356; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2357; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2358; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2359; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2360; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2361; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2362; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2363; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2364; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2365; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2366; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2367; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2368; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2369; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2370; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2371; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2372; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2373; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2374; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2375; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2376; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2377;
2378; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
2379; GFX1030-PAL:       ; %bb.0:
2380; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2381; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2382; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
2383; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2384; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2385; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2386; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2387; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2388; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2389; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2390; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2391; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2392; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2393; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
2394; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2395; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
2396; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2397; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
2398; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2399; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
2400; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2401; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2402;
2403; GFX11-PAL-LABEL: zero_init_large_offset_foo:
2404; GFX11-PAL:       ; %bb.0:
2405; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2406; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2407; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16 glc dlc
2408; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2409; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2410; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2411; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2412; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2413; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2414; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2415; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2416; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo
2417; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2418; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:16
2419; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2420; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:32
2421; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
2422; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
2423; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2424; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 (16384 bytes) of padding declared ahead of %alloca. The volatile
  ; load from it (at an undef index) presumably exists only to keep the padding
  ; alive so that %alloca lands at a large scratch offset -- TODO confirm.
2425  %padding = alloca [4096 x i32], align 4, addrspace(5)
2426  %alloca = alloca [32 x i16], align 2, addrspace(5)
2427  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2428  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes (32 x i16) of %alloca with a non-volatile memset.
2429  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
2430  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
2431  ret void
2432}
2433
; Kernel that does a volatile store to, then a volatile load from, a 32 x float
; addrspace(5) (scratch) array at indices derived from the i32 kernel argument
; %idx, with a 4096 x i32 padding array declared first. In the expected output
; below both addresses are computed in SGPRs: the index is shifted left by 2
; and 0x4004 is added with s_addk_i32, and the scratch instructions use that
; SGPR with no VGPR address ("off").
2434define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
2435; GFX9-LABEL: store_load_sindex_large_offset_kernel:
2436; GFX9:       ; %bb.0: ; %bb
2437; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
2438; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
2439; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2440; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2441; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2442; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2443; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
2444; GFX9-NEXT:    s_and_b32 s0, s0, 15
2445; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2446; GFX9-NEXT:    s_addk_i32 s1, 0x4004
2447; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2448; GFX9-NEXT:    scratch_store_dword off, v0, s1
2449; GFX9-NEXT:    s_waitcnt vmcnt(0)
2450; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2451; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2452; GFX9-NEXT:    s_waitcnt vmcnt(0)
2453; GFX9-NEXT:    s_endpgm
2454;
2455; GFX10-LABEL: store_load_sindex_large_offset_kernel:
2456; GFX10:       ; %bb.0: ; %bb
2457; GFX10-NEXT:    s_add_u32 s2, s2, s5
2458; GFX10-NEXT:    s_addc_u32 s3, s3, 0
2459; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2460; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2461; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
2462; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2463; GFX10-NEXT:    s_waitcnt vmcnt(0)
2464; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2465; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2466; GFX10-NEXT:    s_and_b32 s1, s0, 15
2467; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2468; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
2469; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2470; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2471; GFX10-NEXT:    scratch_store_dword off, v0, s0
2472; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2473; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2474; GFX10-NEXT:    s_waitcnt vmcnt(0)
2475; GFX10-NEXT:    s_endpgm
2476;
2477; GFX11-LABEL: store_load_sindex_large_offset_kernel:
2478; GFX11:       ; %bb.0: ; %bb
2479; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
2480; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2481; GFX11-NEXT:    s_waitcnt vmcnt(0)
2482; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2483; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2484; GFX11-NEXT:    s_and_b32 s1, s0, 15
2485; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2486; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2487; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2488; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2489; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2490; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2491; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2492; GFX11-NEXT:    s_waitcnt vmcnt(0)
2493; GFX11-NEXT:    s_endpgm
2494;
2495; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
2496; GFX9-PAL:       ; %bb.0: ; %bb
2497; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
2498; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2499; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2500; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2501; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2502; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2503; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2504; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2505; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2506; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2507; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2508; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2509; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2510; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2511; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2512; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2513; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2514; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2515; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2516; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2517; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2518; GFX9-PAL-NEXT:    s_endpgm
2519;
2520; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2521; GFX940:       ; %bb.0: ; %bb
2522; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
2523; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2524; GFX940-NEXT:    s_waitcnt vmcnt(0)
2525; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2526; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2527; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2528; GFX940-NEXT:    s_and_b32 s0, s0, 15
2529; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2530; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2531; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2532; GFX940-NEXT:    s_waitcnt vmcnt(0)
2533; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2534; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2535; GFX940-NEXT:    s_waitcnt vmcnt(0)
2536; GFX940-NEXT:    s_endpgm
2537;
2538; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2539; GFX1010-PAL:       ; %bb.0: ; %bb
2540; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
2541; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
2542; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2543; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2544; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2545; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
2546; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
2547; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2548; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2549; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2550; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2551; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2552; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2553; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2554; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2555; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2556; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2557; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2558; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2559; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2560; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2561; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2562; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2563; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2564; GFX1010-PAL-NEXT:    s_endpgm
2565;
2566; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2567; GFX1030-PAL:       ; %bb.0: ; %bb
2568; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
2569; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
2570; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2571; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2572; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2573; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
2574; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
2575; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2576; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2577; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2578; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2579; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2580; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2581; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2583; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2584; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2585; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2586; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2587; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2588; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2589; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2590; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2591; GFX1030-PAL-NEXT:    s_endpgm
2592;
2593; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
2594; GFX11-PAL:       ; %bb.0: ; %bb
2595; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
2596; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2597; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2598; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2599; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2600; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2601; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2602; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2603; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2604; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2605; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2606; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2607; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2608; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2609; GFX11-PAL-NEXT:    s_endpgm
2610bb:
  ; 4096 x i32 (16384 bytes) of padding declared ahead of %i. The volatile load
  ; from it (at an undef index) presumably exists only to keep the padding
  ; alive so that %i lands at a large scratch offset -- TODO confirm.
2611  %padding = alloca [4096 x i32], align 4, addrspace(5)
2612  %i = alloca [32 x float], align 4, addrspace(5)
2613  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2614  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
2615  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i.
2616  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2617  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2618  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
2619  %i9 = and i32 %idx, 15
2620  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2621  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2622  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2623  ret void
2624}
2625
; Scalar-indexed private (addrspace(5)) store/load behind a large frame offset.
; The amdgpu_ps entry point receives %idx as an inreg argument. A 4096 x i32
; (16 KiB) alloca is declared ahead of the 32 x float array, and the expected
; code below addresses the array at 0x4004 plus the index scaled by 4.
; Sequence: volatile load from the padding, volatile store of 15 to %i[%idx],
; volatile load from %i[%idx & 15].
2626define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
2627; GFX9-LABEL: store_load_sindex_large_offset_foo:
2628; GFX9:       ; %bb.0: ; %bb
2629; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2630; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2631; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2632; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2633; GFX9-NEXT:    s_waitcnt vmcnt(0)
2634; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
2635; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2636; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2637; GFX9-NEXT:    scratch_store_dword off, v0, s0
2638; GFX9-NEXT:    s_waitcnt vmcnt(0)
2639; GFX9-NEXT:    s_and_b32 s0, s2, 15
2640; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2641; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2642; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2643; GFX9-NEXT:    s_waitcnt vmcnt(0)
2644; GFX9-NEXT:    s_endpgm
2645;
2646; GFX10-LABEL: store_load_sindex_large_offset_foo:
2647; GFX10:       ; %bb.0: ; %bb
2648; GFX10-NEXT:    s_add_u32 s0, s0, s3
2649; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2650; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2651; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2652; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2653; GFX10-NEXT:    s_waitcnt vmcnt(0)
2654; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2655; GFX10-NEXT:    s_and_b32 s0, s2, 15
2656; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
2657; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2658; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2659; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2660; GFX10-NEXT:    scratch_store_dword off, v0, s1
2661; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2662; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
2663; GFX10-NEXT:    s_waitcnt vmcnt(0)
2664; GFX10-NEXT:    s_endpgm
2665;
2666; GFX11-LABEL: store_load_sindex_large_offset_foo:
2667; GFX11:       ; %bb.0: ; %bb
2668; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2669; GFX11-NEXT:    s_waitcnt vmcnt(0)
2670; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2671; GFX11-NEXT:    s_and_b32 s1, s0, 15
2672; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2673; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2674; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2675; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2676; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2677; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2678; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2679; GFX11-NEXT:    s_waitcnt vmcnt(0)
2680; GFX11-NEXT:    s_endpgm
2681;
2682; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
2683; GFX9-PAL:       ; %bb.0: ; %bb
2684; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2685; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2686; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2687; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2688; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2689; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2690; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2691; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2692; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2693; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2694; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2695; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2696; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2697; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2698; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2699; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2700; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2701; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2702; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2703; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2704; GFX9-PAL-NEXT:    s_endpgm
2705;
2706; GFX940-LABEL: store_load_sindex_large_offset_foo:
2707; GFX940:       ; %bb.0: ; %bb
2708; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2709; GFX940-NEXT:    s_waitcnt vmcnt(0)
2710; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2711; GFX940-NEXT:    s_and_b32 s0, s0, 15
2712; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2713; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2714; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2715; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2716; GFX940-NEXT:    s_waitcnt vmcnt(0)
2717; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2718; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2719; GFX940-NEXT:    s_waitcnt vmcnt(0)
2720; GFX940-NEXT:    s_endpgm
2721;
2722; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
2723; GFX1010-PAL:       ; %bb.0: ; %bb
2724; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2725; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2726; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2727; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2728; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2729; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2730; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2731; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2732; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2733; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2734; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2735; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2736; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2737; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2738; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2739; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2740; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2741; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2742; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2743; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2744; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2745; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2746; GFX1010-PAL-NEXT:    s_endpgm
2747;
2748; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
2749; GFX1030-PAL:       ; %bb.0: ; %bb
2750; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2751; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2752; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2753; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2754; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2755; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2756; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2757; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2758; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2759; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2760; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2761; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2762; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2763; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2764; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2765; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2766; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2767; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2768; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2769; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2770; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2771; GFX1030-PAL-NEXT:    s_endpgm
2772;
2773; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
2774; GFX11-PAL:       ; %bb.0: ; %bb
2775; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2776; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2777; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2778; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2779; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2780; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2781; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2782; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2783; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2784; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2785; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2786; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2787; GFX11-PAL-NEXT:    s_endpgm
2788bb:
; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of the array under test.
2789  %padding = alloca [4096 x i32], align 4, addrspace(5)
2790  %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from the padding (undef index); presumably keeps the padding
; alloca live so the array keeps its large offset - confirm before changing.
2791  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2792  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 has no users in this function.
2793  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx].
2794  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2795  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2796  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load back from %i[%idx & 15].
2797  %i9 = and i32 %idx, 15
2798  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2799  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2800  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2801  ret void
2802}
2803
; Vector-indexed private (addrspace(5)) store/load behind a large frame offset,
; in a kernel. The index is the result of llvm.amdgcn.workitem.id.x. A
; 4096 x i32 (16 KiB) alloca is declared ahead of the 32 x float array, and the
; expected code below uses 0x4004 as the array base.
; Sequence: volatile load from the padding, volatile store of 15 to %i[id],
; volatile load from %i[31 - id] (seen below as a subtract from 0x4004 plus an
; immediate offset of 124).
2804define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
2805; GFX9-LABEL: store_load_vindex_large_offset_kernel:
2806; GFX9:       ; %bb.0: ; %bb
2807; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2808; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2809; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2810; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2811; GFX9-NEXT:    s_waitcnt vmcnt(0)
2812; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2813; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
2814; GFX9-NEXT:    v_mov_b32_e32 v2, 15
2815; GFX9-NEXT:    scratch_store_dword v1, v2, off
2816; GFX9-NEXT:    s_waitcnt vmcnt(0)
2817; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2818; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2819; GFX9-NEXT:    s_waitcnt vmcnt(0)
2820; GFX9-NEXT:    s_endpgm
2821;
2822; GFX10-LABEL: store_load_vindex_large_offset_kernel:
2823; GFX10:       ; %bb.0: ; %bb
2824; GFX10-NEXT:    s_add_u32 s0, s0, s3
2825; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2826; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2827; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2828; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2829; GFX10-NEXT:    v_mov_b32_e32 v2, 15
2830; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
2831; GFX10-NEXT:    s_waitcnt vmcnt(0)
2832; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2833; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2834; GFX10-NEXT:    scratch_store_dword v1, v2, off
2835; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2836; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2837; GFX10-NEXT:    s_waitcnt vmcnt(0)
2838; GFX10-NEXT:    s_endpgm
2839;
2840; GFX11-LABEL: store_load_vindex_large_offset_kernel:
2841; GFX11:       ; %bb.0: ; %bb
2842; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
2843; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
2844; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
2845; GFX11-NEXT:    s_waitcnt vmcnt(0)
2846; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
2847; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
2848; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2849; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
2850; GFX11-NEXT:    s_waitcnt vmcnt(0)
2851; GFX11-NEXT:    s_endpgm
2852;
2853; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
2854; GFX9-PAL:       ; %bb.0: ; %bb
2855; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2856; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2857; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2858; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2859; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2860; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
2861; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2862; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2863; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2864; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2865; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2866; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2867; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
2868; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
2869; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2870; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2871; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2872; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2873; GFX9-PAL-NEXT:    s_endpgm
2874;
2875; GFX940-LABEL: store_load_vindex_large_offset_kernel:
2876; GFX940:       ; %bb.0: ; %bb
2877; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
2878; GFX940-NEXT:    s_waitcnt vmcnt(0)
2879; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2880; GFX940-NEXT:    v_mov_b32_e32 v1, 15
2881; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
2882; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
2883; GFX940-NEXT:    s_waitcnt vmcnt(0)
2884; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2885; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
2886; GFX940-NEXT:    s_waitcnt vmcnt(0)
2887; GFX940-NEXT:    s_endpgm
2888;
2889; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
2890; GFX1010-PAL:       ; %bb.0: ; %bb
2891; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2892; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2893; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2894; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2896; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2897; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2898; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2899; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2900; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2901; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
2902; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2903; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
2904; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2905; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2906; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2907; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
2908; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2909; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2910; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2911; GFX1010-PAL-NEXT:    s_endpgm
2912;
2913; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
2914; GFX1030-PAL:       ; %bb.0: ; %bb
2915; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2916; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2917; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2918; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2919; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2920; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2921; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2922; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2923; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2924; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2925; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
2926; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
2927; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2928; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
2929; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
2930; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
2931; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2932; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2933; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2934; GFX1030-PAL-NEXT:    s_endpgm
2935;
2936; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
2937; GFX11-PAL:       ; %bb.0: ; %bb
2938; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
2939; GFX11-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4004
2940; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
2941; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2942; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
2943; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
2944; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2945; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
2946; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2947; GFX11-PAL-NEXT:    s_endpgm
2948bb:
; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of the array under test.
2949  %padding = alloca [4096 x i32], align 4, addrspace(5)
2950  %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from the padding (undef index); presumably keeps the padding
; alloca live so the array keeps its large offset - confirm before changing.
2951  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2952  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 has no users in this function.
2953  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2954  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; %i3 has no users in this function.
2955  %i3 = zext i32 %i2 to i64
; Store 15 to %i[%i2].
2956  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
2957  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2958  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load back from the mirrored element %i[31 - %i2].
2959  %i9 = sub nsw i32 31, %i2
2960  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2961  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2962  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2963  ret void
2964}
2965
; Vector-indexed private (addrspace(5)) store/load behind a large frame offset,
; in a regular (non-entry) function that takes the index as the i32 argument
; %idx. A 4096 x i32 (16 KiB) alloca is declared ahead of the 32 x float array,
; and the expected code below forms the array base as s32 + 0x4004.
; Sequence: volatile load from the padding, volatile store of 15 to %i[%idx],
; volatile load from %i[%idx & 15].
; The former GCN-prefixed expectations were dropped: no run line of this file
; enables that prefix, so FileCheck never evaluated them.
2966define void @store_load_vindex_large_offset_foo(i32 %idx) {
2967; GFX9-LABEL: store_load_vindex_large_offset_foo:
2968; GFX9:       ; %bb.0: ; %bb
2969; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2970; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
2971; GFX9-NEXT:    s_waitcnt vmcnt(0)
2972; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2973; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
2974; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2975; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2976; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
2977; GFX9-NEXT:    scratch_store_dword v2, v3, off
2978; GFX9-NEXT:    s_waitcnt vmcnt(0)
2979; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2980; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
2981; GFX9-NEXT:    s_waitcnt vmcnt(0)
2982; GFX9-NEXT:    s_setpc_b64 s[30:31]
2983;
2984; GFX10-LABEL: store_load_vindex_large_offset_foo:
2985; GFX10:       ; %bb.0: ; %bb
2986; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2987; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2988; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
2989; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
2990; GFX10-NEXT:    v_mov_b32_e32 v2, 15
2991; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
2992; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
2993; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
2994; GFX10-NEXT:    s_waitcnt vmcnt(0)
2995; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
2996; GFX10-NEXT:    scratch_store_dword v0, v2, off
2997; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2998; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
2999; GFX10-NEXT:    s_waitcnt vmcnt(0)
3000; GFX10-NEXT:    s_setpc_b64 s[30:31]
3001;
3002; GFX11-LABEL: store_load_vindex_large_offset_foo:
3003; GFX11:       ; %bb.0: ; %bb
3004; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3005; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3006; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3007; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3008; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3009; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3010; GFX11-NEXT:    s_waitcnt vmcnt(0)
3011; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3012; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
3013; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3014; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3015; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
3016; GFX11-NEXT:    s_waitcnt vmcnt(0)
3017; GFX11-NEXT:    s_setpc_b64 s[30:31]
3018;
3019; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
3020; GFX9-PAL:       ; %bb.0: ; %bb
3021; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3022; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
3023; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3024; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3025; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
3026; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
3027; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
3028; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
3029; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
3030; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3031; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3032; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
3033; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3034; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3035;
3036; GFX940-LABEL: store_load_vindex_large_offset_foo:
3037; GFX940:       ; %bb.0: ; %bb
3038; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3039; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
3040; GFX940-NEXT:    s_waitcnt vmcnt(0)
3041; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
3042; GFX940-NEXT:    v_mov_b32_e32 v2, 15
3043; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3044; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
3045; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
3046; GFX940-NEXT:    s_waitcnt vmcnt(0)
3047; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3048; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
3049; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
3050; GFX940-NEXT:    s_waitcnt vmcnt(0)
3051; GFX940-NEXT:    s_setpc_b64 s[30:31]
3052;
3053; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
3054; GFX10-PAL:       ; %bb.0: ; %bb
3055; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3056; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3057; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
3058; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3059; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
3060; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
3061; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3062; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
3063; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3064; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
3065; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
3066; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3067; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
3068; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3069; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3070;
3071; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
3072; GFX11-PAL:       ; %bb.0: ; %bb
3073; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3074; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3075; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3076; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3077; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3078; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3079; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3080; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3081; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
3082; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3083; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
3084; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
3085; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3086; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3103bb:
; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of the array under test.
3104  %padding = alloca [4096 x i32], align 4, addrspace(5)
3105  %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from the padding (undef index); presumably keeps the padding
; alloca live so the array keeps its large offset - confirm before changing.
3106  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
3107  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
; %i1 has no users in this function.
3108  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store 15 to %i[%idx].
3109  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
3110  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
3111  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load back from %i[%idx & 15].
3112  %i9 = and i32 %idx, 15
3113  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
3114  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
3115  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
3116  ret void
3117}
3118
; Private (addrspace(5)) store/load at a large constant offset, in a kernel.
; A single 4096 x i32 alloca is accessed at an undef index (volatile store of
; 13) and at constant element 4000, i.e. byte offset 16000 within the array
; (volatile store of 15 followed by a volatile load).
; The expected code below reaches that element by splitting the address between
; a materialized base and the instruction's immediate offset, e.g.
; 0x3000 + 4 + 3712, 0x3800 + 4 + 1664 or 0x3000 + 3716 depending on the target.
3119define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
3120; GFX9-LABEL: store_load_large_imm_offset_kernel:
3121; GFX9:       ; %bb.0: ; %bb
3122; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
3123; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
3124; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3125; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3126; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3127; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
3128; GFX9-NEXT:    s_waitcnt vmcnt(0)
3129; GFX9-NEXT:    s_add_i32 s0, s0, 4
3130; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3131; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3132; GFX9-NEXT:    s_waitcnt vmcnt(0)
3133; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3134; GFX9-NEXT:    s_waitcnt vmcnt(0)
3135; GFX9-NEXT:    s_endpgm
3136;
3137; GFX10-LABEL: store_load_large_imm_offset_kernel:
3138; GFX10:       ; %bb.0: ; %bb
3139; GFX10-NEXT:    s_add_u32 s0, s0, s3
3140; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3141; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
3142; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
3143; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3144; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3145; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3146; GFX10-NEXT:    s_add_i32 s0, s0, 4
3147; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
3148; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3149; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3150; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3151; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3152; GFX10-NEXT:    s_waitcnt vmcnt(0)
3153; GFX10-NEXT:    s_endpgm
3154;
3155; GFX11-LABEL: store_load_large_imm_offset_kernel:
3156; GFX11:       ; %bb.0: ; %bb
3157; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3158; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3159; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3160; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3161; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3162; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3163; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3164; GFX11-NEXT:    s_waitcnt vmcnt(0)
3165; GFX11-NEXT:    s_endpgm
3166;
3167; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
3168; GFX9-PAL:       ; %bb.0: ; %bb
3169; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
3170; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
3171; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3172; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3173; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3174; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3175; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3176; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3177; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
3178; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3179; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
3180; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3181; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
3182; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3183; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3184; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3185; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3186; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3187; GFX9-PAL-NEXT:    s_endpgm
3188;
3189; GFX940-LABEL: store_load_large_imm_offset_kernel:
3190; GFX940:       ; %bb.0: ; %bb
3191; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3192; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
3193; GFX940-NEXT:    s_waitcnt vmcnt(0)
3194; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3195; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3196; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
3197; GFX940-NEXT:    s_waitcnt vmcnt(0)
3198; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
3199; GFX940-NEXT:    s_waitcnt vmcnt(0)
3200; GFX940-NEXT:    s_endpgm
3201;
3202; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
3203; GFX1010-PAL:       ; %bb.0: ; %bb
3204; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
3205; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
3206; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3207; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3208; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3209; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
3210; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
3211; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3212; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3213; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
3214; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
3215; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
3216; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
3217; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
3218; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
3219; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3220; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3221; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3222; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3223; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3224; GFX1010-PAL-NEXT:    s_endpgm
3225;
3226; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
3227; GFX1030-PAL:       ; %bb.0: ; %bb
3228; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
3229; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
3230; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3231; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3232; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3233; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
3234; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
3235; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3236; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3237; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
3238; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
3239; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
3240; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
3241; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
3242; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3243; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3244; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3245; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3246; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3247; GFX1030-PAL-NEXT:    s_endpgm
3248;
3249; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
3250; GFX11-PAL:       ; %bb.0: ; %bb
3251; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3252; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3253; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3254; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3255; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3256; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3257; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3258; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3259; GFX11-PAL-NEXT:    s_endpgm
3260bb:
3261  %i = alloca [4096 x i32], align 4, addrspace(5)
; Volatile store of 13 through an undef index into the array.
3262  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
3263  store volatile i32 13, i32 addrspace(5)* %i1, align 4
; Volatile store of 15 to element 4000 (byte offset 4000 * 4 = 16000).
3264  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3265  store volatile i32 15, i32 addrspace(5)* %i7, align 4
; Volatile load back from the same element 4000.
3266  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3267  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
3268  ret void
3269}
3270
3271define void @store_load_large_imm_offset_foo() {
3272; GFX9-LABEL: store_load_large_imm_offset_foo:
3273; GFX9:       ; %bb.0: ; %bb
3274; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3276; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3277; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
3278; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
3279; GFX9-NEXT:    s_waitcnt vmcnt(0)
3280; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
3281; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3282; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3283; GFX9-NEXT:    s_waitcnt vmcnt(0)
3284; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3285; GFX9-NEXT:    s_waitcnt vmcnt(0)
3286; GFX9-NEXT:    s_setpc_b64 s[30:31]
3287;
3288; GFX10-LABEL: store_load_large_imm_offset_foo:
3289; GFX10:       ; %bb.0: ; %bb
3290; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3291; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3292; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3293; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3294; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3295; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
3296; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
3297; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
3298; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3299; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3300; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3301; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3302; GFX10-NEXT:    s_waitcnt vmcnt(0)
3303; GFX10-NEXT:    s_setpc_b64 s[30:31]
3304;
3305; GFX11-LABEL: store_load_large_imm_offset_foo:
3306; GFX11:       ; %bb.0: ; %bb
3307; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3308; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3309; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3310; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3311; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3312; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3313; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3314; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3315; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3316; GFX11-NEXT:    s_waitcnt vmcnt(0)
3317; GFX11-NEXT:    s_setpc_b64 s[30:31]
3318;
3319; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
3320; GFX9-PAL:       ; %bb.0: ; %bb
3321; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3323; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3324; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
3325; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3326; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3327; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
3328; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3329; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3330; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3331; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3332; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3333; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3334;
3335; GFX940-LABEL: store_load_large_imm_offset_foo:
3336; GFX940:       ; %bb.0: ; %bb
3337; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3338; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3339; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
3340; GFX940-NEXT:    s_waitcnt vmcnt(0)
3341; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3342; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3343; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
3344; GFX940-NEXT:    s_waitcnt vmcnt(0)
3345; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
3346; GFX940-NEXT:    s_waitcnt vmcnt(0)
3347; GFX940-NEXT:    s_setpc_b64 s[30:31]
3348;
3349; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
3350; GFX10-PAL:       ; %bb.0: ; %bb
3351; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3352; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3353; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
3354; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3355; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
3356; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
3357; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
3358; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3359; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3360; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3361; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3362; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3363; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3364; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3365;
3366; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
3367; GFX11-PAL:       ; %bb.0: ; %bb
3368; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3369; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3370; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3371; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3372; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3373; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3374; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3375; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3376; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3377; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3378; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3379; GCN-LABEL: store_load_large_imm_offset_foo:
3380; GCN:       ; %bb.0: ; %bb
3381; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3382; GCN-NEXT:    v_mov_b32_e32 v0, 13
3383; GCN-NEXT:    scratch_store_dword off, v0, s32 sc0 sc1
3384; GCN-NEXT:    s_waitcnt vmcnt(0)
3385; GCN-NEXT:    v_mov_b32_e32 v0, 0x3000
3386; GCN-NEXT:    v_mov_b32_e32 v1, 15
3387; GCN-NEXT:    scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1
3388; GCN-NEXT:    s_waitcnt vmcnt(0)
3389; GCN-NEXT:    scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1
3390; GCN-NEXT:    s_waitcnt vmcnt(0)
3391; GCN-NEXT:    s_setpc_b64 s[30:31]
3392bb:
3393  %i = alloca [4096 x i32], align 4, addrspace(5)
3394  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
3395  store volatile i32 13, i32 addrspace(5)* %i1, align 4
3396  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3397  store volatile i32 15, i32 addrspace(5)* %i7, align 4
3398  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
3399  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
3400  ret void
3401}
3402
; Kernel test: one volatile store of 15 followed by one volatile load through
; a private (addrspace 5) [32 x i32] alloca. The element index is
; %sidx + workitem.id.x + 256, i.e. a scalar part, a per-lane part and a
; constant part. In the expected output below the constant part is carried by
; the instruction's immediate offset field (1024 or 1028 bytes, depending on
; whether the target also folds the 4-byte alloca base into the immediate).
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vidx_sidx_offset:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Scalar kernel argument plus the per-lane workitem id.
  %add1 = add nsw i32 %sidx, %vidx
  ; Constant part of the index: 256 i32 elements = 1024 bytes.
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  ; Both accesses are volatile so neither can be dropped or merged.
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}
3535
; Volatile i64 store of 15 followed by a volatile i64 load through the
; incoming private (addrspace 5) pointer, both with natural 8-byte alignment.
; Every target checked below emits exactly one 64-bit scratch store and one
; 64-bit scratch load addressed by v0.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_aligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_aligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; Volatile keeps both accesses in the output even though %load is unused.
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}
3630
; Volatile i64 store of 15 followed by a volatile i64 load through the
; incoming private (addrspace 5) pointer, both declared with align 1.
; Despite the 1-byte alignment, every target checked below still emits a
; single 64-bit scratch store and a single 64-bit scratch load; the access is
; not split into narrower pieces.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; align 1 is the point of this test: the pointer carries no alignment guarantee.
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}
3725
; Volatile <3 x i32> store of <1, 2, 3> followed by a volatile <3 x i32> load
; through the incoming private (addrspace 5) pointer, both declared with
; align 1. Every target checked below emits a single 96-bit scratch store and
; a single 96-bit scratch load (dwordx3 / b96) rather than splitting the vector.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v3i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; align 1 is the point of this test: the pointer carries no alignment guarantee.
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}
3828
; Volatile <4 x i32> store of <1, 2, 3, 4> followed by a volatile <4 x i32>
; load through the incoming private (addrspace 5) pointer, both declared with
; align 1. Every target checked below emits a single 128-bit scratch store and
; a single 128-bit scratch load (dwordx4 / b128) rather than splitting the
; vector.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v4i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; align 1 is the point of this test: the pointer carries no alignment guarantee.
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}
3937
; Volatile byte store of 1 followed by a volatile byte load at %arg - 1 in
; private (addrspace 5) memory. Despite the "i32" in the function name, the
; accessed type is i8.
;
; The expected output differs by target in how the -1 displacement is applied:
;   - the gfx900, gfx940 and gfx1010 runs add -1 to the address register with a
;     separate VALU add and use no immediate offset;
;   - the gfx1030 and gfx1100 runs encode it directly as offset:-1.
; Because the two gfx10 PAL runs disagree, this function has separate
; GFX1010-PAL and GFX1030-PAL check blocks instead of a shared one.
define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i32_negative_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; One byte below the incoming pointer: the negative displacement under test.
  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
  ret void
}
4033
; Volatile one-byte store of the value 1, followed by a volatile one-byte
; reload, through the address %arg - 4225 in addrspace(5). Although the
; function name says "i32", the IR body below only accesses i8 with align 1.
; As a 32-bit value, -4225 is 0xffffef7f. The expected output for each RUN
; configuration shows how that displacement is divided between a separate
; add (an SGPR operand in the gfx940 checks) and the immediate offset field
; of the scratch instruction.
4034define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
4035; GFX9-LABEL: store_load_i32_large_negative_unaligned:
4036; GFX9:       ; %bb.0: ; %bb
4037; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4038; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4039; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4040; GFX9-NEXT:    scratch_store_byte v0, v1, off
4041; GFX9-NEXT:    s_waitcnt vmcnt(0)
4042; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
4043; GFX9-NEXT:    s_waitcnt vmcnt(0)
4044; GFX9-NEXT:    s_setpc_b64 s[30:31]
4045;
4046; GFX10-LABEL: store_load_i32_large_negative_unaligned:
4047; GFX10:       ; %bb.0: ; %bb
4048; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4049; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4050; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4051; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4052; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
4053; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4054; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4055; GFX10-NEXT:    s_waitcnt vmcnt(0)
4056; GFX10-NEXT:    s_setpc_b64 s[30:31]
4057;
4058; GFX11-LABEL: store_load_i32_large_negative_unaligned:
4059; GFX11:       ; %bb.0: ; %bb
4060; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4061; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4062; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
4063; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4064; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4065; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4066; GFX11-NEXT:    s_waitcnt vmcnt(0)
4067; GFX11-NEXT:    s_setpc_b64 s[30:31]
4068;
4069; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
4070; GFX9-PAL:       ; %bb.0: ; %bb
4071; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4072; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4073; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4074; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4075; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4076; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4077; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4078; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4079;
4080; GFX940-LABEL: store_load_i32_large_negative_unaligned:
4081; GFX940:       ; %bb.0: ; %bb
4082; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4083; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
4084; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4085; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
4086; GFX940-NEXT:    s_waitcnt vmcnt(0)
4087; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
4088; GFX940-NEXT:    s_waitcnt vmcnt(0)
4089; GFX940-NEXT:    s_setpc_b64 s[30:31]
4090;
4091; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
4092; GFX1010-PAL:       ; %bb.0: ; %bb
4093; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4094; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4095; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
4096; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4097; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
4098; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4099; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
4100; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4101; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4102;
4103; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
4104; GFX1030-PAL:       ; %bb.0: ; %bb
4105; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4106; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4107; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4108; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4109; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
4110; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4111; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4112; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4113; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4114;
4115; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
4116; GFX11-PAL:       ; %bb.0: ; %bb
4117; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4118; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4119; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
4120; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4121; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4122; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4123; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4124; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4125bb:
  ; -4225 = -(4096 + 129); the checks above that use an immediate offset
  ; pair it with an add of -4096 or -4097 (offset -129 or -128 respectively).
4126  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
4127  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
4128  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
4129  ret void
4130}
4131
; amdgpu_ps function with two allocas of [128 x <4 x i32>] (128 * 16 = 2048
; bytes each) in addrspace(5). It performs a volatile <4 x i32> zero store and
; a volatile reload at element 60 of %alloca2 (60 * 16 = 960 = 0x3c0 bytes
; into that alloca), then hands the address of each alloca to a "; use $0"
; inline-asm statement.
; The checks expect the constants 16 and 0x810 for the two asm operands
; (presumably the frame offsets of %alloca and %alloca2) and an effective
; scratch offset of 3024 (0x810 + 0x3c0) for the store/load pair.
; NOTE(review): the asm calls reference attribute group #0, whose definition
; is not in the visible part of this file; confirm whether that is intended.
4132define amdgpu_ps void @large_offset() {
4133; GFX9-LABEL: large_offset:
4134; GFX9:       ; %bb.0: ; %bb
4135; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
4136; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4137; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
4138; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4139; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4140; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4141; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4142; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4143; GFX9-NEXT:    s_waitcnt vmcnt(0)
4144; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
4145; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4146; GFX9-NEXT:    s_waitcnt vmcnt(0)
4147; GFX9-NEXT:    v_mov_b32_e32 v0, 16
4148; GFX9-NEXT:    ;;#ASMSTART
4149; GFX9-NEXT:    ; use v0
4150; GFX9-NEXT:    ;;#ASMEND
4151; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
4152; GFX9-NEXT:    ;;#ASMSTART
4153; GFX9-NEXT:    ; use v0
4154; GFX9-NEXT:    ;;#ASMEND
4155; GFX9-NEXT:    s_endpgm
4156;
4157; GFX10-LABEL: large_offset:
4158; GFX10:       ; %bb.0: ; %bb
4159; GFX10-NEXT:    s_add_u32 s0, s0, s2
4160; GFX10-NEXT:    s_addc_u32 s1, s1, 0
4161; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
4162; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
4163; GFX10-NEXT:    v_mov_b32_e32 v0, 0
4164; GFX10-NEXT:    s_movk_i32 s0, 0x810
4165; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
4166; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4167; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4168; GFX10-NEXT:    v_mov_b32_e32 v3, v0
4169; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4170; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4171; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4172; GFX10-NEXT:    s_waitcnt vmcnt(0)
4173; GFX10-NEXT:    v_mov_b32_e32 v0, 16
4174; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
4175; GFX10-NEXT:    ;;#ASMSTART
4176; GFX10-NEXT:    ; use v0
4177; GFX10-NEXT:    ;;#ASMEND
4178; GFX10-NEXT:    ;;#ASMSTART
4179; GFX10-NEXT:    ; use v1
4180; GFX10-NEXT:    ;;#ASMEND
4181; GFX10-NEXT:    s_endpgm
4182;
4183; GFX11-LABEL: large_offset:
4184; GFX11:       ; %bb.0: ; %bb
4185; GFX11-NEXT:    v_mov_b32_e32 v0, 0
4186; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4187; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4188; GFX11-NEXT:    v_mov_b32_e32 v2, v0
4189; GFX11-NEXT:    v_mov_b32_e32 v3, v0
4190; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4191; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4192; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4193; GFX11-NEXT:    s_waitcnt vmcnt(0)
4194; GFX11-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
4195; GFX11-NEXT:    ;;#ASMSTART
4196; GFX11-NEXT:    ; use v0
4197; GFX11-NEXT:    ;;#ASMEND
4198; GFX11-NEXT:    ;;#ASMSTART
4199; GFX11-NEXT:    ; use v1
4200; GFX11-NEXT:    ;;#ASMEND
4201; GFX11-NEXT:    s_endpgm
4202;
4203; GFX9-PAL-LABEL: large_offset:
4204; GFX9-PAL:       ; %bb.0: ; %bb
4205; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
4206; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
4207; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4208; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
4209; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
4210; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
4211; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
4212; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4213; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4214; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
4215; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
4216; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4217; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
4218; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4219; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
4220; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
4221; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4222; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
4223; GFX9-PAL-NEXT:    ;;#ASMSTART
4224; GFX9-PAL-NEXT:    ; use v0
4225; GFX9-PAL-NEXT:    ;;#ASMEND
4226; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
4227; GFX9-PAL-NEXT:    ;;#ASMSTART
4228; GFX9-PAL-NEXT:    ; use v0
4229; GFX9-PAL-NEXT:    ;;#ASMEND
4230; GFX9-PAL-NEXT:    s_endpgm
4231;
4232; GFX940-LABEL: large_offset:
4233; GFX940:       ; %bb.0: ; %bb
4234; GFX940-NEXT:    v_mov_b32_e32 v0, 0
4235; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4236; GFX940-NEXT:    v_mov_b32_e32 v2, v0
4237; GFX940-NEXT:    v_mov_b32_e32 v3, v0
4238; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
4239; GFX940-NEXT:    s_waitcnt vmcnt(0)
4240; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
4241; GFX940-NEXT:    s_waitcnt vmcnt(0)
4242; GFX940-NEXT:    v_mov_b32_e32 v0, 16
4243; GFX940-NEXT:    ;;#ASMSTART
4244; GFX940-NEXT:    ; use v0
4245; GFX940-NEXT:    ;;#ASMEND
4246; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
4247; GFX940-NEXT:    ;;#ASMSTART
4248; GFX940-NEXT:    ; use v0
4249; GFX940-NEXT:    ;;#ASMEND
4250; GFX940-NEXT:    s_endpgm
4251;
4252; GFX10-PAL-LABEL: large_offset:
4253; GFX10-PAL:       ; %bb.0: ; %bb
4254; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
4255; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
4256; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4257; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4258; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4259; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
4260; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
4261; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4262; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4263; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
4264; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
4265; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
4266; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
4267; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
4268; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
4269; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
4270; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4271; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
4272; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4273; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
4274; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
4275; GFX10-PAL-NEXT:    ;;#ASMSTART
4276; GFX10-PAL-NEXT:    ; use v0
4277; GFX10-PAL-NEXT:    ;;#ASMEND
4278; GFX10-PAL-NEXT:    ;;#ASMSTART
4279; GFX10-PAL-NEXT:    ; use v1
4280; GFX10-PAL-NEXT:    ;;#ASMEND
4281; GFX10-PAL-NEXT:    s_endpgm
4282;
4283; GFX11-PAL-LABEL: large_offset:
4284; GFX11-PAL:       ; %bb.0: ; %bb
4285; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
4286; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4287; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
4288; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
4289; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
4290; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4291; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4292; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4293; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4294; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
4295; GFX11-PAL-NEXT:    ;;#ASMSTART
4296; GFX11-PAL-NEXT:    ; use v0
4297; GFX11-PAL-NEXT:    ;;#ASMEND
4298; GFX11-PAL-NEXT:    ;;#ASMSTART
4299; GFX11-PAL-NEXT:    ; use v1
4300; GFX11-PAL-NEXT:    ;;#ASMEND
4301; GFX11-PAL-NEXT:    s_endpgm
4302bb:
4303  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
4304  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  ; Element 60 of the second alloca is the only slot that is stored/loaded.
4305  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
4306  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
4307  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
  ; Side-effecting asm statements take each alloca's address as an operand.
4308  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
4309  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
4310  ret void
4311}
4312
; Intrinsic declarations; their call sites presumably appear in earlier test
; functions of this file (not visible from this section -- verify).
4313declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
4314declare i32 @llvm.amdgcn.workitem.id.x()
4315