1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
5; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
6; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
7; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
8
; Zero-fills a 64-byte addrspace(5) alloca ([32 x i16]) with llvm.memset from a
; kernel entry point. Every run line expects the memset to be expanded into four
; scratch_store_dwordx4 instructions at offsets 64, 48, 32 and 16.
9define amdgpu_kernel void @zero_init_kernel() {
10; GFX9-LABEL: zero_init_kernel:
11; GFX9:       ; %bb.0:
12; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
13; GFX9-NEXT:    s_mov_b32 s0, 0
14; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
15; GFX9-NEXT:    s_mov_b32 s1, s0
16; GFX9-NEXT:    s_mov_b32 s2, s0
17; GFX9-NEXT:    s_mov_b32 s3, s0
18; GFX9-NEXT:    v_mov_b32_e32 v0, s0
19; GFX9-NEXT:    v_mov_b32_e32 v1, s1
20; GFX9-NEXT:    v_mov_b32_e32 v2, s2
21; GFX9-NEXT:    v_mov_b32_e32 v3, s3
22; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
23; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
28; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
29; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
30; GFX9-NEXT:    s_endpgm
31;
32; GFX10-LABEL: zero_init_kernel:
33; GFX10:       ; %bb.0:
34; GFX10-NEXT:    s_add_u32 s0, s0, s3
35; GFX10-NEXT:    s_addc_u32 s1, s1, 0
36; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
37; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
38; GFX10-NEXT:    s_mov_b32 s0, 0
39; GFX10-NEXT:    s_mov_b32 s1, s0
40; GFX10-NEXT:    s_mov_b32 s2, s0
41; GFX10-NEXT:    s_mov_b32 s3, s0
42; GFX10-NEXT:    v_mov_b32_e32 v0, s0
43; GFX10-NEXT:    v_mov_b32_e32 v1, s1
44; GFX10-NEXT:    v_mov_b32_e32 v2, s2
45; GFX10-NEXT:    v_mov_b32_e32 v3, s3
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
49; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
50; GFX10-NEXT:    s_endpgm
51;
52; GFX9-PAL-LABEL: zero_init_kernel:
53; GFX9-PAL:       ; %bb.0:
54; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
55; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
56; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
57; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
58; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
59; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
61; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
62; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
63; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
64; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
65; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
66; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
67; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
68; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
69; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
70; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
71; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
72; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
73; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
74; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
75; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
76; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
77; GFX9-PAL-NEXT:    s_endpgm
78;
79; GFX940-LABEL: zero_init_kernel:
80; GFX940:       ; %bb.0:
81; GFX940-NEXT:    s_mov_b32 s0, 0
82; GFX940-NEXT:    s_mov_b32 s1, s0
83; GFX940-NEXT:    s_mov_b32 s2, s0
84; GFX940-NEXT:    s_mov_b32 s3, s0
85; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
86; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
87; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
88; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
89; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
90; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
91; GFX940-NEXT:    s_endpgm
92;
93; GFX1010-PAL-LABEL: zero_init_kernel:
94; GFX1010-PAL:       ; %bb.0:
95; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
96; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
97; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
98; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
100; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
101; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
102; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
103; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
104; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
105; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
106; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
107; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
108; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
109; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
110; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
111; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
112; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
113; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
114; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
115; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
116; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
117; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
118; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
119; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
120; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
121; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
122; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
123; GFX1010-PAL-NEXT:    s_endpgm
124;
125; GFX1030-PAL-LABEL: zero_init_kernel:
126; GFX1030-PAL:       ; %bb.0:
127; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
128; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
129; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
130; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
132; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
133; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
134; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
135; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
136; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
137; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
138; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
139; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
140; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
141; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
142; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
143; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
144; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
145; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
146; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
147; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
148; GFX1030-PAL-NEXT:    s_endpgm
149  %alloca = alloca [32 x i16], align 2, addrspace(5)
150  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
151  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
152  ret void
153}
154
; Zero-fills a 64-byte addrspace(5) alloca ([32 x i16]) with llvm.memset inside
; a callable (non-kernel) function. The expected output is four
; scratch_store_dwordx4 instructions addressed relative to s32 (offsets 48, 32,
; 16 and 0) followed by a return through s_setpc_b64.
155define void @zero_init_foo() {
156; GFX9-LABEL: zero_init_foo:
157; GFX9:       ; %bb.0:
158; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159; GFX9-NEXT:    s_mov_b32 s0, 0
160; GFX9-NEXT:    s_mov_b32 s1, s0
161; GFX9-NEXT:    s_mov_b32 s2, s0
162; GFX9-NEXT:    s_mov_b32 s3, s0
163; GFX9-NEXT:    v_mov_b32_e32 v0, s0
164; GFX9-NEXT:    v_mov_b32_e32 v1, s1
165; GFX9-NEXT:    v_mov_b32_e32 v2, s2
166; GFX9-NEXT:    v_mov_b32_e32 v3, s3
167; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
168; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
169; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
170; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
171; GFX9-NEXT:    s_waitcnt vmcnt(0)
172; GFX9-NEXT:    s_setpc_b64 s[30:31]
173;
174; GFX10-LABEL: zero_init_foo:
175; GFX10:       ; %bb.0:
176; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
178; GFX10-NEXT:    s_mov_b32 s0, 0
179; GFX10-NEXT:    s_mov_b32 s1, s0
180; GFX10-NEXT:    s_mov_b32 s2, s0
181; GFX10-NEXT:    s_mov_b32 s3, s0
182; GFX10-NEXT:    v_mov_b32_e32 v0, s0
183; GFX10-NEXT:    v_mov_b32_e32 v1, s1
184; GFX10-NEXT:    v_mov_b32_e32 v2, s2
185; GFX10-NEXT:    v_mov_b32_e32 v3, s3
186; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
187; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
188; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
189; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
190; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
191; GFX10-NEXT:    s_setpc_b64 s[30:31]
192;
193; GFX9-PAL-LABEL: zero_init_foo:
194; GFX9-PAL:       ; %bb.0:
195; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
197; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
198; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
199; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
200; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
201; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
202; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
203; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
204; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
205; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
206; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
207; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
208; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
209; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
210;
211; GFX940-LABEL: zero_init_foo:
212; GFX940:       ; %bb.0:
213; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX940-NEXT:    s_mov_b32 s0, 0
215; GFX940-NEXT:    s_mov_b32 s1, s0
216; GFX940-NEXT:    s_mov_b32 s2, s0
217; GFX940-NEXT:    s_mov_b32 s3, s0
218; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
219; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
220; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
221; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
222; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
223; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
224; GFX940-NEXT:    s_waitcnt vmcnt(0)
225; GFX940-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX10-PAL-LABEL: zero_init_foo:
228; GFX10-PAL:       ; %bb.0:
229; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
231; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
232; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
233; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
234; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
235; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
236; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
237; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
238; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
239; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
240; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
241; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
242; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
243; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
244; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
260  %alloca = alloca [32 x i16], align 2, addrspace(5)
261  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
262  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
263  ret void
264}
265
; Volatile store of 15 to element %idx of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (%idx & 15). %idx is a kernel
; argument, so the expected code forms both addresses with scalar instructions
; and passes them as the SGPR address operand of scratch_store_dword and
; scratch_load_dword.
266define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
267; GFX9-LABEL: store_load_sindex_kernel:
268; GFX9:       ; %bb.0: ; %bb
269; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
270; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
271; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
272; GFX9-NEXT:    v_mov_b32_e32 v0, 15
273; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
275; GFX9-NEXT:    s_and_b32 s0, s0, 15
276; GFX9-NEXT:    s_add_i32 s1, s1, 4
277; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
278; GFX9-NEXT:    scratch_store_dword off, v0, s1
279; GFX9-NEXT:    s_waitcnt vmcnt(0)
280; GFX9-NEXT:    s_add_i32 s0, s0, 4
281; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
282; GFX9-NEXT:    s_waitcnt vmcnt(0)
283; GFX9-NEXT:    s_endpgm
284;
285; GFX10-LABEL: store_load_sindex_kernel:
286; GFX10:       ; %bb.0: ; %bb
287; GFX10-NEXT:    s_add_u32 s2, s2, s5
288; GFX10-NEXT:    s_addc_u32 s3, s3, 0
289; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
290; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
291; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
292; GFX10-NEXT:    v_mov_b32_e32 v0, 15
293; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX10-NEXT:    s_and_b32 s1, s0, 15
295; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
296; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
297; GFX10-NEXT:    s_add_i32 s0, s0, 4
298; GFX10-NEXT:    s_add_i32 s1, s1, 4
299; GFX10-NEXT:    scratch_store_dword off, v0, s0
300; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
301; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
302; GFX10-NEXT:    s_waitcnt vmcnt(0)
303; GFX10-NEXT:    s_endpgm
304;
305; GFX9-PAL-LABEL: store_load_sindex_kernel:
306; GFX9-PAL:       ; %bb.0: ; %bb
307; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
308; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
309; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
310; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
311; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
312; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
314; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
315; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
316; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
317; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
318; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
319; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
320; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
321; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
322; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
323; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
324; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
325; GFX9-PAL-NEXT:    s_endpgm
326;
327; GFX940-LABEL: store_load_sindex_kernel:
328; GFX940:       ; %bb.0: ; %bb
329; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
330; GFX940-NEXT:    v_mov_b32_e32 v0, 15
331; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
333; GFX940-NEXT:    s_and_b32 s0, s0, 15
334; GFX940-NEXT:    s_add_i32 s1, s1, 4
335; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
336; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
337; GFX940-NEXT:    s_waitcnt vmcnt(0)
338; GFX940-NEXT:    s_add_i32 s0, s0, 4
339; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
340; GFX940-NEXT:    s_waitcnt vmcnt(0)
341; GFX940-NEXT:    s_endpgm
342;
343; GFX10-PAL-LABEL: store_load_sindex_kernel:
344; GFX10-PAL:       ; %bb.0: ; %bb
345; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
346; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
347; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
348; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
349; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
350; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
351; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
352; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
353; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
354; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
355; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
356; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
358; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
359; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
360; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
361; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
362; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
363; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
364; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
365; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
366; GFX10-PAL-NEXT:    s_endpgm
382bb:
383  %i = alloca [32 x float], align 4, addrspace(5)
384  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
385  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
386  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
387  store volatile i32 15, i32 addrspace(5)* %i8, align 4
388  %i9 = and i32 %idx, 15
389  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
390  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
391  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
392  ret void
393}
394
; Same volatile store to element %idx / volatile load from element (%idx & 15)
; of a [32 x float] addrspace(5) alloca, but in an amdgpu_ps function that
; receives %idx as an inreg argument. The expected code computes both addresses
; with scalar instructions and uses an SGPR as the scratch address operand.
395define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
396; GFX9-LABEL: store_load_sindex_foo:
397; GFX9:       ; %bb.0: ; %bb
398; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
399; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
400; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
401; GFX9-NEXT:    s_add_i32 s0, s0, 4
402; GFX9-NEXT:    v_mov_b32_e32 v0, 15
403; GFX9-NEXT:    scratch_store_dword off, v0, s0
404; GFX9-NEXT:    s_waitcnt vmcnt(0)
405; GFX9-NEXT:    s_and_b32 s0, s2, 15
406; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
407; GFX9-NEXT:    s_add_i32 s0, s0, 4
408; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
409; GFX9-NEXT:    s_waitcnt vmcnt(0)
410; GFX9-NEXT:    s_endpgm
411;
412; GFX10-LABEL: store_load_sindex_foo:
413; GFX10:       ; %bb.0: ; %bb
414; GFX10-NEXT:    s_add_u32 s0, s0, s3
415; GFX10-NEXT:    s_addc_u32 s1, s1, 0
416; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
417; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
418; GFX10-NEXT:    v_mov_b32_e32 v0, 15
419; GFX10-NEXT:    s_and_b32 s0, s2, 15
420; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
421; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
422; GFX10-NEXT:    s_add_i32 s1, s1, 4
423; GFX10-NEXT:    s_add_i32 s0, s0, 4
424; GFX10-NEXT:    scratch_store_dword off, v0, s1
425; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
426; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
427; GFX10-NEXT:    s_waitcnt vmcnt(0)
428; GFX10-NEXT:    s_endpgm
429;
430; GFX9-PAL-LABEL: store_load_sindex_foo:
431; GFX9-PAL:       ; %bb.0: ; %bb
432; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
433; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
434; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
435; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
436; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
438; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
439; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
440; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
441; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
442; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
443; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
444; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
445; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
446; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
447; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
448; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
449; GFX9-PAL-NEXT:    s_endpgm
450;
451; GFX940-LABEL: store_load_sindex_foo:
452; GFX940:       ; %bb.0: ; %bb
453; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
454; GFX940-NEXT:    s_and_b32 s0, s0, 15
455; GFX940-NEXT:    s_add_i32 s1, s1, 4
456; GFX940-NEXT:    v_mov_b32_e32 v0, 15
457; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
458; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
459; GFX940-NEXT:    s_waitcnt vmcnt(0)
460; GFX940-NEXT:    s_add_i32 s0, s0, 4
461; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
462; GFX940-NEXT:    s_waitcnt vmcnt(0)
463; GFX940-NEXT:    s_endpgm
464;
465; GFX10-PAL-LABEL: store_load_sindex_foo:
466; GFX10-PAL:       ; %bb.0: ; %bb
467; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
468; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
469; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
470; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
472; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
473; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
474; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
475; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
476; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
477; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
478; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
479; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
480; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
481; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
482; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
483; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
484; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
485; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
486; GFX10-PAL-NEXT:    s_endpgm
500bb:
501  %i = alloca [32 x float], align 4, addrspace(5)
502  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
503  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
504  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
505  store volatile i32 15, i32 addrspace(5)* %i8, align 4
506  %i9 = and i32 %idx, 15
507  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
508  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
509  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
510  ret void
511}
512
; Volatile store of 15 to element workitem.id.x of a [32 x float] addrspace(5)
; alloca, followed by a volatile load from element (31 - workitem.id.x). The
; index is per-lane, so the expected code uses a VGPR address operand; the
; load's constant part (31 * 4 = 124 bytes) appears as the instruction's
; immediate offset:124.
513define amdgpu_kernel void @store_load_vindex_kernel() {
514; GFX9-LABEL: store_load_vindex_kernel:
515; GFX9:       ; %bb.0: ; %bb
516; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
517; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
518; GFX9-NEXT:    v_mov_b32_e32 v1, 4
519; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
520; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
521; GFX9-NEXT:    v_mov_b32_e32 v3, 15
522; GFX9-NEXT:    scratch_store_dword v2, v3, off
523; GFX9-NEXT:    s_waitcnt vmcnt(0)
524; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
525; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
526; GFX9-NEXT:    s_waitcnt vmcnt(0)
527; GFX9-NEXT:    s_endpgm
528;
529; GFX10-LABEL: store_load_vindex_kernel:
530; GFX10:       ; %bb.0: ; %bb
531; GFX10-NEXT:    s_add_u32 s0, s0, s3
532; GFX10-NEXT:    s_addc_u32 s1, s1, 0
533; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
534; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
535; GFX10-NEXT:    v_mov_b32_e32 v1, 4
536; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
537; GFX10-NEXT:    v_mov_b32_e32 v3, 15
538; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
539; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
540; GFX10-NEXT:    scratch_store_dword v2, v3, off
541; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
542; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
543; GFX10-NEXT:    s_waitcnt vmcnt(0)
544; GFX10-NEXT:    s_endpgm
545;
546; GFX9-PAL-LABEL: store_load_vindex_kernel:
547; GFX9-PAL:       ; %bb.0: ; %bb
548; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
549; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
550; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
551; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
552; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
553; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
554; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
555; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
557; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
558; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
559; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
560; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
561; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
562; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
563; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
564; GFX9-PAL-NEXT:    s_endpgm
565;
566; GFX940-LABEL: store_load_vindex_kernel:
567; GFX940:       ; %bb.0: ; %bb
568; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
569; GFX940-NEXT:    v_mov_b32_e32 v1, 15
570; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
571; GFX940-NEXT:    s_waitcnt vmcnt(0)
572; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
573; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
574; GFX940-NEXT:    s_waitcnt vmcnt(0)
575; GFX940-NEXT:    s_endpgm
576;
577; GFX10-PAL-LABEL: store_load_vindex_kernel:
578; GFX10-PAL:       ; %bb.0: ; %bb
579; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
580; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
581; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
582; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
584; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
585; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
586; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
587; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
588; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
589; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
590; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
591; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
592; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
593; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
594; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
595; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
596; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
597; GFX10-PAL-NEXT:    s_endpgm
608bb:
609  %i = alloca [32 x float], align 4, addrspace(5)
610  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
611  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
612  %i3 = zext i32 %i2 to i64
613  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
614  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
615  store volatile i32 15, i32 addrspace(5)* %i8, align 4
616  %i9 = sub nsw i32 31, %i2
617  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
618  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
619  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
620  ret void
621}
622
; Volatile store of 15 to element %idx of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (%idx & 15), in a callable
; (non-kernel) function. The checks read the index from v0 and address the
; alloca relative to s32: folded into a VGPR address on GFX9/GFX10, used
; directly as the SGPR operand next to the VGPR offset on GFX940.
623define void @store_load_vindex_foo(i32 %idx) {
624; GFX9-LABEL: store_load_vindex_foo:
625; GFX9:       ; %bb.0: ; %bb
626; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627; GFX9-NEXT:    v_mov_b32_e32 v1, s32
628; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
629; GFX9-NEXT:    v_mov_b32_e32 v3, 15
630; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
631; GFX9-NEXT:    scratch_store_dword v2, v3, off
632; GFX9-NEXT:    s_waitcnt vmcnt(0)
633; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
634; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
635; GFX9-NEXT:    s_waitcnt vmcnt(0)
636; GFX9-NEXT:    s_setpc_b64 s[30:31]
637;
638; GFX10-LABEL: store_load_vindex_foo:
639; GFX10:       ; %bb.0: ; %bb
640; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
642; GFX10-NEXT:    v_mov_b32_e32 v1, s32
643; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
644; GFX10-NEXT:    v_mov_b32_e32 v3, 15
645; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
646; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
647; GFX10-NEXT:    scratch_store_dword v0, v3, off
648; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
649; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
650; GFX10-NEXT:    s_waitcnt vmcnt(0)
651; GFX10-NEXT:    s_setpc_b64 s[30:31]
652;
653; GFX9-PAL-LABEL: store_load_vindex_foo:
654; GFX9-PAL:       ; %bb.0: ; %bb
655; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
657; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
658; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
659; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
660; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
661; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
662; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
663; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
664; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
665; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
666;
667; GFX940-LABEL: store_load_vindex_foo:
668; GFX940:       ; %bb.0: ; %bb
669; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
671; GFX940-NEXT:    v_mov_b32_e32 v2, 15
672; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
673; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
674; GFX940-NEXT:    s_waitcnt vmcnt(0)
675; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
676; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
677; GFX940-NEXT:    s_waitcnt vmcnt(0)
678; GFX940-NEXT:    s_setpc_b64 s[30:31]
679;
680; GFX10-PAL-LABEL: store_load_vindex_foo:
681; GFX10-PAL:       ; %bb.0: ; %bb
682; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
684; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s32
685; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
686; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
687; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
688; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
689; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
690; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
691; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
692; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
693; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
706bb:
707  %i = alloca [32 x float], align 4, addrspace(5)
708  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
709  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
710  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
711  store volatile i32 15, i32 addrspace(5)* %i8, align 4
712  %i9 = and i32 %idx, 15
713  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
714  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
715  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
716  ret void
717}
718
; Stores the float constant 10.0 (bit pattern 0x41200000) through a private
; (addrspace(5)) pointer received as a function argument, one float element
; past %arg. The expected output for every run configuration folds that 4-byte
; displacement into the immediate offset field of the scratch store.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Element 1 of a float sequence starts 4 bytes past %arg.
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}
772
; Kernel that zero-fills a 64-byte private buffer (%alloca) declared after a
; 256-byte private buffer (%padding). In the expected output the four 16-byte
; zero stores are addressed purely through the instruction's immediate offset
; field (272, 288, 304, 320), with no separate address computation.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT:    s_endpgm
  ; 64 x i32 = 256 bytes of private memory declared ahead of the buffer that
  ; gets zeroed.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes; this is the buffer the memset below clears.
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding with an undef element index. Presumably this
  ; exists only to keep %padding from being removed as unused.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Zero all 64 bytes of %alloca; the expected output uses four dwordx4 stores.
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
936
; Non-kernel (callable function) counterpart of the small-offset zero-init
; test: zero-fills a 64-byte private buffer (%alloca) declared after a 256-byte
; private buffer (%padding). In the expected output the four 16-byte zero
; stores are addressed relative to s32 with immediate offsets 256, 272, 288
; and 304.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 = 256 bytes of private memory declared ahead of the buffer that
  ; gets zeroed.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes; this is the buffer the memset below clears.
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding with an undef element index. Presumably this
  ; exists only to keep %padding from being removed as unused.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Zero all 64 bytes of %alloca; the expected output uses four dwordx4 stores.
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
1062
; Kernel that does a volatile store of 15 to element %idx of a 32-float private
; array, then a volatile load of element (%idx & 15) of the same array. %idx is
; a kernel argument, and the expected output performs the index arithmetic with
; scalar instructions (s_lshl_b32 / s_and_b32) and adds 0x104 (260) with
; s_addk_i32. The 0x104 is presumably the offset of %i behind the 256-byte
; %padding buffer.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  ; 64 x i32 = 256 bytes of private memory declared ahead of the indexed array.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; The 32-float private array that is stored to and loaded from below.
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding with an undef element index. Presumably this
  ; exists only to keep %padding from being removed as unused.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced anywhere else in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to element %idx, reinterpreting the float slot as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back element (%idx & 15); the loaded value is intentionally unused.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1218
; amdgpu_ps entry point that does a volatile store of 15 to element %idx of a
; 32-float private array, then a volatile load of element (%idx & 15) of the
; same array. %idx is passed `inreg`, and the expected output performs the index
; arithmetic with scalar instructions (s_lshl_b32 / s_and_b32) and adds 0x104
; (260) with s_addk_i32. The 0x104 is presumably the offset of %i behind the
; 256-byte %padding buffer.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  ; 64 x i32 = 256 bytes of private memory declared ahead of the indexed array.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; The 32-float private array that is stored to and loaded from below.
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding with an undef element index. Presumably this
  ; exists only to keep %padding from being removed as unused.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not referenced anywhere else in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to element %idx, reinterpreting the float slot as i32.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back element (%idx & 15); the loaded value is intentionally unused.
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
1364
; Kernel codegen test: scratch store and load through a per-lane index, with
; the accessed array placed after a small padding alloca.
; %padding is [64 x i32] and is read once by a volatile load at an undef index.
; %i is [32 x float]; a volatile store of 15 goes to element workitem.id.x and
; a volatile load then reads element (31 - workitem.id.x).
; The expected output addresses %i at 0x104 (260) and folds the constant 31*4
; into the final load as offset 124.
1365define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
1366; GFX9-LABEL: store_load_vindex_small_offset_kernel:
1367; GFX9:       ; %bb.0: ; %bb
1368; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1369; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1370; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1371; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1372; GFX9-NEXT:    s_waitcnt vmcnt(0)
1373; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1374; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
1375; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1376; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1377; GFX9-NEXT:    scratch_store_dword v2, v3, off
1378; GFX9-NEXT:    s_waitcnt vmcnt(0)
1379; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1380; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1381; GFX9-NEXT:    s_waitcnt vmcnt(0)
1382; GFX9-NEXT:    s_endpgm
1383;
1384; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1385; GFX10:       ; %bb.0: ; %bb
1386; GFX10-NEXT:    s_add_u32 s0, s0, s3
1387; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1388; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1389; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1390; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
1391; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1392; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1393; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1394; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1395; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1396; GFX10-NEXT:    s_waitcnt vmcnt(0)
1397; GFX10-NEXT:    scratch_store_dword v2, v3, off
1398; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1399; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1400; GFX10-NEXT:    s_waitcnt vmcnt(0)
1401; GFX10-NEXT:    s_endpgm
1402;
1403; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1404; GFX9-PAL:       ; %bb.0: ; %bb
1405; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1406; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1407; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1408; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1409; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1410; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1411; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1412; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1413; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1414; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1415; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1416; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1417; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1418; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1419; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1420; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1421; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1422; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1423; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1424; GFX9-PAL-NEXT:    s_endpgm
1425;
1426; GFX940-LABEL: store_load_vindex_small_offset_kernel:
1427; GFX940:       ; %bb.0: ; %bb
1428; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
1429; GFX940-NEXT:    s_waitcnt vmcnt(0)
1430; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1431; GFX940-NEXT:    v_mov_b32_e32 v1, 15
1432; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
1433; GFX940-NEXT:    s_waitcnt vmcnt(0)
1434; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1435; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
1436; GFX940-NEXT:    s_waitcnt vmcnt(0)
1437; GFX940-NEXT:    s_endpgm
1438;
1439; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
1440; GFX1010-PAL:       ; %bb.0: ; %bb
1441; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1442; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1443; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1444; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1446; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1447; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1448; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1449; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1450; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1451; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1452; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
1453; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1454; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1455; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1456; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
1457; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1458; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
1459; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1460; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1461; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1462; GFX1010-PAL-NEXT:    s_endpgm
1463;
1464; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
1465; GFX1030-PAL:       ; %bb.0: ; %bb
1466; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1467; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1468; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1469; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1470; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1471; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1472; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1473; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1474; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1475; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1476; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1477; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
1478; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1479; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1480; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1481; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1482; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
1483; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1484; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1485; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1486; GFX1030-PAL-NEXT:    s_endpgm
1487bb:
  ; Padding object: one volatile load at an undef index.
1488  %padding = alloca [64 x i32], align 4, addrspace(5)
1489  %i = alloca [32 x float], align 4, addrspace(5)
1490  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1491  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 and %i3 below have no uses in this function.
1492  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1493  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1494  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[workitem.id.x].
1495  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1496  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1497  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - workitem.id.x].
1498  %i9 = sub nsw i32 31, %i2
1499  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1500  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1501  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1502  ret void
1503}
1504
; Non-kernel function codegen test: scratch store and load through an index
; passed as the i32 argument %idx, with the accessed array placed after a
; 256-byte padding alloca ([64 x i32]).
; A volatile store of 15 goes to %i[%idx]; a volatile load then reads
; %i[%idx & 15]. The expected output addresses %i at s32 + 0x100 (256).
; Only prefixes that a run line at the top of the file hands to FileCheck have
; assertion blocks here.
1505define void @store_load_vindex_small_offset_foo(i32 %idx) {
1506; GFX9-LABEL: store_load_vindex_small_offset_foo:
1507; GFX9:       ; %bb.0: ; %bb
1508; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1509; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
1510; GFX9-NEXT:    s_waitcnt vmcnt(0)
1511; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1512; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1513; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1514; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1515; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
1516; GFX9-NEXT:    scratch_store_dword v2, v3, off
1517; GFX9-NEXT:    s_waitcnt vmcnt(0)
1518; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1519; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
1520; GFX9-NEXT:    s_waitcnt vmcnt(0)
1521; GFX9-NEXT:    s_setpc_b64 s[30:31]
1522;
1523; GFX10-LABEL: store_load_vindex_small_offset_foo:
1524; GFX10:       ; %bb.0: ; %bb
1525; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1526; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1527; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1528; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
1529; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
1530; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1531; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1532; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
1533; GFX10-NEXT:    scratch_load_dword v2, off, s32 glc dlc
1534; GFX10-NEXT:    s_waitcnt vmcnt(0)
1535; GFX10-NEXT:    scratch_store_dword v0, v3, off
1536; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1537; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
1538; GFX10-NEXT:    s_waitcnt vmcnt(0)
1539; GFX10-NEXT:    s_setpc_b64 s[30:31]
1540;
1541; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1542; GFX9-PAL:       ; %bb.0: ; %bb
1543; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1544; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
1545; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1546; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1547; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1548; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1549; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1550; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
1551; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1552; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1553; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1554; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
1555; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1556; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1557;
1558; GFX940-LABEL: store_load_vindex_small_offset_foo:
1559; GFX940:       ; %bb.0: ; %bb
1560; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1561; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
1562; GFX940-NEXT:    s_waitcnt vmcnt(0)
1563; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1564; GFX940-NEXT:    v_mov_b32_e32 v2, 15
1565; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
1566; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
1567; GFX940-NEXT:    s_waitcnt vmcnt(0)
1568; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1569; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
1570; GFX940-NEXT:    s_waitcnt vmcnt(0)
1571; GFX940-NEXT:    s_setpc_b64 s[30:31]
1572;
1573; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1574; GFX10-PAL:       ; %bb.0: ; %bb
1575; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1576; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1577; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1578; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
1579; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
1580; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1581; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1582; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
1583; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 glc dlc
1584; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1585; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
1586; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1587; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
1588; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1589; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1604bb:
  ; Padding object: one volatile load at an undef index.
1605  %padding = alloca [64 x i32], align 4, addrspace(5)
1606  %i = alloca [32 x float], align 4, addrspace(5)
1607  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1608  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no uses in this function.
1609  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
1610  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1611  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1612  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
1613  %i9 = and i32 %idx, 15
1614  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1615  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1616  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1617  ret void
1618}
1619
; Kernel codegen test: zero-fill a 64-byte scratch object that is placed after
; a large padding alloca.
; %padding is [4096 x i32] (16384 bytes) and is read once by a volatile load at
; an undef index; %alloca is [32 x i16] and is cleared by a 64-byte llvm.memset
; of zero.
; Each assertion block expects four scratch_store_dwordx4 instructions whose
; SGPR base is materialized as 0x4010, at offsets 0, 16, 32 and 48.
1620define amdgpu_kernel void @zero_init_large_offset_kernel() {
1621; GFX9-LABEL: zero_init_large_offset_kernel:
1622; GFX9:       ; %bb.0:
1623; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1624; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1625; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1626; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1627; GFX9-NEXT:    s_waitcnt vmcnt(0)
1628; GFX9-NEXT:    s_mov_b32 s0, 0
1629; GFX9-NEXT:    s_mov_b32 s1, s0
1630; GFX9-NEXT:    s_mov_b32 s2, s0
1631; GFX9-NEXT:    s_mov_b32 s3, s0
1632; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1633; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1634; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1635; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1636; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1637; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1638; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1639; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1640; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1641; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1642; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1643; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1644; GFX9-NEXT:    s_endpgm
1645;
1646; GFX10-LABEL: zero_init_large_offset_kernel:
1647; GFX10:       ; %bb.0:
1648; GFX10-NEXT:    s_add_u32 s0, s0, s3
1649; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1650; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1651; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1652; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1653; GFX10-NEXT:    s_waitcnt vmcnt(0)
1654; GFX10-NEXT:    s_mov_b32 s0, 0
1655; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1656; GFX10-NEXT:    s_mov_b32 s1, s0
1657; GFX10-NEXT:    s_mov_b32 s2, s0
1658; GFX10-NEXT:    s_mov_b32 s3, s0
1659; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1660; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1661; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1662; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1663; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1664; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1665; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1666; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1667; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1668; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1669; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1670; GFX10-NEXT:    s_endpgm
1671;
1672; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1673; GFX9-PAL:       ; %bb.0:
1674; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1675; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1676; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1677; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1678; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1679; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1680; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1681; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1682; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1683; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1684; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1685; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1686; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1687; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1688; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1689; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1690; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1691; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1692; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1693; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1694; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1695; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1696; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1697; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1698; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1699; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1700; GFX9-PAL-NEXT:    s_endpgm
1701;
1702; GFX940-LABEL: zero_init_large_offset_kernel:
1703; GFX940:       ; %bb.0:
1704; GFX940-NEXT:    scratch_load_dword v0, off, off offset:16 sc0 sc1
1705; GFX940-NEXT:    s_waitcnt vmcnt(0)
1706; GFX940-NEXT:    s_mov_b32 s0, 0
1707; GFX940-NEXT:    s_mov_b32 s1, s0
1708; GFX940-NEXT:    s_mov_b32 s2, s0
1709; GFX940-NEXT:    s_mov_b32 s3, s0
1710; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1711; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1712; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1713; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1714; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1715; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1716; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1717; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1718; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1719; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1720; GFX940-NEXT:    s_endpgm
1721;
1722; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
1723; GFX1010-PAL:       ; %bb.0:
1724; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1725; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1726; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1727; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1729; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1730; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1731; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1732; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1733; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1734; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1735; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
1736; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1737; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1738; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1739; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1740; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1741; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1742; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1743; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1744; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1745; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1746; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1747; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1748; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1749; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1750; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1751; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1752; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1753; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1754; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1755; GFX1010-PAL-NEXT:    s_endpgm
1756;
1757; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
1758; GFX1030-PAL:       ; %bb.0:
1759; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1760; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1761; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1762; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1763; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1764; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1765; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1766; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1767; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1768; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1769; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1770; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1771; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1772; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1773; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1774; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1775; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1776; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1777; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1778; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1779; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1780; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1781; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1782; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1783; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1784; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1785; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1786; GFX1030-PAL-NEXT:    s_endpgm
  ; Padding object: one volatile load at an undef index.
1787  %padding = alloca [4096 x i32], align 4, addrspace(5)
1788  %alloca = alloca [32 x i16], align 2, addrspace(5)
1789  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1790  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca through an i8 pointer (non-volatile memset).
1791  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1792  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1793  ret void
1794}
1795
; Non-kernel function codegen test: zero-fill a 64-byte scratch object that is
; placed after a large padding alloca.
; %padding is [4096 x i32] (16384 bytes) and is read once by a volatile load at
; an undef index; %alloca is [32 x i16] and is cleared by a 64-byte llvm.memset
; of zero.
; Each assertion block expects four scratch_store_dwordx4 instructions whose
; SGPR base is computed as s32 + 0x4010, at offsets 0, 16, 32 and 48.
1796define void @zero_init_large_offset_foo() {
1797; GFX9-LABEL: zero_init_large_offset_foo:
1798; GFX9:       ; %bb.0:
1799; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1800; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
1801; GFX9-NEXT:    s_waitcnt vmcnt(0)
1802; GFX9-NEXT:    s_mov_b32 s0, 0
1803; GFX9-NEXT:    s_mov_b32 s1, s0
1804; GFX9-NEXT:    s_mov_b32 s2, s0
1805; GFX9-NEXT:    s_mov_b32 s3, s0
1806; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1807; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1808; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1809; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1810; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1811; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1812; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1813; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1814; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1815; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1816; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1817; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1818; GFX9-NEXT:    s_waitcnt vmcnt(0)
1819; GFX9-NEXT:    s_setpc_b64 s[30:31]
1820;
1821; GFX10-LABEL: zero_init_large_offset_foo:
1822; GFX10:       ; %bb.0:
1823; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1824; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1825; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1826; GFX10-NEXT:    s_waitcnt vmcnt(0)
1827; GFX10-NEXT:    s_mov_b32 s0, 0
1828; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1829; GFX10-NEXT:    s_mov_b32 s1, s0
1830; GFX10-NEXT:    s_mov_b32 s2, s0
1831; GFX10-NEXT:    s_mov_b32 s3, s0
1832; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1833; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1834; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1835; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1836; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1837; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1838; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1839; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1840; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1841; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1842; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1843; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1844; GFX10-NEXT:    s_setpc_b64 s[30:31]
1845;
1846; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1847; GFX9-PAL:       ; %bb.0:
1848; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1849; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
1850; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1851; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1852; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1853; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1854; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1855; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1856; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1857; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1858; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1859; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1860; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1861; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1862; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1863; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1864; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1865; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1866; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1867; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1868; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1869;
1870; GFX940-LABEL: zero_init_large_offset_foo:
1871; GFX940:       ; %bb.0:
1872; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1873; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:16 sc0 sc1
1874; GFX940-NEXT:    s_waitcnt vmcnt(0)
1875; GFX940-NEXT:    s_mov_b32 s0, 0
1876; GFX940-NEXT:    s_mov_b32 s1, s0
1877; GFX940-NEXT:    s_mov_b32 s2, s0
1878; GFX940-NEXT:    s_mov_b32 s3, s0
1879; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1880; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1881; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1882; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1883; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1884; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1885; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1886; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1887; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1888; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1889; GFX940-NEXT:    s_waitcnt vmcnt(0)
1890; GFX940-NEXT:    s_setpc_b64 s[30:31]
1891;
1892; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
1893; GFX1010-PAL:       ; %bb.0:
1894; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1895; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1896; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1897; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1898; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1899; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1900; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1901; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1902; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1903; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1904; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1905; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1906; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1907; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1908; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1909; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1910; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1911; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1912; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1913; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1914; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1915; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1916; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1917; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1918; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
1919;
1920; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
1921; GFX1030-PAL:       ; %bb.0:
1922; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1923; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1924; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1925; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1926; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1927; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1928; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1929; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1930; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1931; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1932; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1933; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1934; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1935; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1936; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1937; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1938; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1939; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1940; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1941; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1942; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1943; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Padding object: one volatile load at an undef index.
1944  %padding = alloca [4096 x i32], align 4, addrspace(5)
1945  %alloca = alloca [32 x i16], align 2, addrspace(5)
1946  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1947  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca through an i8 pointer (non-volatile memset).
1948  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1949  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1950  ret void
1951}
1952
1953define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1954; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1955; GFX9:       ; %bb.0: ; %bb
1956; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1957; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1958; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1959; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1960; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1961; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1962; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1963; GFX9-NEXT:    s_and_b32 s0, s0, 15
1964; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1965; GFX9-NEXT:    s_addk_i32 s1, 0x4004
1966; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1967; GFX9-NEXT:    scratch_store_dword off, v0, s1
1968; GFX9-NEXT:    s_waitcnt vmcnt(0)
1969; GFX9-NEXT:    s_addk_i32 s0, 0x4004
1970; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1971; GFX9-NEXT:    s_waitcnt vmcnt(0)
1972; GFX9-NEXT:    s_endpgm
1973;
1974; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1975; GFX10:       ; %bb.0: ; %bb
1976; GFX10-NEXT:    s_add_u32 s2, s2, s5
1977; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1978; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1979; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1980; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1981; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1982; GFX10-NEXT:    s_waitcnt vmcnt(0)
1983; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1984; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX10-NEXT:    s_and_b32 s1, s0, 15
1986; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1987; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1988; GFX10-NEXT:    s_addk_i32 s0, 0x4004
1989; GFX10-NEXT:    s_addk_i32 s1, 0x4004
1990; GFX10-NEXT:    scratch_store_dword off, v0, s0
1991; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1992; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1993; GFX10-NEXT:    s_waitcnt vmcnt(0)
1994; GFX10-NEXT:    s_endpgm
1995;
1996; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1997; GFX9-PAL:       ; %bb.0: ; %bb
1998; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1999; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2000; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2001; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2002; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2003; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2004; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2005; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2006; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2007; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2008; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2009; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2010; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2011; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2012; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2013; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2014; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2015; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2016; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2017; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2018; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2019; GFX9-PAL-NEXT:    s_endpgm
2020;
2021; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2022; GFX940:       ; %bb.0: ; %bb
2023; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
2024; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2025; GFX940-NEXT:    s_waitcnt vmcnt(0)
2026; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2027; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2029; GFX940-NEXT:    s_and_b32 s0, s0, 15
2030; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2031; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2032; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2033; GFX940-NEXT:    s_waitcnt vmcnt(0)
2034; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2035; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2036; GFX940-NEXT:    s_waitcnt vmcnt(0)
2037; GFX940-NEXT:    s_endpgm
2038;
2039; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2040; GFX1010-PAL:       ; %bb.0: ; %bb
2041; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
2042; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
2043; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2044; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2045; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2046; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
2047; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
2048; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2049; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2050; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2051; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2052; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2053; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2054; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2055; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2056; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2057; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2058; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2059; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2060; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2061; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2062; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2063; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2064; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2065; GFX1010-PAL-NEXT:    s_endpgm
2066;
2067; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2068; GFX1030-PAL:       ; %bb.0: ; %bb
2069; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
2070; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
2071; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2072; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2074; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
2075; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
2076; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2077; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2078; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2079; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2080; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2081; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2082; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2083; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2084; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2085; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2086; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2087; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2088; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2089; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2090; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2091; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2092; GFX1030-PAL-NEXT:    s_endpgm
2093bb:
2094  %padding = alloca [4096 x i32], align 4, addrspace(5)
2095  %i = alloca [32 x float], align 4, addrspace(5)
2096  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2097  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
2098  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2099  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2100  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2101  store volatile i32 15, i32 addrspace(5)* %i8, align 4
2102  %i9 = and i32 %idx, 15
2103  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2104  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2105  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2106  ret void
2107}
2108
; Store/load through a dynamically indexed 32 x float array that is allocated
; after a 4096 x i32 padding array, both in addrspace(5). The index is the
; `inreg` argument %idx; the expected output below computes both addresses with
; scalar (s_*) instructions and adds 0x4004 to the scaled index.
2109define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
2110; GFX9-LABEL: store_load_sindex_large_offset_foo:
2111; GFX9:       ; %bb.0: ; %bb
2112; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2113; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2114; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2115; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2116; GFX9-NEXT:    s_waitcnt vmcnt(0)
2117; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
2118; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2119; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2120; GFX9-NEXT:    scratch_store_dword off, v0, s0
2121; GFX9-NEXT:    s_waitcnt vmcnt(0)
2122; GFX9-NEXT:    s_and_b32 s0, s2, 15
2123; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2124; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2125; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2126; GFX9-NEXT:    s_waitcnt vmcnt(0)
2127; GFX9-NEXT:    s_endpgm
2128;
2129; GFX10-LABEL: store_load_sindex_large_offset_foo:
2130; GFX10:       ; %bb.0: ; %bb
2131; GFX10-NEXT:    s_add_u32 s0, s0, s3
2132; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2133; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2134; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2135; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2136; GFX10-NEXT:    s_waitcnt vmcnt(0)
2137; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2138; GFX10-NEXT:    s_and_b32 s0, s2, 15
2139; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
2140; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2141; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2142; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2143; GFX10-NEXT:    scratch_store_dword off, v0, s1
2144; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2145; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
2146; GFX10-NEXT:    s_waitcnt vmcnt(0)
2147; GFX10-NEXT:    s_endpgm
2148;
2149; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
2150; GFX9-PAL:       ; %bb.0: ; %bb
2151; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2152; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2153; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2154; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2155; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2156; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2157; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2158; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2159; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
2160; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2161; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2162; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2163; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2164; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2165; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2166; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2167; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2168; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2169; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2170; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2171; GFX9-PAL-NEXT:    s_endpgm
2172;
2173; GFX940-LABEL: store_load_sindex_large_offset_foo:
2174; GFX940:       ; %bb.0: ; %bb
2175; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2176; GFX940-NEXT:    s_waitcnt vmcnt(0)
2177; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2178; GFX940-NEXT:    s_and_b32 s0, s0, 15
2179; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2180; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2181; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2182; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2183; GFX940-NEXT:    s_waitcnt vmcnt(0)
2184; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2185; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2186; GFX940-NEXT:    s_waitcnt vmcnt(0)
2187; GFX940-NEXT:    s_endpgm
2188;
2189; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
2190; GFX1010-PAL:       ; %bb.0: ; %bb
2191; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2192; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2193; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2194; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2195; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2196; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2197; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2198; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2199; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2200; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2201; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2202; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2203; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2204; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2205; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2206; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2207; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2208; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2209; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2210; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2211; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2212; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2213; GFX1010-PAL-NEXT:    s_endpgm
2214;
2215; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
2216; GFX1030-PAL:       ; %bb.0: ; %bb
2217; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2218; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2219; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2220; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2222; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2223; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2224; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2225; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2226; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2227; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2228; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2229; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2230; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2231; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2232; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2233; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2234; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2235; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2236; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2237; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2238; GFX1030-PAL-NEXT:    s_endpgm
2239bb:
  ; %padding is allocated before %i, so %i is not the first object on the frame.
2240  %padding = alloca [4096 x i32], align 4, addrspace(5)
2241  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; the loaded value is unused.
2242  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2243  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE: %i1 is not referenced by any later instruction in this function.
2244  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
2245  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2246  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2247  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15]; the loaded value is unused.
2248  %i9 = and i32 %idx, 15
2249  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2250  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2251  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2252  ret void
2253}
2254
; Kernel variant indexed by the workitem id: stores 15 to %i[tid] and loads
; from %i[31 - tid], where %i (32 x float) is allocated after a 4096 x i32
; padding array, both in addrspace(5). The expected output below uses vector
; (v_*) address arithmetic with the constant 0x4004.
2255define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
2256; GFX9-LABEL: store_load_vindex_large_offset_kernel:
2257; GFX9:       ; %bb.0: ; %bb
2258; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2259; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2260; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2261; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2262; GFX9-NEXT:    s_waitcnt vmcnt(0)
2263; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2264; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
2265; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
2266; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2267; GFX9-NEXT:    scratch_store_dword v2, v3, off
2268; GFX9-NEXT:    s_waitcnt vmcnt(0)
2269; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
2270; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2271; GFX9-NEXT:    s_waitcnt vmcnt(0)
2272; GFX9-NEXT:    s_endpgm
2273;
2274; GFX10-LABEL: store_load_vindex_large_offset_kernel:
2275; GFX10:       ; %bb.0: ; %bb
2276; GFX10-NEXT:    s_add_u32 s0, s0, s3
2277; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2278; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2279; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2280; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
2281; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2282; GFX10-NEXT:    v_mov_b32_e32 v3, 15
2283; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
2284; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
2285; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
2286; GFX10-NEXT:    s_waitcnt vmcnt(0)
2287; GFX10-NEXT:    scratch_store_dword v2, v3, off
2288; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2289; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2290; GFX10-NEXT:    s_waitcnt vmcnt(0)
2291; GFX10-NEXT:    s_endpgm
2292;
2293; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
2294; GFX9-PAL:       ; %bb.0: ; %bb
2295; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2296; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2297; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2298; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2299; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2300; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
2301; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2302; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2303; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2304; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2305; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
2306; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2307; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
2308; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
2309; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
2310; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2311; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
2312; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
2313; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2314; GFX9-PAL-NEXT:    s_endpgm
2315;
2316; GFX940-LABEL: store_load_vindex_large_offset_kernel:
2317; GFX940:       ; %bb.0: ; %bb
2318; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
2319; GFX940-NEXT:    s_waitcnt vmcnt(0)
2320; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2321; GFX940-NEXT:    v_mov_b32_e32 v1, 15
2322; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
2323; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
2324; GFX940-NEXT:    s_waitcnt vmcnt(0)
2325; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
2326; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
2327; GFX940-NEXT:    s_waitcnt vmcnt(0)
2328; GFX940-NEXT:    s_endpgm
2329;
2330; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
2331; GFX1010-PAL:       ; %bb.0: ; %bb
2332; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2333; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2334; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2335; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2336; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2337; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2338; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2339; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2340; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2341; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
2342; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2343; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
2344; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2345; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
2346; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
2347; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
2348; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2349; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
2350; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2351; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2352; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2353; GFX1010-PAL-NEXT:    s_endpgm
2354;
2355; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
2356; GFX1030-PAL:       ; %bb.0: ; %bb
2357; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2358; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2359; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2360; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2362; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2363; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2364; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2365; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2366; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
2367; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2368; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
2369; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
2370; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
2371; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
2372; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2373; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
2374; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2375; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2376; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2377; GFX1030-PAL-NEXT:    s_endpgm
2378bb:
  ; %padding is allocated before %i, so %i is not the first object on the frame.
2379  %padding = alloca [4096 x i32], align 4, addrspace(5)
2380  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; the loaded value is unused.
2381  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2382  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE: %i1 and %i3 are not referenced by any later instruction in this function.
2383  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2384  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
2385  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[%i2].
2386  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
2387  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2388  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - %i2]; the expected output above shows a subtract
  ; followed by a load with offset:124.
2389  %i9 = sub nsw i32 31, %i2
2390  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2391  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2392  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2393  ret void
2394}
2395
; Non-entry function variant: stores 15 to %i[%idx] and loads from
; %i[%idx & 15], where %i (32 x float) is allocated after a 4096 x i32 padding
; array, both in addrspace(5). The expected output below forms addresses from
; s32 plus 0x4004 and returns with s_setpc_b64.
2396define void @store_load_vindex_large_offset_foo(i32 %idx) {
2397; GFX9-LABEL: store_load_vindex_large_offset_foo:
2398; GFX9:       ; %bb.0: ; %bb
2399; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2400; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
2401; GFX9-NEXT:    s_waitcnt vmcnt(0)
2402; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2403; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
2404; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2405; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2406; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
2407; GFX9-NEXT:    scratch_store_dword v2, v3, off
2408; GFX9-NEXT:    s_waitcnt vmcnt(0)
2409; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2410; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
2411; GFX9-NEXT:    s_waitcnt vmcnt(0)
2412; GFX9-NEXT:    s_setpc_b64 s[30:31]
2413;
2414; GFX10-LABEL: store_load_vindex_large_offset_foo:
2415; GFX10:       ; %bb.0: ; %bb
2416; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2418; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
2419; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
2420; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
2421; GFX10-NEXT:    v_mov_b32_e32 v3, 15
2422; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2423; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
2424; GFX10-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
2425; GFX10-NEXT:    s_waitcnt vmcnt(0)
2426; GFX10-NEXT:    scratch_store_dword v0, v3, off
2427; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2428; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
2429; GFX10-NEXT:    s_waitcnt vmcnt(0)
2430; GFX10-NEXT:    s_setpc_b64 s[30:31]
2431;
2432; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
2433; GFX9-PAL:       ; %bb.0: ; %bb
2434; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2435; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
2436; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2437; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2438; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
2439; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2440; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
2441; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
2442; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
2443; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2444; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2445; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
2446; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2447; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2448;
2449; GFX940-LABEL: store_load_vindex_large_offset_foo:
2450; GFX940:       ; %bb.0: ; %bb
2451; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
2453; GFX940-NEXT:    s_waitcnt vmcnt(0)
2454; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
2455; GFX940-NEXT:    v_mov_b32_e32 v2, 15
2456; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2457; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
2458; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
2459; GFX940-NEXT:    s_waitcnt vmcnt(0)
2460; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2461; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
2462; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
2463; GFX940-NEXT:    s_waitcnt vmcnt(0)
2464; GFX940-NEXT:    s_setpc_b64 s[30:31]
2465;
2466; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
2467; GFX10-PAL:       ; %bb.0: ; %bb
2468; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2469; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2470; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
2471; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
2472; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
2473; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
2474; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2475; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
2476; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s32 offset:4 glc dlc
2477; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2478; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
2479; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2480; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
2481; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2482; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2499bb:
  ; %padding is allocated before %i, so %i is not the first object on the frame.
2500  %padding = alloca [4096 x i32], align 4, addrspace(5)
2501  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; the loaded value is unused.
2502  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2503  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE: %i1 is not referenced by any later instruction in this function.
2504  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
2505  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2506  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2507  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15]; the loaded value is unused.
2508  %i9 = and i32 %idx, 15
2509  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2510  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2511  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2512  ret void
2513}
2514
; Kernel that accesses a constant, large element index (4000) of a 4096 x i32
; addrspace(5) array. In the expected output below, the address of that element
; is split between a materialized base (0x3000 or 0x3800) and an immediate
; instruction offset (3712, 1664 or 3716), depending on the target.
2515define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
2516; GFX9-LABEL: store_load_large_imm_offset_kernel:
2517; GFX9:       ; %bb.0: ; %bb
2518; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2519; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2520; GFX9-NEXT:    v_mov_b32_e32 v0, 13
2521; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
2522; GFX9-NEXT:    s_movk_i32 s0, 0x3000
2523; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
2524; GFX9-NEXT:    s_waitcnt vmcnt(0)
2525; GFX9-NEXT:    s_add_i32 s0, s0, 4
2526; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2527; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2528; GFX9-NEXT:    s_waitcnt vmcnt(0)
2529; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2530; GFX9-NEXT:    s_waitcnt vmcnt(0)
2531; GFX9-NEXT:    s_endpgm
2532;
2533; GFX10-LABEL: store_load_large_imm_offset_kernel:
2534; GFX10:       ; %bb.0: ; %bb
2535; GFX10-NEXT:    s_add_u32 s0, s0, s3
2536; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2537; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2538; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2539; GFX10-NEXT:    v_mov_b32_e32 v0, 13
2540; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2541; GFX10-NEXT:    s_movk_i32 s0, 0x3800
2542; GFX10-NEXT:    s_add_i32 s0, s0, 4
2543; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
2544; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2545; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2546; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2547; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2548; GFX10-NEXT:    s_waitcnt vmcnt(0)
2549; GFX10-NEXT:    s_endpgm
2550;
2551; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
2552; GFX9-PAL:       ; %bb.0: ; %bb
2553; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2554; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2555; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2556; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
2557; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
2558; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
2559; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2560; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2561; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
2562; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
2563; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
2564; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2565; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
2566; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2567; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
2568; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2569; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
2570; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2571; GFX9-PAL-NEXT:    s_endpgm
2572;
2573; GFX940-LABEL: store_load_large_imm_offset_kernel:
2574; GFX940:       ; %bb.0: ; %bb
2575; GFX940-NEXT:    v_mov_b32_e32 v0, 13
2576; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
2577; GFX940-NEXT:    s_waitcnt vmcnt(0)
2578; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
2579; GFX940-NEXT:    v_mov_b32_e32 v1, 15
2580; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
2581; GFX940-NEXT:    s_waitcnt vmcnt(0)
2582; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
2583; GFX940-NEXT:    s_waitcnt vmcnt(0)
2584; GFX940-NEXT:    s_endpgm
2585;
2586; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
2587; GFX1010-PAL:       ; %bb.0: ; %bb
2588; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
2589; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2590; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2591; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2592; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2593; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
2594; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
2595; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2596; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2597; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
2598; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
2599; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
2600; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2601; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
2602; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
2603; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2604; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2605; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2606; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2607; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2608; GFX1010-PAL-NEXT:    s_endpgm
2609;
2610; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
2611; GFX1030-PAL:       ; %bb.0: ; %bb
2612; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
2613; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2614; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2615; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2616; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2617; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
2618; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
2619; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2620; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2621; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
2622; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
2623; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
2624; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
2625; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
2626; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2627; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
2628; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2629; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
2630; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2631; GFX1030-PAL-NEXT:    s_endpgm
2632bb:
2633  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 to %i at an undef index.
2634  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
2635  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to %i[4000], i.e. 16000 bytes past the start of %i.
2636  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2637  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element, %i[4000]; the loaded value is unused.
2638  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
2639  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
2640  ret void
2641}
2642
; Callable-function variant of the large immediate offset test.
; A 4096 x i32 private alloca is written at an undef index (value 13) and then
; written/read at element 4000 (value 15), i.e. 16000 bytes into the object.
; That displacement does not fit the scratch immediate field, so the expected
; output splits it into a materialized base plus an immediate remainder
; (0x3000 + 3712 on gfx9, 0x3800 + 1664 on gfx10, and 0x3000 in a VGPR with
; the 4-byte frame offset folded into offset 3716 on gfx940).
; All accesses are volatile so none of them can be dropped or merged.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}
2746
; Kernel test for an address made of a uniform index (%sidx kernel argument),
; a divergent index (workitem id x) and a constant element offset of 256.
; The constant part (256 * 4 = 1024 bytes) is expected to land in the
; instruction's immediate offset field while the sum of the two indices is
; scaled into the address VGPR. On gfx940 the 4-byte frame offset is folded
; into the immediate as well (1028); the other targets add it to the VGPR.
; The store and load are volatile so both must be emitted.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}
2855
; Volatile i64 store followed by a volatile i64 load through a private
; (addrspace 5) pointer argument with natural 8-byte alignment.
; Every target is expected to use one dwordx2 scratch store and one dwordx2
; scratch load addressed directly by the incoming pointer in v0.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}
2928
; Same access pattern as the aligned i64 test, but the store and load are only
; byte aligned (align 1). The expected output is unchanged from the aligned
; case: a single dwordx2 scratch store and a single dwordx2 scratch load, i.e.
; the under-aligned access is not broken into smaller pieces.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}
3001
; Byte-aligned (align 1) volatile store and load of a <3 x i32> vector through
; a private (addrspace 5) pointer argument. The expected output keeps the
; access as one dwordx3 scratch store and one dwordx3 scratch load on every
; target rather than splitting it.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}
3080
; Byte-aligned (align 1) volatile store and load of a <4 x i32> vector through
; a private (addrspace 5) pointer argument. The expected output keeps the
; access as one dwordx4 scratch store and one dwordx4 scratch load on every
; target rather than splitting it.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}
3165
; Volatile byte store and load at a constant offset of -1 from a private
; (addrspace 5) pointer argument. Despite the "i32" in the function name, the
; accessed type is i8.
; Expected lowering differs per target: the gfx1030 runs place the -1 directly
; in the instruction's immediate offset field, while the gfx900, gfx940 and
; gfx1010 runs first add -1 to the address VGPR and use a zero immediate.
; NOTE(review): the gfx1010 behaviour looks like a target-specific restriction
; on negative scratch offsets - confirm against the backend before relying on
; that explanation.
define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
  ret void
}
3239
; Volatile byte store and load at a constant offset of -4225 (0xffffef7f) from
; a private (addrspace 5) pointer argument. Despite the "i32" in the function
; name, the accessed type is i8.
; The offset is too large for the immediate field alone, so the expected output
; shows how each target splits it:
;   - gfx900 runs add the whole constant to the address VGPR;
;   - gfx1030 runs add -4096 (0xfffff000) and keep -129 as the immediate;
;   - the gfx1010 run adds -4097 (0xffffefff) and keeps -128 as the immediate;
;   - the gfx940 run moves 0xef7f into an SGPR with s_movk_i32 and uses that
;     SGPR as the scalar offset operand.
define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_large_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_large_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_large_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
  ret void
}
3315
; large_offset: an amdgpu_ps function with two addrspace(5) arrays of
; 128 x <4 x i32> (128 * 16 = 2048 bytes each). It performs a volatile store and
; a volatile load on element 60 of the second array, then passes the address of
; each array to inline asm.
;
; In the expected output below the first array's address is materialized as 16
; and the second's as 0x810 (16 + 2048). Element 60 lies 60 * 16 = 0x3c0 bytes
; past the start of its array, so the accessed slot is at 0x810 + 0x3c0 = 3024.
; The GFX9, GFX9-PAL and GFX940 checks carry 3024 as an instruction offset,
; whereas the GFX10 and GFX10-PAL checks build the same value in s0 with
; s_movk_i32 / s_addk_i32 and use no instruction offset.
; NOTE(review): presumably that difference reflects a smaller immediate offset
; range on the GFX10 targets; confirm against the ISA documentation.
3316define amdgpu_ps void @large_offset() {
3317; GFX9-LABEL: large_offset:
3318; GFX9:       ; %bb.0: ; %bb
3319; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
3320; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3321; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
3322; GFX9-NEXT:    v_mov_b32_e32 v1, v0
3323; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3324; GFX9-NEXT:    v_mov_b32_e32 v3, v0
3325; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3326; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
3327; GFX9-NEXT:    s_waitcnt vmcnt(0)
3328; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3329; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
3330; GFX9-NEXT:    s_waitcnt vmcnt(0)
3331; GFX9-NEXT:    v_mov_b32_e32 v0, 16
3332; GFX9-NEXT:    ;;#ASMSTART
3333; GFX9-NEXT:    ; use v0
3334; GFX9-NEXT:    ;;#ASMEND
3335; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
3336; GFX9-NEXT:    ;;#ASMSTART
3337; GFX9-NEXT:    ; use v0
3338; GFX9-NEXT:    ;;#ASMEND
3339; GFX9-NEXT:    s_endpgm
3340;
3341; GFX10-LABEL: large_offset:
3342; GFX10:       ; %bb.0: ; %bb
3343; GFX10-NEXT:    s_add_u32 s0, s0, s2
3344; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3345; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
3346; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
3347; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3348; GFX10-NEXT:    s_movk_i32 s0, 0x810
3349; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
3350; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3351; GFX10-NEXT:    v_mov_b32_e32 v2, v0
3352; GFX10-NEXT:    v_mov_b32_e32 v3, v0
3353; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
3354; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3355; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3356; GFX10-NEXT:    s_waitcnt vmcnt(0)
3357; GFX10-NEXT:    v_mov_b32_e32 v0, 16
3358; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
3359; GFX10-NEXT:    ;;#ASMSTART
3360; GFX10-NEXT:    ; use v0
3361; GFX10-NEXT:    ;;#ASMEND
3362; GFX10-NEXT:    ;;#ASMSTART
3363; GFX10-NEXT:    ; use v1
3364; GFX10-NEXT:    ;;#ASMEND
3365; GFX10-NEXT:    s_endpgm
3366;
3367; GFX9-PAL-LABEL: large_offset:
3368; GFX9-PAL:       ; %bb.0: ; %bb
3369; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
3370; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
3371; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3372; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
3373; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
3374; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
3375; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
3376; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3377; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3378; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
3379; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3380; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3381; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
3382; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3383; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3384; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
3385; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3386; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
3387; GFX9-PAL-NEXT:    ;;#ASMSTART
3388; GFX9-PAL-NEXT:    ; use v0
3389; GFX9-PAL-NEXT:    ;;#ASMEND
3390; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
3391; GFX9-PAL-NEXT:    ;;#ASMSTART
3392; GFX9-PAL-NEXT:    ; use v0
3393; GFX9-PAL-NEXT:    ;;#ASMEND
3394; GFX9-PAL-NEXT:    s_endpgm
3395;
3396; GFX940-LABEL: large_offset:
3397; GFX940:       ; %bb.0: ; %bb
3398; GFX940-NEXT:    v_mov_b32_e32 v0, 0
3399; GFX940-NEXT:    v_mov_b32_e32 v1, v0
3400; GFX940-NEXT:    v_mov_b32_e32 v2, v0
3401; GFX940-NEXT:    v_mov_b32_e32 v3, v0
3402; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
3403; GFX940-NEXT:    s_waitcnt vmcnt(0)
3404; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
3405; GFX940-NEXT:    s_waitcnt vmcnt(0)
3406; GFX940-NEXT:    v_mov_b32_e32 v0, 16
3407; GFX940-NEXT:    ;;#ASMSTART
3408; GFX940-NEXT:    ; use v0
3409; GFX940-NEXT:    ;;#ASMEND
3410; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
3411; GFX940-NEXT:    ;;#ASMSTART
3412; GFX940-NEXT:    ; use v0
3413; GFX940-NEXT:    ;;#ASMEND
3414; GFX940-NEXT:    s_endpgm
3415;
3416; GFX10-PAL-LABEL: large_offset:
3417; GFX10-PAL:       ; %bb.0: ; %bb
3418; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
3419; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
3420; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3421; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3422; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3423; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
3424; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
3425; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3426; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3427; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
3428; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
3429; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
3430; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
3431; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
3432; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
3433; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
3434; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3435; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3436; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3437; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
3438; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
3439; GFX10-PAL-NEXT:    ;;#ASMSTART
3440; GFX10-PAL-NEXT:    ; use v0
3441; GFX10-PAL-NEXT:    ;;#ASMEND
3442; GFX10-PAL-NEXT:    ;;#ASMSTART
3443; GFX10-PAL-NEXT:    ; use v1
3444; GFX10-PAL-NEXT:    ;;#ASMEND
3445; GFX10-PAL-NEXT:    s_endpgm
3446bb:
  ; Two arrays of 128 x <4 x i32>, i.e. 128 * 16 = 2048 bytes each.
3447  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
3448  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  ; Element 60 of the second array: 60 * 16 = 960 (0x3c0) bytes past its start.
3449  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
  ; Volatile store then volatile load of that same slot; %load has no other use,
  ; and both accesses still appear in every expected output above.
3450  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
3451  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
  ; Pass the address of each array to inline asm, first %alloca, then %alloca2.
  ; NOTE(review): the constraint string is "s", yet the expected output above
  ; prints v0/v1 for the operand; confirm this is intended.
  ; NOTE(review): attribute group #0 has no definition in the visible part of
  ; this file; confirm it is defined elsewhere or intentionally left empty.
3452  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
3453  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
3454  ret void
3455}
3456
; Intrinsic declarations: a memset over an addrspace(5) i8 pointer, and the
; workitem id in x.
; NOTE(review): @large_offset does not reference either one; presumably they
; are used by functions earlier in this file.
3457declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
3458declare i32 @llvm.amdgcn.workitem.id.x()
3459