1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-PAL %s
6
; zero_init_kernel: kernel entry point that zero-fills a 64-byte addrspace(5)
; alloca with a non-volatile llvm.memset.
;
; For all four run configurations the autogenerated checks below expect the
; memset to become four scratch_store_dwordx4 instructions (offsets 64, 48, 32
; and 16), placed after the flat-scratch setup sequence of that target. In the
; gfx900 output every store takes a freshly zeroed vcc_hi as its scalar offset,
; while the gfx1030 output uses `off`.
;
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
7define amdgpu_kernel void @zero_init_kernel() {
8; GFX9-LABEL: zero_init_kernel:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
11; GFX9-NEXT:    s_mov_b32 s0, 0
12; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
13; GFX9-NEXT:    s_mov_b32 s1, s0
14; GFX9-NEXT:    s_mov_b32 s2, s0
15; GFX9-NEXT:    s_mov_b32 s3, s0
16; GFX9-NEXT:    v_mov_b32_e32 v0, s0
17; GFX9-NEXT:    v_mov_b32_e32 v1, s1
18; GFX9-NEXT:    v_mov_b32_e32 v2, s2
19; GFX9-NEXT:    v_mov_b32_e32 v3, s3
20; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
21; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
22; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
23; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
28; GFX9-NEXT:    s_endpgm
29;
30; GFX10-LABEL: zero_init_kernel:
31; GFX10:       ; %bb.0:
32; GFX10-NEXT:    s_add_u32 s0, s0, s3
33; GFX10-NEXT:    s_addc_u32 s1, s1, 0
34; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
35; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
36; GFX10-NEXT:    s_mov_b32 s0, 0
37; GFX10-NEXT:    s_mov_b32 s1, s0
38; GFX10-NEXT:    s_mov_b32 s2, s0
39; GFX10-NEXT:    s_mov_b32 s3, s0
40; GFX10-NEXT:    v_mov_b32_e32 v0, s0
41; GFX10-NEXT:    v_mov_b32_e32 v1, s1
42; GFX10-NEXT:    v_mov_b32_e32 v2, s2
43; GFX10-NEXT:    v_mov_b32_e32 v3, s3
44; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
45; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
48; GFX10-NEXT:    s_endpgm
49;
50; GFX9-PAL-LABEL: zero_init_kernel:
51; GFX9-PAL:       ; %bb.0:
52; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
53; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
54; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
55; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
56; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
57; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
59; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
60; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
61; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
62; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
63; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
64; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
65; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
66; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
67; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
68; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
69; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
70; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
71; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
72; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
73; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
74; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
75; GFX9-PAL-NEXT:    s_endpgm
76;
77; GFX10-PAL-LABEL: zero_init_kernel:
78; GFX10-PAL:       ; %bb.0:
79; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
80; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
81; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
82; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
84; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
85; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
86; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
87; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
88; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
89; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
90; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
91; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
92; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
93; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
94; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
95; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
96; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
97; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
98; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
99; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
100; GFX10-PAL-NEXT:    s_endpgm
; 32 x i16 is 64 bytes, the same length that is passed to the memset below.
101  %alloca = alloca [32 x i16], align 2, addrspace(5)
102  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Non-volatile (i1 false) zero fill of the whole object; this is the operation
; whose lowering the checks above pin down.
103  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
104  ret void
105}
106
; zero_init_foo: same IR body as zero_init_kernel (64-byte addrspace(5) alloca
; cleared with a non-volatile memset), but declared as an ordinary function
; instead of an amdgpu_kernel.
;
; The checks expect no flat-scratch setup here: the four scratch_store_dwordx4
; instructions address memory relative to s32 (presumably the stack pointer of
; this calling convention -- confirm against the AMDGPU ABI docs) at offsets
; 48, 32, 16 and 0, and the function returns through s_setpc_b64 s[30:31].
; Expected output is the same for the default and amdpal triples of each CPU.
107define void @zero_init_foo() {
108; GFX9-LABEL: zero_init_foo:
109; GFX9:       ; %bb.0:
110; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111; GFX9-NEXT:    s_mov_b32 s0, 0
112; GFX9-NEXT:    s_mov_b32 s1, s0
113; GFX9-NEXT:    s_mov_b32 s2, s0
114; GFX9-NEXT:    s_mov_b32 s3, s0
115; GFX9-NEXT:    v_mov_b32_e32 v0, s0
116; GFX9-NEXT:    v_mov_b32_e32 v1, s1
117; GFX9-NEXT:    v_mov_b32_e32 v2, s2
118; GFX9-NEXT:    v_mov_b32_e32 v3, s3
119; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
120; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
121; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
122; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
123; GFX9-NEXT:    s_waitcnt vmcnt(0)
124; GFX9-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: zero_init_foo:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
130; GFX10-NEXT:    s_mov_b32 s0, 0
131; GFX10-NEXT:    s_mov_b32 s1, s0
132; GFX10-NEXT:    s_mov_b32 s2, s0
133; GFX10-NEXT:    s_mov_b32 s3, s0
134; GFX10-NEXT:    v_mov_b32_e32 v0, s0
135; GFX10-NEXT:    v_mov_b32_e32 v1, s1
136; GFX10-NEXT:    v_mov_b32_e32 v2, s2
137; GFX10-NEXT:    v_mov_b32_e32 v3, s3
138; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
139; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
140; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
141; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
142; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
143; GFX10-NEXT:    s_setpc_b64 s[30:31]
144;
145; GFX9-PAL-LABEL: zero_init_foo:
146; GFX9-PAL:       ; %bb.0:
147; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
149; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
150; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
151; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
152; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
153; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
154; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
155; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
156; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
157; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
158; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
159; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
160; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
161; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
162;
163; GFX10-PAL-LABEL: zero_init_foo:
164; GFX10-PAL:       ; %bb.0:
165; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
167; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
168; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
169; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
170; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
171; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
172; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
173; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
174; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
175; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
176; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
177; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
178; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
179; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
180; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
; 32 x i16 is 64 bytes, matching the memset length below.
181  %alloca = alloca [32 x i16], align 2, addrspace(5)
182  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Non-volatile zero fill of the whole object; expected to lower to the four
; 16-byte scratch stores checked above.
183  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
184  ret void
185}
186
; store_load_sindex_kernel: kernel that indexes a [32 x float] addrspace(5)
; alloca with the i32 kernel argument %idx. It performs a volatile store of 15
; at element %idx followed by a volatile load from element (%idx & 15).
;
; In the expected output %idx is fetched with s_load_dword and both addresses
; are computed with scalar instructions (s_lshl_b32 by 2, s_add_u32 with 4), so
; the scratch_store_dword / scratch_load_dword use the `off, ..., sN` form with
; the address in an SGPR. The amdpal runs additionally show the
; s_getpc_b64 + s_load_dwordx2 sequence ahead of the flat-scratch setup.
187define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
188; GFX9-LABEL: store_load_sindex_kernel:
189; GFX9:       ; %bb.0: ; %bb
190; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
191; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
192; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
193; GFX9-NEXT:    v_mov_b32_e32 v0, 15
194; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
196; GFX9-NEXT:    s_and_b32 s0, s0, 15
197; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
198; GFX9-NEXT:    s_add_u32 s1, 4, s1
199; GFX9-NEXT:    scratch_store_dword off, v0, s1
200; GFX9-NEXT:    s_waitcnt vmcnt(0)
201; GFX9-NEXT:    s_add_u32 s0, 4, s0
202; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
203; GFX9-NEXT:    s_waitcnt vmcnt(0)
204; GFX9-NEXT:    s_endpgm
205;
206; GFX10-LABEL: store_load_sindex_kernel:
207; GFX10:       ; %bb.0: ; %bb
208; GFX10-NEXT:    s_add_u32 s2, s2, s5
209; GFX10-NEXT:    s_addc_u32 s3, s3, 0
210; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
211; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
212; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
213; GFX10-NEXT:    v_mov_b32_e32 v0, 15
214; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX10-NEXT:    s_and_b32 s1, s0, 15
216; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
217; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
218; GFX10-NEXT:    s_add_u32 s0, 4, s0
219; GFX10-NEXT:    s_add_u32 s1, 4, s1
220; GFX10-NEXT:    scratch_store_dword off, v0, s0
221; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
222; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
223; GFX10-NEXT:    s_waitcnt vmcnt(0)
224; GFX10-NEXT:    s_endpgm
225;
226; GFX9-PAL-LABEL: store_load_sindex_kernel:
227; GFX9-PAL:       ; %bb.0: ; %bb
228; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
229; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
230; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
231; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
232; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
233; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
235; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
236; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
237; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
238; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
239; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
240; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
241; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
242; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
243; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
244; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
245; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-PAL-NEXT:    s_endpgm
247;
248; GFX10-PAL-LABEL: store_load_sindex_kernel:
249; GFX10-PAL:       ; %bb.0: ; %bb
250; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
251; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
252; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
253; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
255; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
256; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
257; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
258; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
259; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
260; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
261; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
263; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
264; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
265; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
266; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
267; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
268; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
269; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
270; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
271; GFX10-PAL-NEXT:    s_endpgm
272bb:
273  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
274  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store side: element %idx, written as an i32 through a bitcast pointer.
275  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
276  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
; volatile keeps the store in the output although nothing reads the value back.
277  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (%idx & 15), which shows up as s_and_b32 ..., 15 above.
278  %i9 = and i32 %idx, 15
279  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
280  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
; The loaded value is unused; volatile keeps the load in the output.
281  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
282  ret void
283}
284
; store_load_sindex_foo: amdgpu_ps variant of store_load_sindex_kernel. The
; index arrives as an `inreg` i32 argument instead of being loaded from kernel
; arguments, so the expected output contains no s_load_dword for %idx.
;
; As in the kernel variant, the address arithmetic stays on scalar
; instructions and the scratch accesses use an SGPR address. The body does a
; volatile store of 15 at element %idx and a volatile load from element
; (%idx & 15) of a [32 x float] addrspace(5) alloca.
285define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
286; GFX9-LABEL: store_load_sindex_foo:
287; GFX9:       ; %bb.0: ; %bb
288; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
289; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
290; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
291; GFX9-NEXT:    s_add_u32 s0, 4, s0
292; GFX9-NEXT:    v_mov_b32_e32 v0, 15
293; GFX9-NEXT:    scratch_store_dword off, v0, s0
294; GFX9-NEXT:    s_waitcnt vmcnt(0)
295; GFX9-NEXT:    s_and_b32 s0, s2, 15
296; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
297; GFX9-NEXT:    s_add_u32 s0, 4, s0
298; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
299; GFX9-NEXT:    s_waitcnt vmcnt(0)
300; GFX9-NEXT:    s_endpgm
301;
302; GFX10-LABEL: store_load_sindex_foo:
303; GFX10:       ; %bb.0: ; %bb
304; GFX10-NEXT:    s_add_u32 s0, s0, s3
305; GFX10-NEXT:    s_addc_u32 s1, s1, 0
306; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
307; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
308; GFX10-NEXT:    s_and_b32 s0, s2, 15
309; GFX10-NEXT:    v_mov_b32_e32 v0, 15
310; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
311; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
312; GFX10-NEXT:    s_add_u32 s1, 4, s1
313; GFX10-NEXT:    s_add_u32 s0, 4, s0
314; GFX10-NEXT:    scratch_store_dword off, v0, s1
315; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
316; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
317; GFX10-NEXT:    s_waitcnt vmcnt(0)
318; GFX10-NEXT:    s_endpgm
319;
320; GFX9-PAL-LABEL: store_load_sindex_foo:
321; GFX9-PAL:       ; %bb.0: ; %bb
322; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
323; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
324; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
325; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
326; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
328; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
329; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
330; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
331; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
332; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
333; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
334; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
335; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
336; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
337; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
338; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
339; GFX9-PAL-NEXT:    s_endpgm
340;
341; GFX10-PAL-LABEL: store_load_sindex_foo:
342; GFX10-PAL:       ; %bb.0: ; %bb
343; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
344; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
345; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
346; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
348; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
349; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
350; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
351; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
352; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
353; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
354; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
355; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
356; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
357; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
358; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
359; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
360; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
361; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
362; GFX10-PAL-NEXT:    s_endpgm
363bb:
364  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
365  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store side: element %idx, written as an i32 through a bitcast pointer.
366  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
367  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
; volatile keeps the store in the output although nothing reads the value back.
368  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (%idx & 15).
369  %i9 = and i32 %idx, 15
370  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
371  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
; The loaded value is unused; volatile keeps the load in the output.
372  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
373  ret void
374}
375
; store_load_vindex_kernel: kernel that indexes a [32 x float] addrspace(5)
; alloca with the work-item id (llvm.amdgcn.workitem.id.x). It performs a
; volatile store of 15 at element tid and a volatile load from element
; (31 - tid).
;
; Because the index is per-lane, the expected scratch accesses carry their
; address in a VGPR (`scratch_store_dword v2, v3, off`). The load address is
; formed by a v_sub from the base, with the constant 31 * 4 = 124 folded into
; the instruction's immediate offset (offset:124 in every configuration).
376define amdgpu_kernel void @store_load_vindex_kernel() {
377; GFX9-LABEL: store_load_vindex_kernel:
378; GFX9:       ; %bb.0: ; %bb
379; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
380; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
381; GFX9-NEXT:    v_mov_b32_e32 v1, 4
382; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
383; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
384; GFX9-NEXT:    v_mov_b32_e32 v3, 15
385; GFX9-NEXT:    scratch_store_dword v2, v3, off
386; GFX9-NEXT:    s_waitcnt vmcnt(0)
387; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
388; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
389; GFX9-NEXT:    s_waitcnt vmcnt(0)
390; GFX9-NEXT:    s_endpgm
391;
392; GFX10-LABEL: store_load_vindex_kernel:
393; GFX10:       ; %bb.0: ; %bb
394; GFX10-NEXT:    s_add_u32 s0, s0, s3
395; GFX10-NEXT:    s_addc_u32 s1, s1, 0
396; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
397; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
398; GFX10-NEXT:    v_mov_b32_e32 v1, 4
399; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
400; GFX10-NEXT:    v_mov_b32_e32 v3, 15
401; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
402; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
403; GFX10-NEXT:    scratch_store_dword v2, v3, off
404; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
405; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
406; GFX10-NEXT:    s_waitcnt vmcnt(0)
407; GFX10-NEXT:    s_endpgm
408;
409; GFX9-PAL-LABEL: store_load_vindex_kernel:
410; GFX9-PAL:       ; %bb.0: ; %bb
411; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
412; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
413; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
414; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
415; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
416; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
417; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
418; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
420; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
421; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
422; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
423; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
424; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
425; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
426; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
427; GFX9-PAL-NEXT:    s_endpgm
428;
429; GFX10-PAL-LABEL: store_load_vindex_kernel:
430; GFX10-PAL:       ; %bb.0: ; %bb
431; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
432; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
433; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
434; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
436; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
437; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
438; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
439; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
440; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
441; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
443; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
444; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
445; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
446; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
447; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
448; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
449; GFX10-PAL-NEXT:    s_endpgm
450bb:
451  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
452  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Per-lane index; this is what makes the addresses below vector values.
453  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %i3 has no users in this function.
454  %i3 = zext i32 %i2 to i64
; Store side: element tid, written as an i32 through a bitcast pointer.
455  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
456  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
457  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (31 - tid), i.e. the array mirrored around its middle.
458  %i9 = sub nsw i32 31, %i2
459  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
460  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
; The loaded value is unused; volatile keeps the load in the output.
461  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
462  ret void
463}
464
; store_load_vindex_foo: ordinary (non-kernel) function that indexes a
; [32 x float] addrspace(5) alloca with its i32 argument %idx. It performs a
; volatile store of 15 at element %idx and a volatile load from element
; (%idx & 15).
;
; In the expected output the index is used from v0, the base is copied from
; s32 into a VGPR, and each address is formed with a single v_lshl_add_u32
; (index << 2 plus base). The constant 15 is materialized once and reused as
; both the stored value and the AND mask. No flat-scratch setup is expected,
; and the output is the same for the default and amdpal triples of each CPU.
465define void @store_load_vindex_foo(i32 %idx) {
466; GFX9-LABEL: store_load_vindex_foo:
467; GFX9:       ; %bb.0: ; %bb
468; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469; GFX9-NEXT:    v_mov_b32_e32 v1, s32
470; GFX9-NEXT:    v_mov_b32_e32 v3, 15
471; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
472; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
473; GFX9-NEXT:    scratch_store_dword v2, v3, off
474; GFX9-NEXT:    s_waitcnt vmcnt(0)
475; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
476; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
477; GFX9-NEXT:    s_waitcnt vmcnt(0)
478; GFX9-NEXT:    s_setpc_b64 s[30:31]
479;
480; GFX10-LABEL: store_load_vindex_foo:
481; GFX10:       ; %bb.0: ; %bb
482; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
484; GFX10-NEXT:    v_mov_b32_e32 v1, 15
485; GFX10-NEXT:    v_mov_b32_e32 v2, s32
486; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
487; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
488; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
489; GFX10-NEXT:    scratch_store_dword v0, v1, off
490; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
491; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
492; GFX10-NEXT:    s_waitcnt vmcnt(0)
493; GFX10-NEXT:    s_setpc_b64 s[30:31]
494;
495; GFX9-PAL-LABEL: store_load_vindex_foo:
496; GFX9-PAL:       ; %bb.0: ; %bb
497; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
499; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
500; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
501; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
502; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
503; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
504; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
505; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
506; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
507; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX10-PAL-LABEL: store_load_vindex_foo:
510; GFX10-PAL:       ; %bb.0: ; %bb
511; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
513; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
514; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
515; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
516; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
517; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
518; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
519; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
520; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
521; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
522; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
523bb:
524  %i = alloca [32 x float], align 4, addrspace(5)
; NOTE(review): %i1 has no users in this function.
525  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
; Store side: element %idx, written as an i32 through a bitcast pointer.
526  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
527  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
528  store volatile i32 15, i32 addrspace(5)* %i8, align 4
; Load side: element (%idx & 15).
529  %i9 = and i32 %idx, 15
530  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
531  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
; The loaded value is unused; volatile keeps the load in the output.
532  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
533  ret void
534}
535
; private_ptr_foo: ordinary function that receives an addrspace(5) float
; pointer and stores 10.0 to the element one past it (%arg + 1 float).
;
; There is no alloca here, so the address comes straight from the incoming
; argument. The checks expect a single scratch_store_dword that uses v0 as the
; address, with the 4-byte element offset folded into the immediate (offset:4)
; and the stored value materialized as 0x41200000 (the bit pattern of 10.0f).
536define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
537; GFX9-LABEL: private_ptr_foo:
538; GFX9:       ; %bb.0:
539; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
541; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    s_setpc_b64 s[30:31]
544;
545; GFX10-LABEL: private_ptr_foo:
546; GFX10:       ; %bb.0:
547; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
549; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
550; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
551; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
552; GFX10-NEXT:    s_setpc_b64 s[30:31]
553;
554; GFX9-PAL-LABEL: private_ptr_foo:
555; GFX9-PAL:       ; %bb.0:
556; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
558; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
559; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
560; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
561;
562; GFX10-PAL-LABEL: private_ptr_foo:
563; GFX10-PAL:       ; %bb.0:
564; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
566; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
567; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
568; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
569; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
; One float past %arg, i.e. a constant byte offset of 4.
570  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
; Plain (non-volatile) store; it is kept because it writes through a pointer
; that comes from the caller.
571  store float 1.000000e+01, float addrspace(5)* %gep, align 4
572  ret void
573}
574
; zero_init_small_offset_kernel: like zero_init_kernel, but a 256-byte
; [64 x i32] addrspace(5) alloca (%padding) is declared ahead of the 64-byte
; object being cleared, so the memset target sits at a non-trivial but still
; small frame offset.
;
; The checks expect a volatile scratch_load_dword from the padding object
; (offset:4) followed by four scratch_store_dwordx4 instructions at offsets
; 272, 288, 304 and 320, all still encoded as immediate offsets. As in
; zero_init_kernel, the gfx900 output uses a zeroed vcc_hi as scalar offset
; and the gfx1030 output uses `off`.
575define amdgpu_kernel void @zero_init_small_offset_kernel() {
576; GFX9-LABEL: zero_init_small_offset_kernel:
577; GFX9:       ; %bb.0:
578; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
579; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
580; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
581; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
582; GFX9-NEXT:    s_waitcnt vmcnt(0)
583; GFX9-NEXT:    s_mov_b32 s0, 0
584; GFX9-NEXT:    s_mov_b32 s1, s0
585; GFX9-NEXT:    s_mov_b32 s2, s0
586; GFX9-NEXT:    s_mov_b32 s3, s0
587; GFX9-NEXT:    v_mov_b32_e32 v0, s0
588; GFX9-NEXT:    v_mov_b32_e32 v1, s1
589; GFX9-NEXT:    v_mov_b32_e32 v2, s2
590; GFX9-NEXT:    v_mov_b32_e32 v3, s3
591; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
592; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
593; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
594; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
595; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
596; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
597; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
598; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
599; GFX9-NEXT:    s_endpgm
600;
601; GFX10-LABEL: zero_init_small_offset_kernel:
602; GFX10:       ; %bb.0:
603; GFX10-NEXT:    s_add_u32 s0, s0, s3
604; GFX10-NEXT:    s_addc_u32 s1, s1, 0
605; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
606; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
607; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
608; GFX10-NEXT:    s_waitcnt vmcnt(0)
609; GFX10-NEXT:    s_mov_b32 s0, 0
610; GFX10-NEXT:    s_mov_b32 s1, s0
611; GFX10-NEXT:    s_mov_b32 s2, s0
612; GFX10-NEXT:    s_mov_b32 s3, s0
613; GFX10-NEXT:    v_mov_b32_e32 v0, s0
614; GFX10-NEXT:    v_mov_b32_e32 v1, s1
615; GFX10-NEXT:    v_mov_b32_e32 v2, s2
616; GFX10-NEXT:    v_mov_b32_e32 v3, s3
617; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
618; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
619; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
620; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
621; GFX10-NEXT:    s_endpgm
622;
623; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
624; GFX9-PAL:       ; %bb.0:
625; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
626; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
627; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
628; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
629; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
630; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
632; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
633; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
634; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
635; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
636; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
637; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
638; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
639; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
640; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
641; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
642; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
643; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
644; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
645; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
646; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
647; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
648; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
649; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
650; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
651; GFX9-PAL-NEXT:    s_endpgm
652;
653; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
654; GFX10-PAL:       ; %bb.0:
655; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
656; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
657; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
658; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
660; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
661; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
662; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
663; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
664; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
665; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
666; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
667; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
668; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
669; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
670; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
671; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
672; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
673; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
674; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
675; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
676; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
677; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
678; GFX10-PAL-NEXT:    s_endpgm
; 64 x i32 is 256 bytes of padding declared ahead of the object under test.
679  %padding = alloca [64 x i32], align 4, addrspace(5)
; 32 x i16 is 64 bytes, matching the memset length below.
680  %alloca = alloca [32 x i16], align 2, addrspace(5)
; The element index is undef; the expected output reads at offset:4.
681  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
; Volatile load with an unused result -- presumably its only purpose is to
; keep %padding from being removed so that it occupies frame space.
682  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
683  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; Non-volatile zero fill of %alloca; expected to lower to the four 16-byte
; scratch stores at offsets 272..320 checked above.
684  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
685  ret void
686}
687
; Callable (non-kernel) function that zero-fills a 64-byte addrspace(5) array
; which is allocated after a 256-byte padding array. In the expected output
; for all four runs the fill is four scratch_store_dwordx4 instructions
; addressed off s32 with immediate offsets 256, 272, 288 and 304, and the
; padding access is a scratch_load_dword directly off s32.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
688define void @zero_init_small_offset_foo() {
689; GFX9-LABEL: zero_init_small_offset_foo:
690; GFX9:       ; %bb.0:
691; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
693; GFX9-NEXT:    s_waitcnt vmcnt(0)
694; GFX9-NEXT:    s_mov_b32 s0, 0
695; GFX9-NEXT:    s_mov_b32 s1, s0
696; GFX9-NEXT:    s_mov_b32 s2, s0
697; GFX9-NEXT:    s_mov_b32 s3, s0
698; GFX9-NEXT:    v_mov_b32_e32 v0, s0
699; GFX9-NEXT:    v_mov_b32_e32 v1, s1
700; GFX9-NEXT:    v_mov_b32_e32 v2, s2
701; GFX9-NEXT:    v_mov_b32_e32 v3, s3
702; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
703; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
704; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
705; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
706; GFX9-NEXT:    s_waitcnt vmcnt(0)
707; GFX9-NEXT:    s_setpc_b64 s[30:31]
708;
709; GFX10-LABEL: zero_init_small_offset_foo:
710; GFX10:       ; %bb.0:
711; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
712; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
713; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
714; GFX10-NEXT:    s_waitcnt vmcnt(0)
715; GFX10-NEXT:    s_mov_b32 s0, 0
716; GFX10-NEXT:    s_mov_b32 s1, s0
717; GFX10-NEXT:    s_mov_b32 s2, s0
718; GFX10-NEXT:    s_mov_b32 s3, s0
719; GFX10-NEXT:    v_mov_b32_e32 v0, s0
720; GFX10-NEXT:    v_mov_b32_e32 v1, s1
721; GFX10-NEXT:    v_mov_b32_e32 v2, s2
722; GFX10-NEXT:    v_mov_b32_e32 v3, s3
723; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
724; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
725; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
726; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
727; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
728; GFX10-NEXT:    s_setpc_b64 s[30:31]
729;
730; GFX9-PAL-LABEL: zero_init_small_offset_foo:
731; GFX9-PAL:       ; %bb.0:
732; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
734; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
735; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
736; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
737; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
738; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
739; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
740; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
741; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
742; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
743; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
744; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
745; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
746; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
747; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
748; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
749;
750; GFX10-PAL-LABEL: zero_init_small_offset_foo:
751; GFX10-PAL:       ; %bb.0:
752; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
753; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
754; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
755; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
756; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
757; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
758; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
759; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
760; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
761; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
762; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
763; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
764; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
765; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
766; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
767; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
768; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
769; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 = 256 bytes of padding; this matches the offset:256 of the first
  ; store in the expected output above.
770  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes, the region cleared by the memset below.
771  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
772  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
773  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
774  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Write 64 zero bytes over %alloca (non-volatile memset).
775  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
776  ret void
777}
778
; Kernel taking an i32 %idx argument. It performs a volatile store of 15 to
; element %idx of a 32 x float addrspace(5) array, then a volatile load from
; element (%idx & 15) of the same array. The array is allocated after a
; 256-byte padding array.
; In the expected output both addresses are computed in scalar registers:
; the index is shifted left by 2 and added to 0x104, and the padding access
; uses immediate offset:4 (0x104 = 4 + 256).
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
779define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
780; GFX9-LABEL: store_load_sindex_small_offset_kernel:
781; GFX9:       ; %bb.0: ; %bb
782; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
783; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
784; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
785; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
786; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
787; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
788; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
789; GFX9-NEXT:    s_and_b32 s0, s0, 15
790; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
791; GFX9-NEXT:    s_waitcnt vmcnt(0)
792; GFX9-NEXT:    v_mov_b32_e32 v0, 15
793; GFX9-NEXT:    s_add_u32 s1, 0x104, s1
794; GFX9-NEXT:    scratch_store_dword off, v0, s1
795; GFX9-NEXT:    s_waitcnt vmcnt(0)
796; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
797; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
798; GFX9-NEXT:    s_waitcnt vmcnt(0)
799; GFX9-NEXT:    s_endpgm
800;
801; GFX10-LABEL: store_load_sindex_small_offset_kernel:
802; GFX10:       ; %bb.0: ; %bb
803; GFX10-NEXT:    s_add_u32 s2, s2, s5
804; GFX10-NEXT:    s_addc_u32 s3, s3, 0
805; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
806; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
807; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
808; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
809; GFX10-NEXT:    s_waitcnt vmcnt(0)
810; GFX10-NEXT:    v_mov_b32_e32 v0, 15
811; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX10-NEXT:    s_and_b32 s1, s0, 15
813; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
814; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
815; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
816; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
817; GFX10-NEXT:    scratch_store_dword off, v0, s0
818; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
819; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
820; GFX10-NEXT:    s_waitcnt vmcnt(0)
821; GFX10-NEXT:    s_endpgm
822;
823; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
824; GFX9-PAL:       ; %bb.0: ; %bb
825; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
826; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
827; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
828; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
829; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
830; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
832; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
833; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
834; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
835; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
836; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
837; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
838; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
839; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
840; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
841; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
842; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
843; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
844; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
845; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
846; GFX9-PAL-NEXT:    s_endpgm
847;
848; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
849; GFX10-PAL:       ; %bb.0: ; %bb
850; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
851; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
852; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
853; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
855; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
856; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
857; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
858; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
859; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
860; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
861; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
862; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
863; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
865; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
866; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
867; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
868; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
869; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
870; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
871; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
872; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
873; GFX10-PAL-NEXT:    s_endpgm
874bb:
  ; 64 x i32 = 256 bytes of padding allocated ahead of %i.
875  %padding = alloca [64 x i32], align 4, addrspace(5)
876  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
877  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
878  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not used by any later instruction in this function.
879  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx]; the index is the unmasked kernel argument.
880  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
881  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
882  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
883  %i9 = and i32 %idx, 15
884  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
885  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
886  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
887  ret void
888}
889
; amdgpu_ps function taking %idx as an inreg i32. It performs a volatile
; store of 15 to element %idx of a 32 x float addrspace(5) array, then a
; volatile load from element (%idx & 15). The array is allocated after a
; 256-byte padding array.
; In the expected output %idx is read straight from a scalar register (s2 on
; the non-PAL runs, s0 on the PAL runs) with no s_load, and both addresses are
; formed with scalar shift-by-2 plus s_add_u32 of 0x104.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
890define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
891; GFX9-LABEL: store_load_sindex_small_offset_foo:
892; GFX9:       ; %bb.0: ; %bb
893; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
894; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
895; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
896; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
897; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
898; GFX9-NEXT:    s_waitcnt vmcnt(0)
899; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
900; GFX9-NEXT:    v_mov_b32_e32 v0, 15
901; GFX9-NEXT:    scratch_store_dword off, v0, s0
902; GFX9-NEXT:    s_waitcnt vmcnt(0)
903; GFX9-NEXT:    s_and_b32 s0, s2, 15
904; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
905; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
906; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
907; GFX9-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-NEXT:    s_endpgm
909;
910; GFX10-LABEL: store_load_sindex_small_offset_foo:
911; GFX10:       ; %bb.0: ; %bb
912; GFX10-NEXT:    s_add_u32 s0, s0, s3
913; GFX10-NEXT:    s_addc_u32 s1, s1, 0
914; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
915; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
916; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
917; GFX10-NEXT:    s_waitcnt vmcnt(0)
918; GFX10-NEXT:    s_and_b32 s0, s2, 15
919; GFX10-NEXT:    v_mov_b32_e32 v0, 15
920; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
921; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
922; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
923; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
924; GFX10-NEXT:    scratch_store_dword off, v0, s1
925; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
926; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
927; GFX10-NEXT:    s_waitcnt vmcnt(0)
928; GFX10-NEXT:    s_endpgm
929;
930; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
931; GFX9-PAL:       ; %bb.0: ; %bb
932; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
933; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
934; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
935; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
936; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
938; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
939; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
940; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
941; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
942; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
943; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
944; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
945; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
946; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
947; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
948; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
949; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
950; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
951; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
952; GFX9-PAL-NEXT:    s_endpgm
953;
954; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
955; GFX10-PAL:       ; %bb.0: ; %bb
956; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
957; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
958; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
959; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
961; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
962; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
963; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
964; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
965; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
966; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
967; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
968; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
969; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
970; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
971; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
972; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
973; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
974; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
975; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
976; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
977; GFX10-PAL-NEXT:    s_endpgm
978bb:
  ; 64 x i32 = 256 bytes of padding allocated ahead of %i.
979  %padding = alloca [64 x i32], align 4, addrspace(5)
980  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
981  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
982  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not used by any later instruction in this function.
983  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx]; the index is the unmasked inreg argument.
984  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
985  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
986  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
987  %i9 = and i32 %idx, 15
988  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
989  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
990  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
991  ret void
992}
993
; Kernel with no arguments that indexes a 32 x float addrspace(5) array by
; the workitem id (llvm.amdgcn.workitem.id.x). It performs a volatile store of
; 15 to element id, then a volatile load from element (31 - id). The array is
; allocated after a 256-byte padding array.
; In the expected output the addresses are computed in vector registers:
; v0 is shifted left by 2 and added to / subtracted from 0x104, and the load
; carries immediate offset:124 (= 31 * 4).
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
994define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
995; GFX9-LABEL: store_load_vindex_small_offset_kernel:
996; GFX9:       ; %bb.0: ; %bb
997; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
998; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
999; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1000; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1001; GFX9-NEXT:    s_waitcnt vmcnt(0)
1002; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1003; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
1004; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1005; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1006; GFX9-NEXT:    scratch_store_dword v2, v3, off
1007; GFX9-NEXT:    s_waitcnt vmcnt(0)
1008; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1009; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1010; GFX9-NEXT:    s_waitcnt vmcnt(0)
1011; GFX9-NEXT:    s_endpgm
1012;
1013; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1014; GFX10:       ; %bb.0: ; %bb
1015; GFX10-NEXT:    s_add_u32 s0, s0, s3
1016; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1017; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1018; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1019; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
1020; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1021; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1022; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1023; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1024; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1025; GFX10-NEXT:    s_waitcnt vmcnt(0)
1026; GFX10-NEXT:    scratch_store_dword v2, v3, off
1027; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1028; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1029; GFX10-NEXT:    s_waitcnt vmcnt(0)
1030; GFX10-NEXT:    s_endpgm
1031;
1032; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1033; GFX9-PAL:       ; %bb.0: ; %bb
1034; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1035; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1036; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1037; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1038; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1039; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1040; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1041; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1042; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1043; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1044; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1045; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1046; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1047; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1048; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1049; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1050; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1051; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1052; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1053; GFX9-PAL-NEXT:    s_endpgm
1054;
1055; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
1056; GFX10-PAL:       ; %bb.0: ; %bb
1057; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1058; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1059; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1060; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1061; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1062; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1063; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1064; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1065; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1066; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1067; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1068; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1069; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1070; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1071; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1072; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1073; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1074; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1075; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1076; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1077; GFX10-PAL-NEXT:    s_endpgm
1078bb:
  ; 64 x i32 = 256 bytes of padding allocated ahead of %i.
1079  %padding = alloca [64 x i32], align 4, addrspace(5)
1080  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
1081  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1082  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not used by any later instruction in this function.
1083  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; The workitem id is the array index for both accesses below.
1084  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 is not used by any later instruction in this function.
1085  %i3 = zext i32 %i2 to i64
  ; Store 15 to %i[id].
1086  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1087  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1088  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[31 - id].
1089  %i9 = sub nsw i32 31, %i2
1090  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1091  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1092  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1093  ret void
1094}
1095
; Callable (non-kernel) function taking an i32 %idx (not inreg). It performs
; a volatile store of 15 to element %idx of a 32 x float addrspace(5) array,
; then a volatile load from element (%idx & 15). The array is allocated after
; a 256-byte padding array.
; In the expected output the array base is formed as s32 + 0x100 in a scalar
; register, copied to a vector register, and each address is then built with
; v_lshl_add_u32 (index << 2, plus base). The padding access is a
; scratch_load_dword directly off s32.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
1096define void @store_load_vindex_small_offset_foo(i32 %idx) {
1097; GFX9-LABEL: store_load_vindex_small_offset_foo:
1098; GFX9:       ; %bb.0: ; %bb
1099; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1100; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
1101; GFX9-NEXT:    s_waitcnt vmcnt(0)
1102; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1103; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1104; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1105; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1106; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1107; GFX9-NEXT:    scratch_store_dword v2, v3, off
1108; GFX9-NEXT:    s_waitcnt vmcnt(0)
1109; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1110; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
1111; GFX9-NEXT:    s_waitcnt vmcnt(0)
1112; GFX9-NEXT:    s_setpc_b64 s[30:31]
1113;
1114; GFX10-LABEL: store_load_vindex_small_offset_foo:
1115; GFX10:       ; %bb.0: ; %bb
1116; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1117; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1118; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1119; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1120; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1121; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1122; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1123; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1124; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1125; GFX10-NEXT:    s_waitcnt vmcnt(0)
1126; GFX10-NEXT:    scratch_store_dword v0, v1, off
1127; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1128; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
1129; GFX10-NEXT:    s_waitcnt vmcnt(0)
1130; GFX10-NEXT:    s_setpc_b64 s[30:31]
1131;
1132; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1133; GFX9-PAL:       ; %bb.0: ; %bb
1134; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1135; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
1136; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1137; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1138; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1139; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1140; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1141; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1142; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1143; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1144; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1145; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
1146; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1147; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1148;
1149; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1150; GFX10-PAL:       ; %bb.0: ; %bb
1151; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1152; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1153; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1154; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1155; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1156; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1157; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1158; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1159; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1160; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1161; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1162; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1163; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
1164; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1165; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1166bb:
  ; 64 x i32 = 256 bytes (0x100) of padding allocated ahead of %i; this
  ; matches the 0x100 added to s32 in the expected output above.
1167  %padding = alloca [64 x i32], align 4, addrspace(5)
1168  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
1169  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1170  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 is not used by any later instruction in this function.
1171  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx]; the index is the unmasked argument.
1172  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1173  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1174  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
1175  %i9 = and i32 %idx, 15
1176  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1177  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1178  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1179  ret void
1180}
1181
; Kernel that zero-fills a 64-byte addrspace(5) array which is allocated
; after a 16384-byte (4096 x i32) padding array.
; In the expected output the padding access uses immediate offset:16, while
; the base of the zeroed region (0x4010 = 16 + 0x4000) is materialized into
; vcc_hi (gfx900 runs) or vcc_lo (gfx1030 runs) with s_movk_i32; the four
; scratch_store_dwordx4 then use that register plus immediate offsets 0..48.
; The s_movk_i32 is repeated before each of the later stores in the expected
; output.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
1182define amdgpu_kernel void @zero_init_large_offset_kernel() {
1183; GFX9-LABEL: zero_init_large_offset_kernel:
1184; GFX9:       ; %bb.0:
1185; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1186; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1187; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1188; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1189; GFX9-NEXT:    s_waitcnt vmcnt(0)
1190; GFX9-NEXT:    s_mov_b32 s0, 0
1191; GFX9-NEXT:    s_mov_b32 s1, s0
1192; GFX9-NEXT:    s_mov_b32 s2, s0
1193; GFX9-NEXT:    s_mov_b32 s3, s0
1194; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1195; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1196; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1197; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1198; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1199; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1200; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1201; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1202; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1203; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1204; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1205; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1206; GFX9-NEXT:    s_endpgm
1207;
1208; GFX10-LABEL: zero_init_large_offset_kernel:
1209; GFX10:       ; %bb.0:
1210; GFX10-NEXT:    s_add_u32 s0, s0, s3
1211; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1212; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1213; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1214; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1215; GFX10-NEXT:    s_waitcnt vmcnt(0)
1216; GFX10-NEXT:    s_mov_b32 s0, 0
1217; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1218; GFX10-NEXT:    s_mov_b32 s1, s0
1219; GFX10-NEXT:    s_mov_b32 s2, s0
1220; GFX10-NEXT:    s_mov_b32 s3, s0
1221; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1222; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1223; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1224; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1225; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1226; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1227; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1228; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1229; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1230; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1231; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1232; GFX10-NEXT:    s_endpgm
1233;
1234; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1235; GFX9-PAL:       ; %bb.0:
1236; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1237; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1238; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1239; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1240; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1241; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1243; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1244; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1245; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1246; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1247; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1248; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1249; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1250; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1251; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1252; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1253; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1254; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1255; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1256; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1257; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1258; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1259; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1260; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1261; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1262; GFX9-PAL-NEXT:    s_endpgm
1263;
1264; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
1265; GFX10-PAL:       ; %bb.0:
1266; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1267; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1268; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1269; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1271; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1272; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1273; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1274; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1275; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1276; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1277; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1278; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1279; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1280; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1281; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1282; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1283; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1284; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1285; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1286; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1287; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1288; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1289; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1290; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1291; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1292; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1293; GFX10-PAL-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 bytes (0x4000) of padding allocated ahead of %alloca.
1294  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes, the region cleared by the memset below.
1295  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index.
  ; NOTE(review): presumably present only so the padding alloca stays referenced - confirm.
1296  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1297  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1298  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Write 64 zero bytes over %alloca (non-volatile memset).
1299  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1300  ret void
1301}
1302
1303define void @zero_init_large_offset_foo() {
1304; GFX9-LABEL: zero_init_large_offset_foo:
1305; GFX9:       ; %bb.0:
1306; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1307; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
1308; GFX9-NEXT:    s_waitcnt vmcnt(0)
1309; GFX9-NEXT:    s_mov_b32 s0, 0
1310; GFX9-NEXT:    s_mov_b32 s1, s0
1311; GFX9-NEXT:    s_mov_b32 s2, s0
1312; GFX9-NEXT:    s_mov_b32 s3, s0
1313; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1314; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1315; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1316; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1317; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1318; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1319; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1320; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1321; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1322; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1323; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1324; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1325; GFX9-NEXT:    s_waitcnt vmcnt(0)
1326; GFX9-NEXT:    s_setpc_b64 s[30:31]
1327;
1328; GFX10-LABEL: zero_init_large_offset_foo:
1329; GFX10:       ; %bb.0:
1330; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1331; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1332; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1333; GFX10-NEXT:    s_waitcnt vmcnt(0)
1334; GFX10-NEXT:    s_mov_b32 s0, 0
1335; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1336; GFX10-NEXT:    s_mov_b32 s1, s0
1337; GFX10-NEXT:    s_mov_b32 s2, s0
1338; GFX10-NEXT:    s_mov_b32 s3, s0
1339; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1340; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1341; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1342; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1343; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1344; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1345; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1346; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1347; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1348; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1349; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1350; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1351; GFX10-NEXT:    s_setpc_b64 s[30:31]
1352;
1353; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1354; GFX9-PAL:       ; %bb.0:
1355; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1356; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
1357; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1358; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1359; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1360; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1361; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1362; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1363; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1364; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1365; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1366; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1367; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1368; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1369; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1370; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1371; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1372; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1373; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1374; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1375; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1376;
1377; GFX10-PAL-LABEL: zero_init_large_offset_foo:
1378; GFX10-PAL:       ; %bb.0:
1379; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1381; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1382; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1383; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1384; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1385; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1386; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1387; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1388; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1389; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1390; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1391; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1392; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1393; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1394; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1395; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1396; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1397; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1398; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1399; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1400; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1401  %padding = alloca [4096 x i32], align 4, addrspace(5)
1402  %alloca = alloca [32 x i16], align 2, addrspace(5)
1403  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1404  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1405  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1406  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1407  ret void
1408}
1409
; Kernel variant of the "scalar index, large offset" test.
; Performs a volatile store of 15 to %i[%idx] and a volatile load from
; %i[%idx & 15], where %idx is a kernel argument. %i is allocated after a
; [4096 x i32] padding alloca. The expected output for every run line scales
; the index with s_lshl_b32 and adds the literal 0x4004 with s_add_u32 to form
; the SGPR address used by each scratch_store_dword / scratch_load_dword.
1410define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1411; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1412; GFX9:       ; %bb.0: ; %bb
1413; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1414; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1415; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1416; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1417; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1418; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1419; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1420; GFX9-NEXT:    s_and_b32 s0, s0, 15
1421; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1422; GFX9-NEXT:    s_waitcnt vmcnt(0)
1423; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1424; GFX9-NEXT:    s_add_u32 s1, 0x4004, s1
1425; GFX9-NEXT:    scratch_store_dword off, v0, s1
1426; GFX9-NEXT:    s_waitcnt vmcnt(0)
1427; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1428; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1429; GFX9-NEXT:    s_waitcnt vmcnt(0)
1430; GFX9-NEXT:    s_endpgm
1431;
1432; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1433; GFX10:       ; %bb.0: ; %bb
1434; GFX10-NEXT:    s_add_u32 s2, s2, s5
1435; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1436; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1437; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1438; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1439; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1440; GFX10-NEXT:    s_waitcnt vmcnt(0)
1441; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1442; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX10-NEXT:    s_and_b32 s1, s0, 15
1444; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1445; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1446; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1447; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1448; GFX10-NEXT:    scratch_store_dword off, v0, s0
1449; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1450; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1451; GFX10-NEXT:    s_waitcnt vmcnt(0)
1452; GFX10-NEXT:    s_endpgm
1453;
1454; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1455; GFX9-PAL:       ; %bb.0: ; %bb
1456; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1457; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1458; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1459; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1460; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1461; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1462; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1463; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1464; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1465; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1466; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1467; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1468; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1469; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1470; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1471; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1472; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1473; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1474; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1475; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1476; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1477; GFX9-PAL-NEXT:    s_endpgm
1478;
1479; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
1480; GFX10-PAL:       ; %bb.0: ; %bb
1481; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
1482; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
1483; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1484; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1485; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1486; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
1487; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
1488; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1489; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1490; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1491; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1492; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1493; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1494; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1496; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1497; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1498; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1499; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1500; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1501; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1502; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1503; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1504; GFX10-PAL-NEXT:    s_endpgm
1505bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of %i.
1506  %padding = alloca [4096 x i32], align 4, addrspace(5)
1507  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding alloca at an undef index; it shows up as
  ; the first scratch_load_dword in the expected output.
1508  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1509  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no users in this function.
1510  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
1511  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1512  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1513  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
1514  %i9 = and i32 %idx, 15
1515  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1516  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1517  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1518  ret void
1519}
1520
; amdgpu_ps variant of the "scalar index, large offset" test.
; %idx is passed as an inreg argument. The body matches the kernel variant:
; a volatile store of 15 to %i[%idx] and a volatile load from %i[%idx & 15],
; with %i allocated after a [4096 x i32] padding alloca. The expected output
; computes both addresses in SGPRs (s_lshl_b32 / s_and_b32, then s_add_u32 with
; the literal 0x4004) and ends with s_endpgm.
1521define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
1522; GFX9-LABEL: store_load_sindex_large_offset_foo:
1523; GFX9:       ; %bb.0: ; %bb
1524; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1525; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1526; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1527; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1528; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1529; GFX9-NEXT:    s_waitcnt vmcnt(0)
1530; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1531; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1532; GFX9-NEXT:    scratch_store_dword off, v0, s0
1533; GFX9-NEXT:    s_waitcnt vmcnt(0)
1534; GFX9-NEXT:    s_and_b32 s0, s2, 15
1535; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1536; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1537; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1538; GFX9-NEXT:    s_waitcnt vmcnt(0)
1539; GFX9-NEXT:    s_endpgm
1540;
1541; GFX10-LABEL: store_load_sindex_large_offset_foo:
1542; GFX10:       ; %bb.0: ; %bb
1543; GFX10-NEXT:    s_add_u32 s0, s0, s3
1544; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1545; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1546; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1547; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1548; GFX10-NEXT:    s_waitcnt vmcnt(0)
1549; GFX10-NEXT:    s_and_b32 s0, s2, 15
1550; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1551; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1552; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1553; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1554; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1555; GFX10-NEXT:    scratch_store_dword off, v0, s1
1556; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1557; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1558; GFX10-NEXT:    s_waitcnt vmcnt(0)
1559; GFX10-NEXT:    s_endpgm
1560;
1561; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
1562; GFX9-PAL:       ; %bb.0: ; %bb
1563; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1564; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1565; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1566; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1567; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1569; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1570; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1571; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1572; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1573; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1574; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1575; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1576; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1577; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1578; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1579; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1580; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1581; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1582; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1583; GFX9-PAL-NEXT:    s_endpgm
1584;
1585; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
1586; GFX10-PAL:       ; %bb.0: ; %bb
1587; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1588; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1589; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1590; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1592; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1593; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1594; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1595; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1596; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1597; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1598; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1599; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1600; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1601; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1602; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1603; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1604; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1605; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1606; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1607; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1608; GFX10-PAL-NEXT:    s_endpgm
1609bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of %i.
1610  %padding = alloca [4096 x i32], align 4, addrspace(5)
1611  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding alloca at an undef index; it shows up as
  ; the first scratch_load_dword in the expected output.
1612  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1613  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no users in this function.
1614  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
1615  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1616  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1617  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
1618  %i9 = and i32 %idx, 15
1619  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1620  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1621  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1622  ret void
1623}
1624
; Kernel variant of the "vector index, large offset" test.
; The index is the result of @llvm.amdgcn.workitem.id.x(). The kernel does a
; volatile store of 15 to %i[id] and a volatile load from %i[31 - id], with %i
; allocated after a [4096 x i32] padding alloca. The expected output
; materializes 0x4004 in a VGPR, adds the scaled index for the store address,
; and subtracts it for the load address, which is then accessed with
; offset:124 (31 * 4 bytes).
1625define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
1626; GFX9-LABEL: store_load_vindex_large_offset_kernel:
1627; GFX9:       ; %bb.0: ; %bb
1628; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1629; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1630; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1631; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1632; GFX9-NEXT:    s_waitcnt vmcnt(0)
1633; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1634; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
1635; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1636; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1637; GFX9-NEXT:    scratch_store_dword v2, v3, off
1638; GFX9-NEXT:    s_waitcnt vmcnt(0)
1639; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1640; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1641; GFX9-NEXT:    s_waitcnt vmcnt(0)
1642; GFX9-NEXT:    s_endpgm
1643;
1644; GFX10-LABEL: store_load_vindex_large_offset_kernel:
1645; GFX10:       ; %bb.0: ; %bb
1646; GFX10-NEXT:    s_add_u32 s0, s0, s3
1647; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1648; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1649; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1650; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
1651; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1652; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1653; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1654; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1655; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1656; GFX10-NEXT:    s_waitcnt vmcnt(0)
1657; GFX10-NEXT:    scratch_store_dword v2, v3, off
1658; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1659; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1660; GFX10-NEXT:    s_waitcnt vmcnt(0)
1661; GFX10-NEXT:    s_endpgm
1662;
1663; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
1664; GFX9-PAL:       ; %bb.0: ; %bb
1665; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1666; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1667; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1668; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1669; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1670; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1671; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1672; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1673; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1674; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1675; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1676; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1677; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1678; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1679; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1680; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1681; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1682; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1683; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1684; GFX9-PAL-NEXT:    s_endpgm
1685;
1686; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
1687; GFX10-PAL:       ; %bb.0: ; %bb
1688; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1689; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1690; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1691; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1692; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1693; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1694; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1695; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1696; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1697; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1698; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1699; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1700; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1701; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1702; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
1703; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1704; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1705; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1706; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1707; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1708; GFX10-PAL-NEXT:    s_endpgm
1709bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of %i.
1710  %padding = alloca [4096 x i32], align 4, addrspace(5)
1711  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding alloca at an undef index; it shows up as
  ; the scratch_load_dword with offset:4 in the expected output.
1712  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1713  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 and %i3 have no users in this function.
1714  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1715  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1716  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[%i2].
1717  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1718  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1719  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - %i2].
1720  %i9 = sub nsw i32 31, %i2
1721  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1722  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1723  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1724  ret void
1725}
1726
; Non-kernel function variant of the "vector index, large offset" test.
; %idx is an ordinary (non-inreg) argument. The function does a volatile store
; of 15 to %i[%idx] and a volatile load from %i[%idx & 15], with %i allocated
; after a [4096 x i32] padding alloca. The expected output forms the base of %i
; as s32 + 0x4000 in vcc_hi (gfx900 runs) or vcc_lo (gfx1030 runs), copies it
; to a VGPR, and combines it with the index via v_lshl_add_u32. The function
; returns with s_setpc_b64 s[30:31].
1727define void @store_load_vindex_large_offset_foo(i32 %idx) {
1728; GFX9-LABEL: store_load_vindex_large_offset_foo:
1729; GFX9:       ; %bb.0: ; %bb
1730; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1731; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
1732; GFX9-NEXT:    s_waitcnt vmcnt(0)
1733; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1734; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1735; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1736; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1737; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1738; GFX9-NEXT:    scratch_store_dword v2, v3, off
1739; GFX9-NEXT:    s_waitcnt vmcnt(0)
1740; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1741; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
1742; GFX9-NEXT:    s_waitcnt vmcnt(0)
1743; GFX9-NEXT:    s_setpc_b64 s[30:31]
1744;
1745; GFX10-LABEL: store_load_vindex_large_offset_foo:
1746; GFX10:       ; %bb.0: ; %bb
1747; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1748; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1749; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1750; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1751; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1752; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1753; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1754; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1755; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1756; GFX10-NEXT:    s_waitcnt vmcnt(0)
1757; GFX10-NEXT:    scratch_store_dword v0, v1, off
1758; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1759; GFX10-NEXT:    scratch_load_dword v0, v2, off glc dlc
1760; GFX10-NEXT:    s_waitcnt vmcnt(0)
1761; GFX10-NEXT:    s_setpc_b64 s[30:31]
1762;
1763; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
1764; GFX9-PAL:       ; %bb.0: ; %bb
1765; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1766; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
1767; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1768; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1769; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1770; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1771; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1772; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1773; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1774; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1775; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1776; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
1777; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1778; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1779;
1780; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
1781; GFX10-PAL:       ; %bb.0: ; %bb
1782; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1783; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1784; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1785; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1786; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1787; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1788; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1789; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1790; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1791; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1792; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1793; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1794; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off glc dlc
1795; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1796; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1797bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding declared ahead of %i.
1798  %padding = alloca [4096 x i32], align 4, addrspace(5)
1799  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding alloca at an undef index; it shows up as
  ; the scratch_load_dword addressed by s32 in the expected output.
1800  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1801  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no users in this function.
1802  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to %i[%idx].
1803  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1804  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1805  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
1806  %i9 = and i32 %idx, 15
1807  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1808  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1809  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1810  ret void
1811}
1812
; Kernel variant of the "large immediate offset" test.
; A single [4096 x i32] alloca is written at an undef index (value 13) and then
; written and read back at constant element 4000, i.e. byte offset 16000 from
; the start of the array. The expected output splits that offset between an
; SGPR and the instruction's immediate offset field: 0x3000 + offset:3712 on
; the gfx900 runs and 0x3800 + offset:1664 on the gfx1030 runs (both sum to
; 16000), with a further "s_add_u32 s0, 4, s0" applied to the SGPR part.
1813define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
1814; GFX9-LABEL: store_load_large_imm_offset_kernel:
1815; GFX9:       ; %bb.0: ; %bb
1816; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1817; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1818; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1819; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1820; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1821; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1822; GFX9-NEXT:    s_waitcnt vmcnt(0)
1823; GFX9-NEXT:    s_add_u32 s0, 4, s0
1824; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1825; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1826; GFX9-NEXT:    s_waitcnt vmcnt(0)
1827; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
1828; GFX9-NEXT:    s_waitcnt vmcnt(0)
1829; GFX9-NEXT:    s_endpgm
1830;
1831; GFX10-LABEL: store_load_large_imm_offset_kernel:
1832; GFX10:       ; %bb.0: ; %bb
1833; GFX10-NEXT:    s_add_u32 s0, s0, s3
1834; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1835; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1836; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1837; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1838; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1839; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1840; GFX10-NEXT:    s_add_u32 s0, 4, s0
1841; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
1842; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1843; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1844; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1845; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
1846; GFX10-NEXT:    s_waitcnt vmcnt(0)
1847; GFX10-NEXT:    s_endpgm
1848;
1849; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
1850; GFX9-PAL:       ; %bb.0: ; %bb
1851; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1852; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1853; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1854; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1855; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1856; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1857; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1858; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1859; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1860; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1861; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1862; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1863; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
1864; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1865; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1866; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1867; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
1868; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1869; GFX9-PAL-NEXT:    s_endpgm
1870;
1871; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
1872; GFX10-PAL:       ; %bb.0: ; %bb
1873; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1874; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1875; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1876; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1878; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1879; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1880; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1881; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1882; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1883; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1884; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1885; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
1886; GFX10-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
1887; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1888; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1889; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1890; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
1891; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1892; GFX10-PAL-NEXT:    s_endpgm
1893bb:
1894  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index of %i.
1895  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1896  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000 (byte offset 4000 * 4 = 16000).
1897  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1898  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element 4000.
1899  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1900  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1901  ret void
1902}
1903
; Non-kernel function variant of the "large immediate offset" test.
; The body matches the kernel variant: a [4096 x i32] alloca is written at an
; undef index (value 13), then written and read back at constant element 4000
; (byte offset 16000). The expected output stores the first value at s32 and
; forms the other address as s32 plus an SGPR constant plus an immediate
; offset: 0x3000 + offset:3712 on the gfx900 runs and 0x3800 + offset:1664 on
; the gfx1030 runs. The function returns with s_setpc_b64 s[30:31].
1904define void @store_load_large_imm_offset_foo() {
1905; GFX9-LABEL: store_load_large_imm_offset_foo:
1906; GFX9:       ; %bb.0: ; %bb
1907; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1909; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1910; GFX9-NEXT:    scratch_store_dword off, v0, s32
1911; GFX9-NEXT:    s_waitcnt vmcnt(0)
1912; GFX9-NEXT:    s_add_u32 s0, s32, s0
1913; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1914; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1915; GFX9-NEXT:    s_waitcnt vmcnt(0)
1916; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
1917; GFX9-NEXT:    s_waitcnt vmcnt(0)
1918; GFX9-NEXT:    s_setpc_b64 s[30:31]
1919;
1920; GFX10-LABEL: store_load_large_imm_offset_foo:
1921; GFX10:       ; %bb.0: ; %bb
1922; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1923; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1924; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1925; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1926; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1927; GFX10-NEXT:    s_add_u32 s0, s32, s0
1928; GFX10-NEXT:    scratch_store_dword off, v0, s32
1929; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1930; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1931; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1932; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
1933; GFX10-NEXT:    s_waitcnt vmcnt(0)
1934; GFX10-NEXT:    s_setpc_b64 s[30:31]
1935;
1936; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
1937; GFX9-PAL:       ; %bb.0: ; %bb
1938; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1939; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1940; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1941; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
1942; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1943; GFX9-PAL-NEXT:    s_add_u32 s0, s32, s0
1944; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1945; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1946; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1947; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
1948; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1949; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1950;
1951; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
1952; GFX10-PAL:       ; %bb.0: ; %bb
1953; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1954; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1955; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1956; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1957; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1958; GFX10-PAL-NEXT:    s_add_u32 s0, s32, s0
1959; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
1960; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1961; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1962; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1963; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
1964; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1965; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1966bb:
1967  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index of %i.
1968  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1969  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000 (byte offset 4000 * 4 = 16000).
1970  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1971  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element 4000.
1972  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1973  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1974  ret void
1975}
1976
; Kernel test: volatile store followed by a volatile load of one i32 element of
; a private (addrspace(5)) array. The element index is the sum of the kernel
; argument %sidx, the workitem id x, and the constant 256.
; The expected output for all four check prefixes is a single
; scratch_store_dword / scratch_load_dword pair using the computed VGPR address
; with an immediate `offset:1024` (presumably the constant 256 * 4 bytes folded
; into the instruction offset -- confirm).
1977define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
1978; GFX9-LABEL: store_load_vidx_sidx_offset:
1979; GFX9:       ; %bb.0: ; %bb
1980; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1981; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1982; GFX9-NEXT:    v_mov_b32_e32 v1, 4
1983; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1984; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
1986; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1987; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1988; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
1989; GFX9-NEXT:    s_waitcnt vmcnt(0)
1990; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
1991; GFX9-NEXT:    s_waitcnt vmcnt(0)
1992; GFX9-NEXT:    s_endpgm
1993;
1994; GFX10-LABEL: store_load_vidx_sidx_offset:
1995; GFX10:       ; %bb.0: ; %bb
1996; GFX10-NEXT:    s_add_u32 s2, s2, s5
1997; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1998; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1999; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2000; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
2001; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2002; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2003; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
2004; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
2005; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
2006; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2007; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
2008; GFX10-NEXT:    s_waitcnt vmcnt(0)
2009; GFX10-NEXT:    s_endpgm
2010;
2011; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
2012; GFX9-PAL:       ; %bb.0: ; %bb
2013; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
2014; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
2015; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2016; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
2017; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
2018; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2020; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
2021; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
2022; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2023; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
2024; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2025; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
2026; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2027; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
2028; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2029; GFX9-PAL-NEXT:    s_endpgm
2030;
2031; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
2032; GFX10-PAL:       ; %bb.0: ; %bb
2033; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
2034; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
2035; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2036; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2038; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
2039; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
2040; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2041; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2042; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
2043; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2044; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2045; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
2046; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
2047; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
2048; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2049; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
2050; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2051; GFX10-PAL-NEXT:    s_endpgm
2052bb:
  ; The block label `bb` appears in the expected `%bb.0` comment lines above, so
  ; it must not be renamed without regenerating the assertions.
2053  %alloca = alloca [32 x i32], align 4, addrspace(5)
2054  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; index = %sidx + %vidx + 256
2055  %add1 = add nsw i32 %sidx, %vidx
2056  %add2 = add nsw i32 %add1, 256
  ; NOTE(review): the array has only 32 elements, so an index of 256 or more is
  ; outside it even though the GEP is `inbounds`; presumably intentional to
  ; exercise the constant-offset path -- confirm before changing.
2057  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  ; Both accesses are volatile, so the store and the otherwise unused load must
  ; both appear in the output.
2058  store volatile i32 15, i32 addrspace(5)* %gep, align 4
2059  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
2060  ret void
2061}
2062
; Non-kernel function test: volatile i64 store of 15 followed by a volatile i64
; load through the private (addrspace(5)) pointer %arg, both with `align 8`.
; The expected output for every check prefix is one scratch_store_dwordx2 and
; one scratch_load_dwordx2 addressed by v0, followed by s_setpc_b64 s[30:31].
2063define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
2064; GFX9-LABEL: store_load_i64_aligned:
2065; GFX9:       ; %bb.0: ; %bb
2066; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2067; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2068; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2069; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2070; GFX9-NEXT:    s_waitcnt vmcnt(0)
2071; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2072; GFX9-NEXT:    s_waitcnt vmcnt(0)
2073; GFX9-NEXT:    s_setpc_b64 s[30:31]
2074;
2075; GFX10-LABEL: store_load_i64_aligned:
2076; GFX10:       ; %bb.0: ; %bb
2077; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2078; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2079; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2080; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2081; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2082; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2083; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2084; GFX10-NEXT:    s_waitcnt vmcnt(0)
2085; GFX10-NEXT:    s_setpc_b64 s[30:31]
2086;
2087; GFX9-PAL-LABEL: store_load_i64_aligned:
2088; GFX9-PAL:       ; %bb.0: ; %bb
2089; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2090; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2091; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2092; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2093; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2094; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2095; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2096; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2097;
2098; GFX10-PAL-LABEL: store_load_i64_aligned:
2099; GFX10-PAL:       ; %bb.0: ; %bb
2100; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2101; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2102; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2103; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2104; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2105; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2106; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2107; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2108; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2109bb:
  ; Volatile keeps both accesses in the output even though %load is unused.
2110  store volatile i64 15, i64 addrspace(5)* %arg, align 8
2111  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
2112  ret void
2113}
2114
; Non-kernel function test: same volatile i64 store of 15 and volatile i64 load
; through the private (addrspace(5)) pointer %arg, but with `align 1`.
; The expected output for every check prefix is still one scratch_store_dwordx2
; and one scratch_load_dwordx2, i.e. the access is not split into narrower
; pieces despite the byte alignment.
2115define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
2116; GFX9-LABEL: store_load_i64_unaligned:
2117; GFX9:       ; %bb.0: ; %bb
2118; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2119; GFX9-NEXT:    v_mov_b32_e32 v1, 15
2120; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2121; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2122; GFX9-NEXT:    s_waitcnt vmcnt(0)
2123; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2124; GFX9-NEXT:    s_waitcnt vmcnt(0)
2125; GFX9-NEXT:    s_setpc_b64 s[30:31]
2126;
2127; GFX10-LABEL: store_load_i64_unaligned:
2128; GFX10:       ; %bb.0: ; %bb
2129; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2130; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2131; GFX10-NEXT:    v_mov_b32_e32 v1, 15
2132; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2133; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2134; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2135; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2136; GFX10-NEXT:    s_waitcnt vmcnt(0)
2137; GFX10-NEXT:    s_setpc_b64 s[30:31]
2138;
2139; GFX9-PAL-LABEL: store_load_i64_unaligned:
2140; GFX9-PAL:       ; %bb.0: ; %bb
2141; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2142; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2143; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2144; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2145; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2146; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
2147; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2148; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2149;
2150; GFX10-PAL-LABEL: store_load_i64_unaligned:
2151; GFX10-PAL:       ; %bb.0: ; %bb
2152; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2153; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2154; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2155; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2156; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2157; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2158; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
2159; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2160; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2161bb:
  ; `align 1` is the only difference from the aligned i64 variant of this test.
2162  store volatile i64 15, i64 addrspace(5)* %arg, align 1
2163  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
2164  ret void
2165}
2166
; Non-kernel function test: volatile store of <1, 2, 3> and volatile load of a
; <3 x i32> vector through the private (addrspace(5)) pointer %arg, both with
; `align 1`.
; The expected output for every check prefix is one scratch_store_dwordx3 and
; one scratch_load_dwordx3 addressed by v0.
2167define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
2168; GFX9-LABEL: store_load_v3i32_unaligned:
2169; GFX9:       ; %bb.0: ; %bb
2170; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2171; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2172; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2173; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2174; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2175; GFX9-NEXT:    s_waitcnt vmcnt(0)
2176; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
2177; GFX9-NEXT:    s_waitcnt vmcnt(0)
2178; GFX9-NEXT:    s_setpc_b64 s[30:31]
2179;
2180; GFX10-LABEL: store_load_v3i32_unaligned:
2181; GFX10:       ; %bb.0: ; %bb
2182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2183; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2184; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2185; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2186; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2187; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2188; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2189; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
2190; GFX10-NEXT:    s_waitcnt vmcnt(0)
2191; GFX10-NEXT:    s_setpc_b64 s[30:31]
2192;
2193; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
2194; GFX9-PAL:       ; %bb.0: ; %bb
2195; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2196; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2197; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2198; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2199; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2200; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2201; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
2202; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2203; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2204;
2205; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
2206; GFX10-PAL:       ; %bb.0: ; %bb
2207; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2208; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2209; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2210; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2211; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2212; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
2213; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2214; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
2215; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2216; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2217bb:
  ; Volatile keeps both accesses in the output even though %load is unused.
2218  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
2219  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
2220  ret void
2221}
2222
; Non-kernel function test: volatile store of <1, 2, 3, 4> and volatile load of
; a <4 x i32> vector through the private (addrspace(5)) pointer %arg, both with
; `align 1`.
; The expected output for every check prefix is one scratch_store_dwordx4 and
; one scratch_load_dwordx4 addressed by v0.
2223define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
2224; GFX9-LABEL: store_load_v4i32_unaligned:
2225; GFX9:       ; %bb.0: ; %bb
2226; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2227; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2228; GFX9-NEXT:    v_mov_b32_e32 v2, 2
2229; GFX9-NEXT:    v_mov_b32_e32 v3, 3
2230; GFX9-NEXT:    v_mov_b32_e32 v4, 4
2231; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2232; GFX9-NEXT:    s_waitcnt vmcnt(0)
2233; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
2234; GFX9-NEXT:    s_waitcnt vmcnt(0)
2235; GFX9-NEXT:    s_setpc_b64 s[30:31]
2236;
2237; GFX10-LABEL: store_load_v4i32_unaligned:
2238; GFX10:       ; %bb.0: ; %bb
2239; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2240; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2241; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2242; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2243; GFX10-NEXT:    v_mov_b32_e32 v3, 3
2244; GFX10-NEXT:    v_mov_b32_e32 v4, 4
2245; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2246; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2247; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
2248; GFX10-NEXT:    s_waitcnt vmcnt(0)
2249; GFX10-NEXT:    s_setpc_b64 s[30:31]
2250;
2251; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
2252; GFX9-PAL:       ; %bb.0: ; %bb
2253; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2254; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
2255; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
2256; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
2257; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
2258; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2259; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2260; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
2261; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2262; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2263;
2264; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
2265; GFX10-PAL:       ; %bb.0: ; %bb
2266; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2267; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2268; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
2269; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
2270; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
2271; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
2272; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
2273; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2274; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
2275; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2276; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2277bb:
  ; Volatile keeps both accesses in the output even though %load is unused.
2278  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
2279  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
2280  ret void
2281}
2282
; Intrinsic declarations used by the test functions in this file.
; memset over a private (addrspace(5)) i8 pointer with an i64 length; presumably
; used by functions earlier in the file (not visible in this section) -- confirm.
2283declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
; Returns the workitem id in the x dimension; called by
; @store_load_vidx_sidx_offset to form a per-workitem index.
2284declare i32 @llvm.amdgcn.workitem.id.x()
2285