1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s
6
; Kernel entry point that zero-fills a 64-byte addrspace(5) alloca
; ([32 x i16]) through llvm.memset. The assertions below expect the memset to
; be emitted as four scratch_store_dwordx4 instructions at offsets 64, 48, 32
; and 16 for every run line. In the gfx900 runs each store is addressed
; through vcc_hi, which is loaded with 0 before use; in the gfx1030 runs the
; SGPR operand is "off". The amdpal runs additionally expect the flat-scratch
; base to be fetched with s_getpc_b64 + s_load_dwordx2 and masked with 0xffff
; before it is installed.
7define amdgpu_kernel void @zero_init_kernel() {
8; GFX9-LABEL: zero_init_kernel:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
11; GFX9-NEXT:    s_mov_b32 s0, 0
12; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
13; GFX9-NEXT:    s_mov_b32 s1, s0
14; GFX9-NEXT:    s_mov_b32 s2, s0
15; GFX9-NEXT:    s_mov_b32 s3, s0
16; GFX9-NEXT:    v_mov_b32_e32 v0, s0
17; GFX9-NEXT:    v_mov_b32_e32 v1, s1
18; GFX9-NEXT:    v_mov_b32_e32 v2, s2
19; GFX9-NEXT:    v_mov_b32_e32 v3, s3
20; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
21; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
22; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
23; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
24; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
25; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
26; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
28; GFX9-NEXT:    s_endpgm
29;
30; GFX10-LABEL: zero_init_kernel:
31; GFX10:       ; %bb.0:
32; GFX10-NEXT:    s_add_u32 s0, s0, s3
33; GFX10-NEXT:    s_addc_u32 s1, s1, 0
34; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
35; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
36; GFX10-NEXT:    s_mov_b32 s0, 0
37; GFX10-NEXT:    ; implicit-def: $vcc_hi
38; GFX10-NEXT:    s_mov_b32 s1, s0
39; GFX10-NEXT:    s_mov_b32 s2, s0
40; GFX10-NEXT:    s_mov_b32 s3, s0
41; GFX10-NEXT:    v_mov_b32_e32 v0, s0
42; GFX10-NEXT:    v_mov_b32_e32 v1, s1
43; GFX10-NEXT:    v_mov_b32_e32 v2, s2
44; GFX10-NEXT:    v_mov_b32_e32 v3, s3
45; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
49; GFX10-NEXT:    s_endpgm
50;
51; GFX9-PAL-LABEL: zero_init_kernel:
52; GFX9-PAL:       ; %bb.0:
53; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
54; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
55; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
56; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
57; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
58; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
60; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
61; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
62; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
63; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
64; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
65; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
66; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
67; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
68; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
69; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
70; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
71; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
72; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
73; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
74; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
75; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
76; GFX9-PAL-NEXT:    s_endpgm
77;
78; GFX10-PAL-LABEL: zero_init_kernel:
79; GFX10-PAL:       ; %bb.0:
80; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
81; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
82; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
83; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
85; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
86; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
87; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
88; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
89; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
90; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
91; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
92; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
93; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
94; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
95; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
96; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
97; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
98; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
99; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
100; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
101; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
102; GFX10-PAL-NEXT:    s_endpgm
  ; 32 x i16 = 64 bytes of addrspace(5) storage, 2-byte aligned.
103  %alloca = alloca [32 x i16], align 2, addrspace(5)
104  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile memset covering the whole 64-byte object with zero.
105  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
106  ret void
107}
108
; Ordinary (non-kernel) function with the same body as the kernel variant:
; a 64-byte addrspace(5) alloca is zero-filled with llvm.memset. The
; assertions expect four scratch_store_dwordx4 instructions addressed
; relative to s32 (offsets 48, 32, 16 and none), no flat_scratch setup in the
; function body, and a return through s_setpc_b64 s[30:31]. The gfx1030 runs
; additionally expect s_waitcnt_vscnt at entry and before the return.
109define void @zero_init_foo() {
110; GFX9-LABEL: zero_init_foo:
111; GFX9:       ; %bb.0:
112; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX9-NEXT:    s_mov_b32 s0, 0
114; GFX9-NEXT:    s_mov_b32 s1, s0
115; GFX9-NEXT:    s_mov_b32 s2, s0
116; GFX9-NEXT:    s_mov_b32 s3, s0
117; GFX9-NEXT:    v_mov_b32_e32 v0, s0
118; GFX9-NEXT:    v_mov_b32_e32 v1, s1
119; GFX9-NEXT:    v_mov_b32_e32 v2, s2
120; GFX9-NEXT:    v_mov_b32_e32 v3, s3
121; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
122; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
123; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
124; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
125; GFX9-NEXT:    s_waitcnt vmcnt(0)
126; GFX9-NEXT:    s_setpc_b64 s[30:31]
127;
128; GFX10-LABEL: zero_init_foo:
129; GFX10:       ; %bb.0:
130; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
132; GFX10-NEXT:    s_mov_b32 s0, 0
133; GFX10-NEXT:    ; implicit-def: $vcc_hi
134; GFX10-NEXT:    s_mov_b32 s1, s0
135; GFX10-NEXT:    s_mov_b32 s2, s0
136; GFX10-NEXT:    s_mov_b32 s3, s0
137; GFX10-NEXT:    v_mov_b32_e32 v0, s0
138; GFX10-NEXT:    v_mov_b32_e32 v1, s1
139; GFX10-NEXT:    v_mov_b32_e32 v2, s2
140; GFX10-NEXT:    v_mov_b32_e32 v3, s3
141; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
142; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
143; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
144; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
145; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
146; GFX10-NEXT:    s_setpc_b64 s[30:31]
147;
148; GFX9-PAL-LABEL: zero_init_foo:
149; GFX9-PAL:       ; %bb.0:
150; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
152; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
153; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
154; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
155; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
156; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
157; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
158; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
159; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
160; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
161; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
162; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
163; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
164; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX10-PAL-LABEL: zero_init_foo:
167; GFX10-PAL:       ; %bb.0:
168; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
170; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
171; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
172; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
173; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
174; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
175; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
176; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
177; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
178; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
179; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
180; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
181; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
182; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
183; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 32 x i16 = 64 bytes of addrspace(5) storage, 2-byte aligned.
185  %alloca = alloca [32 x i16], align 2, addrspace(5)
186  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile memset covering the whole 64-byte object with zero.
187  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
188  ret void
189}
190
; Kernel taking an i32 %idx argument. It does a volatile store of 15 to
; element %idx of a [32 x float] addrspace(5) alloca, followed by a volatile
; load from element (%idx & 15). The assertions expect %idx to be fetched
; with s_load_dword from s[0:1] at 0x24, both addresses to be computed with
; scalar instructions (shift left by 2, add 4), and the accesses to be
; scratch_store_dword / scratch_load_dword using an SGPR address with the
; VGPR address operand "off".
191define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
192; GFX9-LABEL: store_load_sindex_kernel:
193; GFX9:       ; %bb.0: ; %bb
194; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
195; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
196; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
197; GFX9-NEXT:    v_mov_b32_e32 v0, 15
198; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
200; GFX9-NEXT:    s_and_b32 s0, s0, 15
201; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
202; GFX9-NEXT:    s_add_u32 s1, 4, s1
203; GFX9-NEXT:    scratch_store_dword off, v0, s1
204; GFX9-NEXT:    s_add_u32 s0, 4, s0
205; GFX9-NEXT:    scratch_load_dword v0, off, s0
206; GFX9-NEXT:    s_endpgm
207;
208; GFX10-LABEL: store_load_sindex_kernel:
209; GFX10:       ; %bb.0: ; %bb
210; GFX10-NEXT:    s_add_u32 s2, s2, s5
211; GFX10-NEXT:    s_addc_u32 s3, s3, 0
212; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
213; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
214; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
215; GFX10-NEXT:    v_mov_b32_e32 v0, 15
216; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX10-NEXT:    s_and_b32 s1, s0, 15
218; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
219; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
220; GFX10-NEXT:    s_add_u32 s0, 4, s0
221; GFX10-NEXT:    s_add_u32 s1, 4, s1
222; GFX10-NEXT:    scratch_store_dword off, v0, s0
223; GFX10-NEXT:    scratch_load_dword v0, off, s1
224; GFX10-NEXT:    s_endpgm
225;
226; GFX9-PAL-LABEL: store_load_sindex_kernel:
227; GFX9-PAL:       ; %bb.0: ; %bb
228; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
229; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
230; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
231; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
232; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
233; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
235; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
236; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
237; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
238; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
239; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
240; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
241; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
242; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
243; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
244; GFX9-PAL-NEXT:    s_endpgm
245;
246; GFX10-PAL-LABEL: store_load_sindex_kernel:
247; GFX10-PAL:       ; %bb.0: ; %bb
248; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
249; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
250; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
251; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
253; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
254; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
255; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
256; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
257; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
258; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
259; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
261; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
262; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
263; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
264; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
265; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
266; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
267; GFX10-PAL-NEXT:    s_endpgm
268bb:
269  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 has no users in this function.
270  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx.
271  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
272  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
273  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15); the loaded value is unused.
274  %i9 = and i32 %idx, 15
275  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
276  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
277  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
278  ret void
279}
280
; amdgpu_ps function taking %idx as an inreg i32. The body matches the kernel
; variant: a volatile store of 15 to element %idx of a [32 x float]
; addrspace(5) alloca, then a volatile load from element (%idx & 15). The
; assertions expect the index to be consumed directly from an SGPR (s2 in the
; default-triple runs, s0 in the amdpal runs), scalar address arithmetic, and
; scratch_store_dword / scratch_load_dword with an SGPR address.
281define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
282; GFX9-LABEL: store_load_sindex_foo:
283; GFX9:       ; %bb.0: ; %bb
284; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
285; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
286; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
287; GFX9-NEXT:    s_add_u32 s0, 4, s0
288; GFX9-NEXT:    v_mov_b32_e32 v0, 15
289; GFX9-NEXT:    scratch_store_dword off, v0, s0
290; GFX9-NEXT:    s_and_b32 s0, s2, 15
291; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
292; GFX9-NEXT:    s_add_u32 s0, 4, s0
293; GFX9-NEXT:    scratch_load_dword v0, off, s0
294; GFX9-NEXT:    s_endpgm
295;
296; GFX10-LABEL: store_load_sindex_foo:
297; GFX10:       ; %bb.0: ; %bb
298; GFX10-NEXT:    s_add_u32 s0, s0, s3
299; GFX10-NEXT:    s_addc_u32 s1, s1, 0
300; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
301; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
302; GFX10-NEXT:    s_and_b32 s0, s2, 15
303; GFX10-NEXT:    v_mov_b32_e32 v0, 15
304; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
305; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
306; GFX10-NEXT:    s_add_u32 s1, 4, s1
307; GFX10-NEXT:    s_add_u32 s0, 4, s0
308; GFX10-NEXT:    scratch_store_dword off, v0, s1
309; GFX10-NEXT:    scratch_load_dword v0, off, s0
310; GFX10-NEXT:    s_endpgm
311;
312; GFX9-PAL-LABEL: store_load_sindex_foo:
313; GFX9-PAL:       ; %bb.0: ; %bb
314; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
315; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
316; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
317; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
318; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
320; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
321; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
322; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
323; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
324; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
325; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
326; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
327; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
328; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
329; GFX9-PAL-NEXT:    s_endpgm
330;
331; GFX10-PAL-LABEL: store_load_sindex_foo:
332; GFX10-PAL:       ; %bb.0: ; %bb
333; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
334; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
335; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
336; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
338; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
339; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
340; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
341; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
342; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
343; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
344; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
345; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
346; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
347; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
348; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
349; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
350; GFX10-PAL-NEXT:    s_endpgm
351bb:
352  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 has no users in this function.
353  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx.
354  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
355  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
356  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15); the loaded value is unused.
357  %i9 = and i32 %idx, 15
358  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
359  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
360  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
361  ret void
362}
363
; Kernel that indexes a [32 x float] addrspace(5) alloca with the result of
; llvm.amdgcn.workitem.id.x: a volatile store of 15 to element [id], then a
; volatile load from element [31 - id]. The assertions expect the index in v0
; to be scaled with a shift left by 2, the store address to be formed by a
; vector add with the constant 4, and the load address by a vector subtract
; combined with an immediate offset:124 (31 * 4 bytes) on scratch_load_dword.
; Both accesses use a VGPR address with the SGPR operand "off".
364define amdgpu_kernel void @store_load_vindex_kernel() {
365; GFX9-LABEL: store_load_vindex_kernel:
366; GFX9:       ; %bb.0: ; %bb
367; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
368; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
369; GFX9-NEXT:    v_mov_b32_e32 v1, 4
370; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
371; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
372; GFX9-NEXT:    v_mov_b32_e32 v3, 15
373; GFX9-NEXT:    scratch_store_dword v2, v3, off
374; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
375; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
376; GFX9-NEXT:    s_endpgm
377;
378; GFX10-LABEL: store_load_vindex_kernel:
379; GFX10:       ; %bb.0: ; %bb
380; GFX10-NEXT:    s_add_u32 s0, s0, s3
381; GFX10-NEXT:    s_addc_u32 s1, s1, 0
382; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
383; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
384; GFX10-NEXT:    v_mov_b32_e32 v1, 4
385; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
386; GFX10-NEXT:    v_mov_b32_e32 v3, 15
387; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
388; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
389; GFX10-NEXT:    scratch_store_dword v2, v3, off
390; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
391; GFX10-NEXT:    s_endpgm
392;
393; GFX9-PAL-LABEL: store_load_vindex_kernel:
394; GFX9-PAL:       ; %bb.0: ; %bb
395; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
396; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
397; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
398; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
399; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
400; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
401; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
402; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
404; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
405; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
406; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
407; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
408; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
409; GFX9-PAL-NEXT:    s_endpgm
410;
411; GFX10-PAL-LABEL: store_load_vindex_kernel:
412; GFX10-PAL:       ; %bb.0: ; %bb
413; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
414; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
415; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
416; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
418; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
419; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
420; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
421; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
422; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
423; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
424; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
425; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
426; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
427; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
428; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
429; GFX10-PAL-NEXT:    s_endpgm
430bb:
431  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 has no users in this function.
432  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
433  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 has no users in this function.
434  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to element [workitem id].
435  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
436  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
437  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element [31 - workitem id]; the loaded value is unused.
438  %i9 = sub nsw i32 31, %i2
439  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
440  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
441  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
442  ret void
443}
444
; Ordinary (non-kernel) function taking an i32 %idx. It does a volatile store
; of 15 to element %idx of a [32 x float] addrspace(5) alloca, then a volatile
; load from element (%idx & 15). The assertions expect the index to arrive in
; v0, the alloca base to be copied from s32 into a VGPR, each address to be
; formed with v_lshl_add_u32 (index << 2 plus base), and the accesses to be
; scratch_store_dword / scratch_load_dword with a VGPR address and the SGPR
; operand "off". The constant 15 is expected to be shared between the stored
; value and the AND mask.
445define void @store_load_vindex_foo(i32 %idx) {
446; GFX9-LABEL: store_load_vindex_foo:
447; GFX9:       ; %bb.0: ; %bb
448; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449; GFX9-NEXT:    v_mov_b32_e32 v1, s32
450; GFX9-NEXT:    v_mov_b32_e32 v3, 15
451; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
452; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
453; GFX9-NEXT:    scratch_store_dword v2, v3, off
454; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
455; GFX9-NEXT:    scratch_load_dword v0, v0, off
456; GFX9-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-NEXT:    s_setpc_b64 s[30:31]
458;
459; GFX10-LABEL: store_load_vindex_foo:
460; GFX10:       ; %bb.0: ; %bb
461; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
463; GFX10-NEXT:    v_mov_b32_e32 v1, 15
464; GFX10-NEXT:    v_mov_b32_e32 v2, s32
465; GFX10-NEXT:    ; implicit-def: $vcc_hi
466; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
467; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
468; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
469; GFX10-NEXT:    scratch_store_dword v0, v1, off
470; GFX10-NEXT:    scratch_load_dword v0, v2, off
471; GFX10-NEXT:    s_waitcnt vmcnt(0)
472; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
473; GFX10-NEXT:    s_setpc_b64 s[30:31]
474;
475; GFX9-PAL-LABEL: store_load_vindex_foo:
476; GFX9-PAL:       ; %bb.0: ; %bb
477; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
479; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
480; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
481; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
482; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
483; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
484; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
485; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
486; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
487;
488; GFX10-PAL-LABEL: store_load_vindex_foo:
489; GFX10-PAL:       ; %bb.0: ; %bb
490; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
493; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
494; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
495; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
496; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
497; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
498; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
499; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
500; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
501; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
502; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
503bb:
504  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 has no users in this function.
505  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx.
506  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
507  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
508  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15); the loaded value is unused.
509  %i9 = and i32 %idx, 15
510  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
511  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
512  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
513  ret void
514}
515
; Ordinary (non-kernel) function that receives an addrspace(5) float pointer
; and stores 10.0 to the element one float past it. The assertions expect the
; pointer in v0 to be used directly as the VGPR address of a single
; scratch_store_dword, with the one-element GEP folded into an immediate
; offset:4 and the value materialised as the bit pattern 0x41200000.
516define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
517; GFX9-LABEL: private_ptr_foo:
518; GFX9:       ; %bb.0:
519; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
521; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
522; GFX9-NEXT:    s_waitcnt vmcnt(0)
523; GFX9-NEXT:    s_setpc_b64 s[30:31]
524;
525; GFX10-LABEL: private_ptr_foo:
526; GFX10:       ; %bb.0:
527; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
529; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
530; GFX10-NEXT:    ; implicit-def: $vcc_hi
531; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
532; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
533; GFX10-NEXT:    s_setpc_b64 s[30:31]
534;
535; GFX9-PAL-LABEL: private_ptr_foo:
536; GFX9-PAL:       ; %bb.0:
537; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
539; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
540; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
541; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
542;
543; GFX10-PAL-LABEL: private_ptr_foo:
544; GFX10-PAL:       ; %bb.0:
545; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
546; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
547; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
548; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
549; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
550; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
551; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Address of element 1 (one float past %arg).
552  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  ; Plain (non-volatile) store of 10.0.
553  store float 1.000000e+01, float addrspace(5)* %gep, align 4
554  ret void
555}
556
; Kernel variant of the zero-init test with a [64 x i32] "padding" alloca
; (256 bytes) declared ahead of the 64-byte [32 x i16] alloca, so the memset
; target sits at a non-zero frame offset. A volatile load from the padding
; object (at an undef index) keeps it referenced. The assertions expect that
; load to be a scratch_load_dword at offset:4 and the memset to become four
; scratch_store_dwordx4 instructions at offsets 272, 288, 304 and 320. In the
; gfx900 runs the accesses are addressed through vcc_hi, which is loaded with
; 0 before use; in the gfx1030 runs the SGPR operand is "off".
557define amdgpu_kernel void @zero_init_small_offset_kernel() {
558; GFX9-LABEL: zero_init_small_offset_kernel:
559; GFX9:       ; %bb.0:
560; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
561; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
562; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
563; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
564; GFX9-NEXT:    s_mov_b32 s0, 0
565; GFX9-NEXT:    s_mov_b32 s1, s0
566; GFX9-NEXT:    s_mov_b32 s2, s0
567; GFX9-NEXT:    s_mov_b32 s3, s0
568; GFX9-NEXT:    s_waitcnt vmcnt(0)
569; GFX9-NEXT:    v_mov_b32_e32 v0, s0
570; GFX9-NEXT:    v_mov_b32_e32 v1, s1
571; GFX9-NEXT:    v_mov_b32_e32 v2, s2
572; GFX9-NEXT:    v_mov_b32_e32 v3, s3
573; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
574; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
575; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
576; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
577; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
578; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
579; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
580; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
581; GFX9-NEXT:    s_endpgm
582;
583; GFX10-LABEL: zero_init_small_offset_kernel:
584; GFX10:       ; %bb.0:
585; GFX10-NEXT:    s_add_u32 s0, s0, s3
586; GFX10-NEXT:    s_addc_u32 s1, s1, 0
587; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
588; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
589; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
590; GFX10-NEXT:    s_mov_b32 s0, 0
591; GFX10-NEXT:    ; implicit-def: $vcc_hi
592; GFX10-NEXT:    s_mov_b32 s1, s0
593; GFX10-NEXT:    s_mov_b32 s2, s0
594; GFX10-NEXT:    s_mov_b32 s3, s0
595; GFX10-NEXT:    s_waitcnt vmcnt(0)
596; GFX10-NEXT:    v_mov_b32_e32 v0, s0
597; GFX10-NEXT:    v_mov_b32_e32 v1, s1
598; GFX10-NEXT:    v_mov_b32_e32 v2, s2
599; GFX10-NEXT:    v_mov_b32_e32 v3, s3
600; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
601; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
602; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
603; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
604; GFX10-NEXT:    s_endpgm
605;
606; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
607; GFX9-PAL:       ; %bb.0:
608; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
609; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
610; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
611; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
612; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
613; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
615; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
616; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
617; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
618; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
619; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
620; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
621; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
622; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
623; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
624; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
625; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
626; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
627; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
628; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
629; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
630; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
631; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
632; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
633; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
634; GFX9-PAL-NEXT:    s_endpgm
635;
636; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
637; GFX10-PAL:       ; %bb.0:
638; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
639; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
640; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
641; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
642; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
643; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
644; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
645; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
646; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
647; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
648; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
649; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
650; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
651; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
652; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
653; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
654; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
655; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
656; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
657; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
658; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
659; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
660; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
661; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
662; GFX10-PAL-NEXT:    s_endpgm
  ; 64 x i32 = 256 bytes of padding placed ahead of the memset target.
663  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; 32 x i16 = 64 bytes; this is the object that gets zero-filled.
664  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding object at an undef index; result unused.
665  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
666  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
667  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile memset covering the whole 64-byte object with zero.
668  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
669  ret void
670}
671
; Non-kernel function: zero-fills a 64-byte private (addrspace(5)) array with
; llvm.memset after a volatile load from a separate 64 x i32 padding array.
; Every configuration below expects one scratch_load_dword from s32 and then
; four scratch_store_dwordx4 at s32 offsets 256, 272, 288 and 304.
672define void @zero_init_small_offset_foo() {
673; GFX9-LABEL: zero_init_small_offset_foo:
674; GFX9:       ; %bb.0:
675; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676; GFX9-NEXT:    scratch_load_dword v0, off, s32
677; GFX9-NEXT:    s_mov_b32 s0, 0
678; GFX9-NEXT:    s_mov_b32 s1, s0
679; GFX9-NEXT:    s_mov_b32 s2, s0
680; GFX9-NEXT:    s_mov_b32 s3, s0
681; GFX9-NEXT:    s_waitcnt vmcnt(0)
682; GFX9-NEXT:    v_mov_b32_e32 v0, s0
683; GFX9-NEXT:    v_mov_b32_e32 v1, s1
684; GFX9-NEXT:    v_mov_b32_e32 v2, s2
685; GFX9-NEXT:    v_mov_b32_e32 v3, s3
686; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
687; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
688; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
689; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
690; GFX9-NEXT:    s_waitcnt vmcnt(0)
691; GFX9-NEXT:    s_setpc_b64 s[30:31]
692;
693; GFX10-LABEL: zero_init_small_offset_foo:
694; GFX10:       ; %bb.0:
695; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
697; GFX10-NEXT:    scratch_load_dword v0, off, s32
698; GFX10-NEXT:    s_mov_b32 s0, 0
699; GFX10-NEXT:    ; implicit-def: $vcc_hi
700; GFX10-NEXT:    s_mov_b32 s1, s0
701; GFX10-NEXT:    s_mov_b32 s2, s0
702; GFX10-NEXT:    s_mov_b32 s3, s0
703; GFX10-NEXT:    s_waitcnt vmcnt(0)
704; GFX10-NEXT:    v_mov_b32_e32 v0, s0
705; GFX10-NEXT:    v_mov_b32_e32 v1, s1
706; GFX10-NEXT:    v_mov_b32_e32 v2, s2
707; GFX10-NEXT:    v_mov_b32_e32 v3, s3
708; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
709; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
710; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
711; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
712; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
713; GFX10-NEXT:    s_setpc_b64 s[30:31]
714;
715; GFX9-PAL-LABEL: zero_init_small_offset_foo:
716; GFX9-PAL:       ; %bb.0:
717; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32
719; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
720; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
721; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
722; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
723; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
724; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
725; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
726; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
727; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
728; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
729; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
730; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
731; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
732; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
733; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
734;
735; GFX10-PAL-LABEL: zero_init_small_offset_foo:
736; GFX10-PAL:       ; %bb.0:
737; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
739; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32
740; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
741; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
742; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
743; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
744; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
745; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
746; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
747; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
748; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
749; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
750; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
751; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
752; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
753; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
754; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
755; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 padding array declared ahead of the array under test.
  ; NOTE(review): presumably present so that %alloca does not sit at frame
  ; offset 0 - confirm against the test's intent.
756  %padding = alloca [64 x i32], align 4, addrspace(5)
757  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
758  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
759  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
760  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
761  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
762  ret void
763}
764
; Kernel taking a scalar i32 %idx: volatile store of 15 to %i[%idx] followed by
; a volatile load from %i[%idx & 15], where %i is a 32 x float private array
; declared after a 64 x i32 padding array.
; The expected code computes both addresses with scalar instructions
; (s_lshl_b32 / s_and_b32 / s_add_u32 with the constant 0x104) and passes them
; as the SGPR operand of scratch_store_dword / scratch_load_dword.
765define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
766; GFX9-LABEL: store_load_sindex_small_offset_kernel:
767; GFX9:       ; %bb.0: ; %bb
768; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
769; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
770; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
771; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
772; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
773; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
775; GFX9-NEXT:    s_and_b32 s0, s0, 15
776; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
777; GFX9-NEXT:    s_waitcnt vmcnt(0)
778; GFX9-NEXT:    v_mov_b32_e32 v0, 15
779; GFX9-NEXT:    s_add_u32 s1, 0x104, s1
780; GFX9-NEXT:    scratch_store_dword off, v0, s1
781; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
782; GFX9-NEXT:    scratch_load_dword v0, off, s0
783; GFX9-NEXT:    s_endpgm
784;
785; GFX10-LABEL: store_load_sindex_small_offset_kernel:
786; GFX10:       ; %bb.0: ; %bb
787; GFX10-NEXT:    s_add_u32 s2, s2, s5
788; GFX10-NEXT:    s_addc_u32 s3, s3, 0
789; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
790; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
791; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
792; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
793; GFX10-NEXT:    s_waitcnt vmcnt(0)
794; GFX10-NEXT:    v_mov_b32_e32 v0, 15
795; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX10-NEXT:    s_and_b32 s1, s0, 15
797; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
798; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
799; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
800; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
801; GFX10-NEXT:    scratch_store_dword off, v0, s0
802; GFX10-NEXT:    scratch_load_dword v0, off, s1
803; GFX10-NEXT:    s_endpgm
804;
805; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
806; GFX9-PAL:       ; %bb.0: ; %bb
807; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
808; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
809; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
810; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
811; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
812; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
813; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
814; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
815; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
816; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
817; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
818; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
819; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
820; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
822; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
823; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
824; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
825; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
826; GFX9-PAL-NEXT:    s_endpgm
827;
828; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
829; GFX10-PAL:       ; %bb.0: ; %bb
830; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
831; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
832; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
833; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
835; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
836; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
837; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
838; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
839; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
840; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
841; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
842; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
843; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
845; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
846; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
847; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
848; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
849; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
850; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
851; GFX10-PAL-NEXT:    s_endpgm
852bb:
  ; 64 x i32 padding array declared ahead of %i.
  ; NOTE(review): presumably present to give %i a non-zero scratch offset -
  ; confirm against the test's intent.
853  %padding = alloca [64 x i32], align 4, addrspace(5)
854  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
855  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
856  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
857  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i.
858  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
859  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
860  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the result is unused.
861  %i9 = and i32 %idx, 15
862  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
863  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
864  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
865  ret void
866}
867
; amdgpu_ps entry point taking %idx as an inreg i32: volatile store of 15 to
; %i[%idx] followed by a volatile load from %i[%idx & 15], where %i is a
; 32 x float private array declared after a 64 x i32 padding array.
; The expected code reads the index straight from an SGPR (s2 in the non-PAL
; configurations, s0 in the PAL ones), adds the constant 0x104 with s_add_u32,
; and uses the result as the SGPR address of the scratch store and load.
868define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
869; GFX9-LABEL: store_load_sindex_small_offset_foo:
870; GFX9:       ; %bb.0: ; %bb
871; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
872; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
873; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
874; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
875; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
876; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
877; GFX9-NEXT:    s_waitcnt vmcnt(0)
878; GFX9-NEXT:    v_mov_b32_e32 v0, 15
879; GFX9-NEXT:    scratch_store_dword off, v0, s0
880; GFX9-NEXT:    s_and_b32 s0, s2, 15
881; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
882; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
883; GFX9-NEXT:    scratch_load_dword v0, off, s0
884; GFX9-NEXT:    s_endpgm
885;
886; GFX10-LABEL: store_load_sindex_small_offset_foo:
887; GFX10:       ; %bb.0: ; %bb
888; GFX10-NEXT:    s_add_u32 s0, s0, s3
889; GFX10-NEXT:    s_addc_u32 s1, s1, 0
890; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
891; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
892; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
893; GFX10-NEXT:    s_and_b32 s0, s2, 15
894; GFX10-NEXT:    s_waitcnt vmcnt(0)
895; GFX10-NEXT:    v_mov_b32_e32 v0, 15
896; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
897; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
898; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
899; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
900; GFX10-NEXT:    scratch_store_dword off, v0, s1
901; GFX10-NEXT:    scratch_load_dword v0, off, s0
902; GFX10-NEXT:    s_endpgm
903;
904; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
905; GFX9-PAL:       ; %bb.0: ; %bb
906; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
907; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
908; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
909; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
910; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
911; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
912; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
913; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
914; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
915; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
916; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
917; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
918; GFX9-PAL-NEXT:    s_add_u32 s1, 0x104, s1
919; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
920; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
921; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
922; GFX9-PAL-NEXT:    s_add_u32 s0, 0x104, s0
923; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
924; GFX9-PAL-NEXT:    s_endpgm
925;
926; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
927; GFX10-PAL:       ; %bb.0: ; %bb
928; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
929; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
930; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
931; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
933; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
934; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
935; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
936; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
937; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
938; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
939; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
940; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
941; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
942; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
943; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
944; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
945; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
946; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
947; GFX10-PAL-NEXT:    s_endpgm
948bb:
  ; 64 x i32 padding array declared ahead of %i.
  ; NOTE(review): presumably present to give %i a non-zero scratch offset -
  ; confirm against the test's intent.
949  %padding = alloca [64 x i32], align 4, addrspace(5)
950  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
951  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
952  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
953  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i.
954  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
955  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
956  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the result is unused.
957  %i9 = and i32 %idx, 15
958  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
959  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
960  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
961  ret void
962}
963
; Kernel indexing a 32 x float private array by llvm.amdgcn.workitem.id.x:
; volatile store of 15 to %i[id] followed by a volatile load from %i[31 - id].
; %i is declared after a 64 x i32 padding array.
; The expected code forms both addresses in VGPRs from the constant 0x104
; (v_add for the store, v_sub for the load) and encodes the remaining
; 31 * 4 = 124 bytes of the load address as an immediate "offset:124".
964define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
965; GFX9-LABEL: store_load_vindex_small_offset_kernel:
966; GFX9:       ; %bb.0: ; %bb
967; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
968; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
969; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
970; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
971; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
972; GFX9-NEXT:    s_waitcnt vmcnt(0)
973; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
974; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
975; GFX9-NEXT:    v_mov_b32_e32 v3, 15
976; GFX9-NEXT:    scratch_store_dword v2, v3, off
977; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
978; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
979; GFX9-NEXT:    s_endpgm
980;
981; GFX10-LABEL: store_load_vindex_small_offset_kernel:
982; GFX10:       ; %bb.0: ; %bb
983; GFX10-NEXT:    s_add_u32 s0, s0, s3
984; GFX10-NEXT:    s_addc_u32 s1, s1, 0
985; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
986; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
987; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
988; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
989; GFX10-NEXT:    v_mov_b32_e32 v3, 15
990; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
991; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
992; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
993; GFX10-NEXT:    scratch_store_dword v2, v3, off
994; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
995; GFX10-NEXT:    s_endpgm
996;
997; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
998; GFX9-PAL:       ; %bb.0: ; %bb
999; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1000; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1001; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1002; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1003; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1004; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1005; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1007; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1008; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1009; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
1010; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1011; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1012; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1013; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1014; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1015; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1016; GFX9-PAL-NEXT:    s_endpgm
1017;
1018; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
1019; GFX10-PAL:       ; %bb.0: ; %bb
1020; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1021; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1022; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1023; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1025; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1026; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1027; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1028; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1029; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
1030; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1031; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1032; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1033; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1034; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4
1035; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1036; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1037; GFX10-PAL-NEXT:    s_endpgm
1038bb:
  ; 64 x i32 padding array declared ahead of %i.
  ; NOTE(review): presumably present to give %i a non-zero scratch offset -
  ; confirm against the test's intent.
1039  %padding = alloca [64 x i32], align 4, addrspace(5)
1040  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
1041  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1042  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1043  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Per-lane index; %i3 (its zero-extension) has no users in this function.
1044  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1045  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to element %i2 of %i.
1046  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1047  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1048  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (31 - %i2) of %i; the result is unused.
1049  %i9 = sub nsw i32 31, %i2
1050  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1051  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1052  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1053  ret void
1054}
1055
; Non-kernel function taking i32 %idx: volatile store of 15 to %i[%idx]
; followed by a volatile load from %i[%idx & 15], where %i is a 32 x float
; private array declared after a 64 x i32 padding array.
; The expected code materialises s32 + 0x100 in a scalar register (vcc_hi on
; the gfx900 runs, vcc_lo on the gfx1030 runs), copies it to a VGPR, and forms
; each element address with v_lshl_add_u32 before the scratch store / load.
1056define void @store_load_vindex_small_offset_foo(i32 %idx) {
1057; GFX9-LABEL: store_load_vindex_small_offset_foo:
1058; GFX9:       ; %bb.0: ; %bb
1059; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX9-NEXT:    scratch_load_dword v1, off, s32
1061; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1062; GFX9-NEXT:    s_waitcnt vmcnt(0)
1063; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1064; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1065; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1066; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1067; GFX9-NEXT:    scratch_store_dword v2, v3, off
1068; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1069; GFX9-NEXT:    scratch_load_dword v0, v0, off
1070; GFX9-NEXT:    s_waitcnt vmcnt(0)
1071; GFX9-NEXT:    s_setpc_b64 s[30:31]
1072;
1073; GFX10-LABEL: store_load_vindex_small_offset_foo:
1074; GFX10:       ; %bb.0: ; %bb
1075; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1077; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1078; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1079; GFX10-NEXT:    ; implicit-def: $vcc_hi
1080; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1081; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1082; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1083; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1084; GFX10-NEXT:    scratch_load_dword v3, off, s32
1085; GFX10-NEXT:    scratch_store_dword v0, v1, off
1086; GFX10-NEXT:    scratch_load_dword v0, v2, off
1087; GFX10-NEXT:    s_waitcnt vmcnt(0)
1088; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1089; GFX10-NEXT:    s_setpc_b64 s[30:31]
1090;
1091; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1092; GFX9-PAL:       ; %bb.0: ; %bb
1093; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1094; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32
1095; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x100
1096; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1098; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1099; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1100; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1101; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1102; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1103; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
1104; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1105; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1106;
1107; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1108; GFX10-PAL:       ; %bb.0: ; %bb
1109; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1110; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1111; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1112; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x100
1113; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
1114; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1115; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1116; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1117; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1118; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32
1119; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1120; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
1121; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1122; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1123; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1124bb:
  ; 64 x i32 padding array declared ahead of %i.
  ; NOTE(review): presumably present to give %i a non-zero frame offset -
  ; confirm against the test's intent.
1125  %padding = alloca [64 x i32], align 4, addrspace(5)
1126  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
1127  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1128  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no users in this function.
1129  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i.
1130  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1131  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1132  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the result is unused.
1133  %i9 = and i32 %idx, 15
1134  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1135  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1136  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1137  ret void
1138}
1139
; Kernel that zero-fills a 64-byte private (addrspace(5)) array with
; llvm.memset after a volatile load from a separate 4096 x i32 padding array.
; The expected code loads the constant 0x4010 into a scalar register with
; s_movk_i32 (vcc_hi on the gfx900 runs, vcc_lo on the gfx1030 runs) once per
; store, and issues four scratch_store_dwordx4 through that register with
; immediate offsets 0, 16, 32 and 48.
1140define amdgpu_kernel void @zero_init_large_offset_kernel() {
1141; GFX9-LABEL: zero_init_large_offset_kernel:
1142; GFX9:       ; %bb.0:
1143; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1144; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1145; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1146; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1147; GFX9-NEXT:    s_mov_b32 s0, 0
1148; GFX9-NEXT:    s_mov_b32 s1, s0
1149; GFX9-NEXT:    s_mov_b32 s2, s0
1150; GFX9-NEXT:    s_mov_b32 s3, s0
1151; GFX9-NEXT:    s_waitcnt vmcnt(0)
1152; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1153; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1154; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1155; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1156; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1157; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1158; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1159; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1160; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1161; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1162; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1163; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1164; GFX9-NEXT:    s_endpgm
1165;
1166; GFX10-LABEL: zero_init_large_offset_kernel:
1167; GFX10:       ; %bb.0:
1168; GFX10-NEXT:    s_add_u32 s0, s0, s3
1169; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1170; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1171; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1172; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1173; GFX10-NEXT:    s_mov_b32 s0, 0
1174; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1175; GFX10-NEXT:    s_mov_b32 s1, s0
1176; GFX10-NEXT:    s_mov_b32 s2, s0
1177; GFX10-NEXT:    s_mov_b32 s3, s0
1178; GFX10-NEXT:    s_waitcnt vmcnt(0)
1179; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1180; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1181; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1182; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1183; GFX10-NEXT:    ; implicit-def: $vcc_hi
1184; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1185; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1186; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1187; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1188; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1189; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1190; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1191; GFX10-NEXT:    s_endpgm
1192;
1193; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1194; GFX9-PAL:       ; %bb.0:
1195; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1196; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1197; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1198; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1199; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1200; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1202; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1203; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1204; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1205; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1206; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1207; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1208; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1209; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1210; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1211; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1212; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1213; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1214; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1215; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1216; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1217; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1218; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1219; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1220; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1221; GFX9-PAL-NEXT:    s_endpgm
1222;
1223; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
1224; GFX10-PAL:       ; %bb.0:
1225; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1226; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1227; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1228; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1230; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1231; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1232; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1233; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1234; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1235; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1236; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1237; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1238; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1239; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1240; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1241; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1242; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1243; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1244; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1245; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
1246; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1247; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1248; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1249; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1250; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1251; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1252; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1253; GFX10-PAL-NEXT:    s_endpgm
  ; 4096 x i32 padding array declared ahead of the array under test.
  ; NOTE(review): presumably sized so that %alloca's offset no longer fits the
  ; scratch instruction's immediate offset field - confirm against the ISA
  ; limits for each target.
1254  %padding = alloca [4096 x i32], align 4, addrspace(5)
1255  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
1256  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1257  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
1258  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1259  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1260  ret void
1261}
1262
; Non-kernel function that zero-fills a 64-byte private (addrspace(5)) array
; with llvm.memset after a volatile load from a separate 4096 x i32 padding
; array.
; The expected code computes s32 + 0x4000 with s_add_u32 into a scalar register
; (vcc_hi on the gfx900 runs, vcc_lo on the gfx1030 runs) once per store, and
; issues four scratch_store_dwordx4 through that register with immediate
; offsets 0, 16, 32 and 48.
1263define void @zero_init_large_offset_foo() {
1264; GFX9-LABEL: zero_init_large_offset_foo:
1265; GFX9:       ; %bb.0:
1266; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1267; GFX9-NEXT:    scratch_load_dword v0, off, s32
1268; GFX9-NEXT:    s_mov_b32 s0, 0
1269; GFX9-NEXT:    s_mov_b32 s1, s0
1270; GFX9-NEXT:    s_mov_b32 s2, s0
1271; GFX9-NEXT:    s_mov_b32 s3, s0
1272; GFX9-NEXT:    s_waitcnt vmcnt(0)
1273; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1274; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1275; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1276; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1277; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1278; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1279; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1280; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1281; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1282; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1283; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1284; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1285; GFX9-NEXT:    s_waitcnt vmcnt(0)
1286; GFX9-NEXT:    s_setpc_b64 s[30:31]
1287;
1288; GFX10-LABEL: zero_init_large_offset_foo:
1289; GFX10:       ; %bb.0:
1290; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1291; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1292; GFX10-NEXT:    scratch_load_dword v0, off, s32
1293; GFX10-NEXT:    s_mov_b32 s0, 0
1294; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1295; GFX10-NEXT:    s_mov_b32 s1, s0
1296; GFX10-NEXT:    s_mov_b32 s2, s0
1297; GFX10-NEXT:    s_mov_b32 s3, s0
1298; GFX10-NEXT:    s_waitcnt vmcnt(0)
1299; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1300; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1301; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1302; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1303; GFX10-NEXT:    ; implicit-def: $vcc_hi
1304; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1305; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1306; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1307; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1308; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1309; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1310; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1311; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1312; GFX10-NEXT:    s_setpc_b64 s[30:31]
1313;
1314; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1315; GFX9-PAL:       ; %bb.0:
1316; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1317; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32
1318; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1319; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1320; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1321; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1322; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1323; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1324; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1325; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1326; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1327; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1328; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1329; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1330; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1331; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1332; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1333; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1334; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1335; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1336; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1337;
1338; GFX10-PAL-LABEL: zero_init_large_offset_foo:
1339; GFX10-PAL:       ; %bb.0:
1340; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1341; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1342; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32
1343; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1344; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1345; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1346; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1347; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1348; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1349; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1350; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1351; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1352; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1353; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
1354; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1355; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1356; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1357; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1358; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1359; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1360; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1361; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1362; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 padding array declared ahead of the array under test.
  ; NOTE(review): presumably sized so that %alloca's offset no longer fits the
  ; scratch instruction's immediate offset field - confirm against the ISA
  ; limits for each target.
1363  %padding = alloca [4096 x i32], align 4, addrspace(5)
1364  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding array; the element index is undef.
1365  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1366  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (non-volatile memset).
1367  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1368  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1369  ret void
1370}
1371
; Kernel test - store to and load from a [32 x float] addrspace(5) array using
; an index derived from the i32 kernel argument %idx. The array is declared
; after a [4096 x i32] padding alloca (4096 * 4 = 0x4000 bytes).
; The expected assembly for every run line forms both addresses in SGPRs as
; s_add_u32 of 0x4004 and the index shifted left by 2; the padding load itself
; is expected at immediate offset 4.
1372define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1373; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1374; GFX9:       ; %bb.0: ; %bb
1375; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1376; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1377; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1378; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1379; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1382; GFX9-NEXT:    s_and_b32 s0, s0, 15
1383; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1384; GFX9-NEXT:    s_waitcnt vmcnt(0)
1385; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1386; GFX9-NEXT:    s_add_u32 s1, 0x4004, s1
1387; GFX9-NEXT:    scratch_store_dword off, v0, s1
1388; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1389; GFX9-NEXT:    scratch_load_dword v0, off, s0
1390; GFX9-NEXT:    s_endpgm
1391;
1392; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1393; GFX10:       ; %bb.0: ; %bb
1394; GFX10-NEXT:    s_add_u32 s2, s2, s5
1395; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1396; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1397; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1398; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1399; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1400; GFX10-NEXT:    s_waitcnt vmcnt(0)
1401; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1402; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX10-NEXT:    s_and_b32 s1, s0, 15
1404; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1405; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1406; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1407; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1408; GFX10-NEXT:    scratch_store_dword off, v0, s0
1409; GFX10-NEXT:    scratch_load_dword v0, off, s1
1410; GFX10-NEXT:    s_endpgm
1411;
1412; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1413; GFX9-PAL:       ; %bb.0: ; %bb
1414; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1415; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1416; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1417; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1418; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1419; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1420; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1421; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1422; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1423; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1424; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1425; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1426; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1427; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1428; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1429; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1430; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1431; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1432; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
1433; GFX9-PAL-NEXT:    s_endpgm
1434;
1435; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
1436; GFX10-PAL:       ; %bb.0: ; %bb
1437; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
1438; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
1439; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1440; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1442; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
1443; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
1444; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1445; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1446; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1447; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1448; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1449; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1450; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1452; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1453; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1454; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1455; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1456; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1457; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
1458; GFX10-PAL-NEXT:    s_endpgm
1459bb:
1460  %padding = alloca [4096 x i32], align 4, addrspace(5)
1461  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably this
  ; keeps the padding alloca from being dropped - TODO confirm.
1462  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1463  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no later use in this function.
1464  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
1465  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1466  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1467  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1468  %i9 = and i32 %idx, 15
1469  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1470  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1471  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1472  ret void
1473}
1474
; amdgpu_ps variant of the scalar-index test - the IR body is identical to
; @store_load_sindex_large_offset_kernel, but the index arrives as an
; `i32 inreg` argument instead of a kernel argument.
; The expected assembly shifts and masks the index with scalar instructions and
; adds 0x4004 (0x4000 bytes of padding + 4) with s_add_u32 before the store and
; the second load; the padding load is expected at immediate offset 4.
1475define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
1476; GFX9-LABEL: store_load_sindex_large_offset_foo:
1477; GFX9:       ; %bb.0: ; %bb
1478; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1479; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1480; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1481; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1482; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1483; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1484; GFX9-NEXT:    s_waitcnt vmcnt(0)
1485; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1486; GFX9-NEXT:    scratch_store_dword off, v0, s0
1487; GFX9-NEXT:    s_and_b32 s0, s2, 15
1488; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1489; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
1490; GFX9-NEXT:    scratch_load_dword v0, off, s0
1491; GFX9-NEXT:    s_endpgm
1492;
1493; GFX10-LABEL: store_load_sindex_large_offset_foo:
1494; GFX10:       ; %bb.0: ; %bb
1495; GFX10-NEXT:    s_add_u32 s0, s0, s3
1496; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1497; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1498; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1499; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
1500; GFX10-NEXT:    s_and_b32 s0, s2, 15
1501; GFX10-NEXT:    s_waitcnt vmcnt(0)
1502; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1503; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1504; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1505; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
1506; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
1507; GFX10-NEXT:    scratch_store_dword off, v0, s1
1508; GFX10-NEXT:    scratch_load_dword v0, off, s0
1509; GFX10-NEXT:    s_endpgm
1510;
1511; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
1512; GFX9-PAL:       ; %bb.0: ; %bb
1513; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1514; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1515; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1516; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1517; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1518; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1519; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1520; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1521; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1522; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1523; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
1524; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1525; GFX9-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1526; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1527; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1528; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1529; GFX9-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1530; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
1531; GFX9-PAL-NEXT:    s_endpgm
1532;
1533; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
1534; GFX10-PAL:       ; %bb.0: ; %bb
1535; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1536; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1537; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1538; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1539; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1540; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1541; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1542; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1543; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1544; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
1545; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
1546; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1547; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
1548; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1549; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1550; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
1551; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
1552; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
1553; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
1554; GFX10-PAL-NEXT:    s_endpgm
1555bb:
1556  %padding = alloca [4096 x i32], align 4, addrspace(5)
1557  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably this
  ; keeps the padding alloca from being dropped - TODO confirm.
1558  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1559  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no later use in this function.
1560  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
1561  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1562  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1563  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1564  %i9 = and i32 %idx, 15
1565  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1566  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1567  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1568  ret void
1569}
1570
; Kernel test - store to and load from a [32 x float] addrspace(5) array using
; an index derived from llvm.amdgcn.workitem.id.x. The array is declared after
; a [4096 x i32] padding alloca (4096 * 4 = 0x4000 bytes).
; The expected assembly materialises 0x4004 in a VGPR, adds the scaled id for
; the store address, and subtracts the scaled id for the load address with the
; constant part of (31 - id), 31 * 4 = 124, appearing as the load's immediate
; offset.
1571define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
1572; GFX9-LABEL: store_load_vindex_large_offset_kernel:
1573; GFX9:       ; %bb.0: ; %bb
1574; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1575; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1576; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1577; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
1578; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1579; GFX9-NEXT:    s_waitcnt vmcnt(0)
1580; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
1581; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
1582; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1583; GFX9-NEXT:    scratch_store_dword v2, v3, off
1584; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
1585; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
1586; GFX9-NEXT:    s_endpgm
1587;
1588; GFX10-LABEL: store_load_vindex_large_offset_kernel:
1589; GFX10:       ; %bb.0: ; %bb
1590; GFX10-NEXT:    s_add_u32 s0, s0, s3
1591; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1592; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1593; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1594; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
1595; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1596; GFX10-NEXT:    v_mov_b32_e32 v3, 15
1597; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1598; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1599; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
1600; GFX10-NEXT:    scratch_store_dword v2, v3, off
1601; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
1602; GFX10-NEXT:    s_endpgm
1603;
1604; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
1605; GFX9-PAL:       ; %bb.0: ; %bb
1606; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1607; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1608; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1609; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1610; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1611; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1612; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1613; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1614; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1615; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1616; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
1617; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1618; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1619; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
1620; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1621; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
1622; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1623; GFX9-PAL-NEXT:    s_endpgm
1624;
1625; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
1626; GFX10-PAL:       ; %bb.0: ; %bb
1627; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1628; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1629; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1630; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1631; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1632; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1633; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1634; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1635; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1636; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
1637; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1638; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
1639; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
1640; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
1641; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4
1642; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
1643; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
1644; GFX10-PAL-NEXT:    s_endpgm
1645bb:
1646  %padding = alloca [4096 x i32], align 4, addrspace(5)
1647  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably this
  ; keeps the padding alloca from being dropped - TODO confirm.
1648  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1649  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 and %i3 have no later use in this function.
1650  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1651  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1652  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to element %i2 (the workitem id) of %i.
1653  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1654  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1655  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (31 - %i2) of %i; the loaded value is unused.
1656  %i9 = sub nsw i32 31, %i2
1657  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1658  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1659  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1660  ret void
1661}
1662
; Non-kernel function test - store to and load from a [32 x float] addrspace(5)
; array indexed by the i32 argument %idx, with the array declared after a
; [4096 x i32] padding alloca (4096 * 4 = 0x4000 bytes).
; The expected assembly computes the array base as s32 + 0x4000 with s_add_u32
; (into vcc_hi for gfx900, vcc_lo for gfx1030), copies it to a VGPR and combines
; it with the index via v_lshl_add_u32; the padding load is expected directly
; at s32.
1663define void @store_load_vindex_large_offset_foo(i32 %idx) {
1664; GFX9-LABEL: store_load_vindex_large_offset_foo:
1665; GFX9:       ; %bb.0: ; %bb
1666; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1667; GFX9-NEXT:    scratch_load_dword v1, off, s32
1668; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1669; GFX9-NEXT:    s_waitcnt vmcnt(0)
1670; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1671; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1672; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1673; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1674; GFX9-NEXT:    scratch_store_dword v2, v3, off
1675; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1676; GFX9-NEXT:    scratch_load_dword v0, v0, off
1677; GFX9-NEXT:    s_waitcnt vmcnt(0)
1678; GFX9-NEXT:    s_setpc_b64 s[30:31]
1679;
1680; GFX10-LABEL: store_load_vindex_large_offset_foo:
1681; GFX10:       ; %bb.0: ; %bb
1682; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1683; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1684; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1685; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1686; GFX10-NEXT:    ; implicit-def: $vcc_hi
1687; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1688; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1689; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1690; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1691; GFX10-NEXT:    scratch_load_dword v3, off, s32
1692; GFX10-NEXT:    scratch_store_dword v0, v1, off
1693; GFX10-NEXT:    scratch_load_dword v0, v2, off
1694; GFX10-NEXT:    s_waitcnt vmcnt(0)
1695; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1696; GFX10-NEXT:    s_setpc_b64 s[30:31]
1697;
1698; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
1699; GFX9-PAL:       ; %bb.0: ; %bb
1700; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1701; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32
1702; GFX9-PAL-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1703; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1704; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1705; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1706; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1707; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
1708; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1709; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1710; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
1711; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1712; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1713;
1714; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
1715; GFX10-PAL:       ; %bb.0: ; %bb
1716; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1717; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1718; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1719; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1720; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
1721; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, vcc_lo
1722; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
1723; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1724; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1725; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32
1726; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
1727; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
1728; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1729; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1730; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1731bb:
1732  %padding = alloca [4096 x i32], align 4, addrspace(5)
1733  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding array at an undef index; presumably this
  ; keeps the padding alloca from being dropped - TODO confirm.
1734  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1735  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; %i1 has no later use in this function.
1736  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
1737  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1738  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1739  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1740  %i9 = and i32 %idx, 15
1741  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1742  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1743  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1744  ret void
1745}
1746
; Kernel test - store to and load from a constant element (index 4000, byte
; offset 4000 * 4 = 16000) of a [4096 x i32] addrspace(5) array.
; The expected assembly splits that byte offset between an SGPR and the
; instruction's immediate offset - 0x3000 + 3712 for gfx900 and 0x3800 + 1664
; for gfx1030 (both sum to 16000) - and additionally adds 4 to the SGPR part.
; NOTE(review): the different split presumably reflects different maximum
; immediate offsets on the two targets - confirm against the ISA documents.
1747define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
1748; GFX9-LABEL: store_load_large_imm_offset_kernel:
1749; GFX9:       ; %bb.0: ; %bb
1750; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1751; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1752; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1753; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1754; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1755; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1756; GFX9-NEXT:    s_add_u32 s0, 4, s0
1757; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1758; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1759; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1760; GFX9-NEXT:    s_endpgm
1761;
1762; GFX10-LABEL: store_load_large_imm_offset_kernel:
1763; GFX10:       ; %bb.0: ; %bb
1764; GFX10-NEXT:    s_add_u32 s0, s0, s3
1765; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1766; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1767; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1768; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1769; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1770; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1771; GFX10-NEXT:    s_add_u32 s0, 4, s0
1772; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
1773; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1774; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1775; GFX10-NEXT:    s_endpgm
1776;
1777; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
1778; GFX9-PAL:       ; %bb.0: ; %bb
1779; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1780; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1781; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1782; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1783; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1784; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1785; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1786; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1787; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1788; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1789; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1790; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
1791; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1792; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1793; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1794; GFX9-PAL-NEXT:    s_endpgm
1795;
1796; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
1797; GFX10-PAL:       ; %bb.0: ; %bb
1798; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
1799; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1800; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1801; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1802; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1803; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
1804; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
1805; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1806; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1807; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1808; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1809; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1810; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
1811; GFX10-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
1812; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1813; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1814; GFX10-PAL-NEXT:    s_endpgm
1815bb:
1816  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 to an undef element; the expected assembly places it
  ; at immediate offset 4.
1817  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1818  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000.
1819  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1820  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from element 4000; the loaded value is unused.
1821  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1822  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1823  ret void
1824}
1825
; Non-kernel function test - store to and load from a constant element (index
; 4000, byte offset 4000 * 4 = 16000) of a [4096 x i32] addrspace(5) array.
; The expected assembly adds an SGPR constant to s32 and puts the remainder in
; the immediate offset - 0x3000 + 3712 for gfx900 and 0x3800 + 1664 for gfx1030
; (both sum to 16000). The store to the undef element is expected directly at
; s32.
; NOTE(review): the different split presumably reflects different maximum
; immediate offsets on the two targets - confirm against the ISA documents.
1826define void @store_load_large_imm_offset_foo() {
1827; GFX9-LABEL: store_load_large_imm_offset_foo:
1828; GFX9:       ; %bb.0: ; %bb
1829; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1830; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1831; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1832; GFX9-NEXT:    scratch_store_dword off, v0, s32
1833; GFX9-NEXT:    s_add_u32 s0, s32, s0
1834; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1835; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1836; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1837; GFX9-NEXT:    s_waitcnt vmcnt(0)
1838; GFX9-NEXT:    s_setpc_b64 s[30:31]
1839;
1840; GFX10-LABEL: store_load_large_imm_offset_foo:
1841; GFX10:       ; %bb.0: ; %bb
1842; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1843; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1844; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1845; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1846; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1847; GFX10-NEXT:    ; implicit-def: $vcc_hi
1848; GFX10-NEXT:    s_add_u32 s0, s32, s0
1849; GFX10-NEXT:    scratch_store_dword off, v0, s32
1850; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1851; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1852; GFX10-NEXT:    s_waitcnt vmcnt(0)
1853; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1854; GFX10-NEXT:    s_setpc_b64 s[30:31]
1855;
1856; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
1857; GFX9-PAL:       ; %bb.0: ; %bb
1858; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1859; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
1860; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
1861; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
1862; GFX9-PAL-NEXT:    s_add_u32 s0, s32, s0
1863; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1864; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1865; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1866; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1867; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1868;
1869; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
1870; GFX10-PAL:       ; %bb.0: ; %bb
1871; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1872; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1873; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
1874; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1875; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
1876; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
1877; GFX10-PAL-NEXT:    s_add_u32 s0, s32, s0
1878; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
1879; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1880; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1881; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1882; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1883; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1884bb:
1885  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 to an undef element of the array.
1886  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1887  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000.
1888  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1889  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from element 4000; the loaded value is unused.
1890  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1891  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1892  ret void
1893}
1894
; Kernel test - address built from a scalar kernel argument (%sidx), the
; workitem id and a constant element offset of 256.
; The expected assembly adds the scalar and vector parts with a VALU add,
; scales by 4 and adds the base 4 with v_lshl_add_u32, and carries the constant
; 256 * 4 = 1024 as the immediate offset of both the store and the load.
1895define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
1896; GFX9-LABEL: store_load_vidx_sidx_offset:
1897; GFX9:       ; %bb.0: ; %bb
1898; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1899; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1900; GFX9-NEXT:    v_mov_b32_e32 v1, 4
1901; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1903; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
1904; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1905; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1906; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
1907; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024
1908; GFX9-NEXT:    s_endpgm
1909;
1910; GFX10-LABEL: store_load_vidx_sidx_offset:
1911; GFX10:       ; %bb.0: ; %bb
1912; GFX10-NEXT:    s_add_u32 s2, s2, s5
1913; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1914; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1915; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1916; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1917; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1918; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1920; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
1921; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
1922; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024
1923; GFX10-NEXT:    s_endpgm
1924;
1925; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
1926; GFX9-PAL:       ; %bb.0: ; %bb
1927; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1928; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1929; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1930; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1931; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
1932; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1933; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1934; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1935; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
1936; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1937; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1938; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
1939; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
1940; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024
1941; GFX9-PAL-NEXT:    s_endpgm
1942;
1943; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
1944; GFX10-PAL:       ; %bb.0: ; %bb
1945; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
1946; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
1947; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1948; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1950; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
1951; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
1952; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1953; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1954; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
1955; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
1956; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1958; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
1959; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
1960; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024
1961; GFX10-PAL-NEXT:    s_endpgm
1962bb:
1963  %alloca = alloca [32 x i32], align 4, addrspace(5)
1964  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Element index = %sidx + %vidx + 256.
1965  %add1 = add nsw i32 %sidx, %vidx
1966  %add2 = add nsw i32 %add1, 256
1967  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  ; Volatile store of 15 followed by a volatile load through the same pointer;
  ; the loaded value is unused.
1968  store volatile i32 15, i32 addrspace(5)* %gep, align 4
1969  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
1970  ret void
1971}
1972
; Non-kernel function test - 8-byte-aligned volatile i64 store and load through
; the addrspace(5) pointer argument %arg.
; The expected assembly for every run line is a single scratch_store_dwordx2
; of the pair {15, 0} followed by a single scratch_load_dwordx2, both addressed
; by v0.
1973define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
1974; GFX9-LABEL: store_load_i64_aligned:
1975; GFX9:       ; %bb.0: ; %bb
1976; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1978; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1979; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1980; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1981; GFX9-NEXT:    s_waitcnt vmcnt(0)
1982; GFX9-NEXT:    s_setpc_b64 s[30:31]
1983;
1984; GFX10-LABEL: store_load_i64_aligned:
1985; GFX10:       ; %bb.0: ; %bb
1986; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1987; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1988; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1989; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1990; GFX10-NEXT:    ; implicit-def: $vcc_hi
1991; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
1992; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
1993; GFX10-NEXT:    s_waitcnt vmcnt(0)
1994; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1995; GFX10-NEXT:    s_setpc_b64 s[30:31]
1996;
1997; GFX9-PAL-LABEL: store_load_i64_aligned:
1998; GFX9-PAL:       ; %bb.0: ; %bb
1999; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2000; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
2001; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
2002; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2003; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2004; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2005; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2006;
2007; GFX10-PAL-LABEL: store_load_i64_aligned:
2008; GFX10-PAL:       ; %bb.0: ; %bb
2009; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2010; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2011; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
2012; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
2013; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
2014; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
2015; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
2016; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2017; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2018; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2019bb:
  ; Both accesses are volatile and declare align 8; the loaded value is unused.
2020  store volatile i64 15, i64 addrspace(5)* %arg, align 8
2021  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
2022  ret void
2023}
2024
; Same volatile i64 store/load pair as the align-8 variant, but with align 1 on
; both accesses. The assertions below still expect a single
; scratch_store_dwordx2 and a single scratch_load_dwordx2 on every checked
; target, i.e. the under-aligned access is not broken into smaller pieces.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

; Volatile <3 x i32> store of <1, 2, 3> followed by a volatile <3 x i32> load
; through the same addrspace(5) pointer argument, both with align 1. The
; assertions below expect one scratch_store_dwordx3 and one
; scratch_load_dwordx3 on every checked target.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile <4 x i32> store of <1, 2, 3, 4> followed by a volatile <4 x i32>
; load through the same addrspace(5) pointer argument, both with align 1. The
; assertions below expect one scratch_store_dwordx4 and one
; scratch_load_dwordx4 on every checked target.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Intrinsic declarations. Neither is referenced by the store_load_* functions
; directly above; presumably they are used by tests earlier in this file
; (TODO confirm before removing either one).
declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
