1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4
; Kernel entry point that zero-fills a 64-byte [32 x i16] array in addrspace(5)
; with llvm.memset. The expected output for both targets is sixteen
; scratch_store_dword instructions covering offsets 76 down to 16. The gfx900
; sequence re-issues "s_mov_b32 vcc_hi, 0" before every store and uses vcc_hi as
; the scalar address operand; the gfx1030 sequence uses "off" instead.
5define amdgpu_kernel void @zero_init_kernel() {
6; GFX9-LABEL: zero_init_kernel:
7; GFX9:       ; %bb.0:
8; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
9; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
10; GFX9-NEXT:    v_mov_b32_e32 v0, 0
11; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
12; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:76
13; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
14; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:72
15; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
16; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:68
17; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
18; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:64
19; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
20; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
21; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
22; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
23; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
24; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
25; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
26; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
27; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
28; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
29; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
30; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
31; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
32; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
33; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
34; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
35; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
36; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
37; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
38; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
39; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
40; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
41; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
42; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
43; GFX9-NEXT:    s_endpgm
44;
45; GFX10-LABEL: zero_init_kernel:
46; GFX10:       ; %bb.0:
47; GFX10-NEXT:    s_add_u32 s0, s0, s3
48; GFX10-NEXT:    s_addc_u32 s1, s1, 0
49; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
50; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
51; GFX10-NEXT:    v_mov_b32_e32 v0, 0
52; GFX10-NEXT:    ; implicit-def: $vcc_hi
53; GFX10-NEXT:    scratch_store_dword off, v0, off offset:76
54; GFX10-NEXT:    scratch_store_dword off, v0, off offset:72
55; GFX10-NEXT:    scratch_store_dword off, v0, off offset:68
56; GFX10-NEXT:    scratch_store_dword off, v0, off offset:64
57; GFX10-NEXT:    scratch_store_dword off, v0, off offset:60
58; GFX10-NEXT:    scratch_store_dword off, v0, off offset:56
59; GFX10-NEXT:    scratch_store_dword off, v0, off offset:52
60; GFX10-NEXT:    scratch_store_dword off, v0, off offset:48
61; GFX10-NEXT:    scratch_store_dword off, v0, off offset:44
62; GFX10-NEXT:    scratch_store_dword off, v0, off offset:40
63; GFX10-NEXT:    scratch_store_dword off, v0, off offset:36
64; GFX10-NEXT:    scratch_store_dword off, v0, off offset:32
65; GFX10-NEXT:    scratch_store_dword off, v0, off offset:28
66; GFX10-NEXT:    scratch_store_dword off, v0, off offset:24
67; GFX10-NEXT:    scratch_store_dword off, v0, off offset:20
68; GFX10-NEXT:    scratch_store_dword off, v0, off offset:16
69; GFX10-NEXT:    s_endpgm
  ; 32 x i16 = 64 bytes, matching the memset length below.
70  %alloca = alloca [32 x i16], align 2, addrspace(5)
71  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile (i1 false) memset of 64 zero bytes over the whole array.
72  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
73  ret void
74}
75
; Non-kernel function (default calling convention) that zero-fills a 64-byte
; [32 x i16] array in addrspace(5) with llvm.memset. The expected output for
; both targets is sixteen scratch_store_dword instructions addressed relative to
; s32, covering offsets 60 down to 0, followed by a return through
; s_setpc_b64 s[30:31].
76define void @zero_init_foo() {
77; GFX9-LABEL: zero_init_foo:
78; GFX9:       ; %bb.0:
79; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX9-NEXT:    v_mov_b32_e32 v0, 0
81; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:60
82; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:56
83; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:52
84; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:48
85; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:44
86; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:40
87; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:36
88; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:32
89; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:28
90; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:24
91; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:20
92; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:16
93; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:12
94; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:8
95; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
96; GFX9-NEXT:    scratch_store_dword off, v0, s32
97; GFX9-NEXT:    s_waitcnt vmcnt(0)
98; GFX9-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX10-LABEL: zero_init_foo:
101; GFX10:       ; %bb.0:
102; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
104; GFX10-NEXT:    v_mov_b32_e32 v0, 0
105; GFX10-NEXT:    ; implicit-def: $vcc_hi
106; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:60
107; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:56
108; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:52
109; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:48
110; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:44
111; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:40
112; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:36
113; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:32
114; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:28
115; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:24
116; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:20
117; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:16
118; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:12
119; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:8
120; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
121; GFX10-NEXT:    scratch_store_dword off, v0, s32
122; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 32 x i16 = 64 bytes, matching the memset length below.
124  %alloca = alloca [32 x i16], align 2, addrspace(5)
125  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile (i1 false) memset of 64 zero bytes over the whole array.
126  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
127  ret void
128}
129
; Kernel taking an i32 argument %idx. It performs a volatile store of 15 to
; element %idx of a [32 x float] addrspace(5) array and a volatile load from
; element (%idx & 15) of the same array. In the expected output %idx is fetched
; with s_load_dword, both byte addresses are computed with scalar instructions
; (shift left by 2, add 4), and the results are used as the scalar address
; operand of scratch_store_dword / scratch_load_dword.
130define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
131; GFX9-LABEL: store_load_sindex_kernel:
132; GFX9:       ; %bb.0: ; %bb
133; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
134; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
135; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
136; GFX9-NEXT:    v_mov_b32_e32 v0, 15
137; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
139; GFX9-NEXT:    s_and_b32 s0, s0, 15
140; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
141; GFX9-NEXT:    s_add_u32 s1, 4, s1
142; GFX9-NEXT:    scratch_store_dword off, v0, s1
143; GFX9-NEXT:    s_add_u32 s0, 4, s0
144; GFX9-NEXT:    scratch_load_dword v0, off, s0
145; GFX9-NEXT:    s_endpgm
146;
147; GFX10-LABEL: store_load_sindex_kernel:
148; GFX10:       ; %bb.0: ; %bb
149; GFX10-NEXT:    s_add_u32 s2, s2, s5
150; GFX10-NEXT:    s_addc_u32 s3, s3, 0
151; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
152; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
153; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
154; GFX10-NEXT:    v_mov_b32_e32 v0, 15
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    s_and_b32 s1, s0, 15
157; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
158; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
159; GFX10-NEXT:    s_add_u32 s0, 4, s0
160; GFX10-NEXT:    s_add_u32 s1, 4, s1
161; GFX10-NEXT:    scratch_store_dword off, v0, s0
162; GFX10-NEXT:    scratch_load_dword v0, off, s1
163; GFX10-NEXT:    s_endpgm
164bb:
165  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is never referenced again in this function.
166  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
167  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
168  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
169  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
170  %i9 = and i32 %idx, 15
171  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
172  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
173  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
174  ret void
175}
176
; amdgpu_ps function taking an "inreg" i32 %idx. It performs a volatile store of
; 15 to element %idx of a [32 x float] addrspace(5) array and a volatile load
; from element (%idx & 15) of the same array. In the expected output the index
; is read directly from s2, both byte addresses are computed with scalar
; instructions (shift left by 2, add 4), and the results are used as the scalar
; address operand of scratch_store_dword / scratch_load_dword.
177define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
178; GFX9-LABEL: store_load_sindex_foo:
179; GFX9:       ; %bb.0: ; %bb
180; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
181; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
182; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
183; GFX9-NEXT:    s_add_u32 s0, 4, s0
184; GFX9-NEXT:    v_mov_b32_e32 v0, 15
185; GFX9-NEXT:    scratch_store_dword off, v0, s0
186; GFX9-NEXT:    s_and_b32 s0, s2, 15
187; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
188; GFX9-NEXT:    s_add_u32 s0, 4, s0
189; GFX9-NEXT:    scratch_load_dword v0, off, s0
190; GFX9-NEXT:    s_endpgm
191;
192; GFX10-LABEL: store_load_sindex_foo:
193; GFX10:       ; %bb.0: ; %bb
194; GFX10-NEXT:    s_add_u32 s0, s0, s3
195; GFX10-NEXT:    s_addc_u32 s1, s1, 0
196; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
197; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
198; GFX10-NEXT:    s_and_b32 s0, s2, 15
199; GFX10-NEXT:    v_mov_b32_e32 v0, 15
200; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
201; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
202; GFX10-NEXT:    s_add_u32 s1, 4, s1
203; GFX10-NEXT:    s_add_u32 s0, 4, s0
204; GFX10-NEXT:    scratch_store_dword off, v0, s1
205; GFX10-NEXT:    scratch_load_dword v0, off, s0
206; GFX10-NEXT:    s_endpgm
207bb:
208  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is never referenced again in this function.
209  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
210  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
211  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
212  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
213  %i9 = and i32 %idx, 15
214  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
215  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
216  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
217  ret void
218}
219
; Kernel that indexes a [32 x float] addrspace(5) array with the result of
; llvm.amdgcn.workitem.id.x: a volatile store of 15 to element id, then a
; volatile load from element (31 - id). In the expected output the addresses are
; held in vector registers (v_add / v_sub of the shifted v0 against the constant
; 4) and the load carries an immediate offset:124 (presumably 31 elements * 4
; bytes folded into the instruction offset).
220define amdgpu_kernel void @store_load_vindex_kernel() {
221; GFX9-LABEL: store_load_vindex_kernel:
222; GFX9:       ; %bb.0: ; %bb
223; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
224; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
225; GFX9-NEXT:    v_mov_b32_e32 v1, 4
226; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
227; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
228; GFX9-NEXT:    v_mov_b32_e32 v3, 15
229; GFX9-NEXT:    scratch_store_dword v2, v3, off
230; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
231; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
232; GFX9-NEXT:    s_endpgm
233;
234; GFX10-LABEL: store_load_vindex_kernel:
235; GFX10:       ; %bb.0: ; %bb
236; GFX10-NEXT:    s_add_u32 s0, s0, s3
237; GFX10-NEXT:    s_addc_u32 s1, s1, 0
238; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
239; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
240; GFX10-NEXT:    v_mov_b32_e32 v1, 4
241; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
242; GFX10-NEXT:    v_mov_b32_e32 v3, 15
243; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
244; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
245; GFX10-NEXT:    scratch_store_dword v2, v3, off
246; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
247; GFX10-NEXT:    s_endpgm
248bb:
249  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is never referenced again in this function.
250  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
251  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %i3 is never referenced again in this function.
252  %i3 = zext i32 %i2 to i64
  ; Store address: element %i2 (the workitem id) of %i, accessed as i32.
253  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
254  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
255  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (31 - %i2) of %i, accessed as i32.
256  %i9 = sub nsw i32 31, %i2
257  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
258  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
259  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
260  ret void
261}
262
; Non-kernel function (default calling convention) taking an i32 %idx. It
; performs a volatile store of 15 to element %idx of a [32 x float] addrspace(5)
; array and a volatile load from element (%idx & 15) of the same array. In the
; expected output s32 is copied into a vector register and each address is
; formed with v_lshl_add_u32 (index << 2, plus that base) before being used as
; the vector address operand of scratch_store_dword / scratch_load_dword.
263define void @store_load_vindex_foo(i32 %idx) {
264; GFX9-LABEL: store_load_vindex_foo:
265; GFX9:       ; %bb.0: ; %bb
266; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267; GFX9-NEXT:    v_mov_b32_e32 v1, s32
268; GFX9-NEXT:    v_mov_b32_e32 v3, 15
269; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
270; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
271; GFX9-NEXT:    scratch_store_dword v2, v3, off
272; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
273; GFX9-NEXT:    scratch_load_dword v0, v0, off
274; GFX9-NEXT:    s_waitcnt vmcnt(0)
275; GFX9-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX10-LABEL: store_load_vindex_foo:
278; GFX10:       ; %bb.0: ; %bb
279; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
281; GFX10-NEXT:    v_mov_b32_e32 v1, 15
282; GFX10-NEXT:    v_mov_b32_e32 v2, s32
283; GFX10-NEXT:    ; implicit-def: $vcc_hi
284; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
285; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
286; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
287; GFX10-NEXT:    scratch_store_dword v0, v1, off
288; GFX10-NEXT:    scratch_load_dword v0, v2, off
289; GFX10-NEXT:    s_waitcnt vmcnt(0)
290; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
291; GFX10-NEXT:    s_setpc_b64 s[30:31]
292bb:
293  %i = alloca [32 x float], align 4, addrspace(5)
  ; NOTE(review): %i1 is never referenced again in this function.
294  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
295  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
296  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
297  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
298  %i9 = and i32 %idx, 15
299  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
300  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
301  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
302  ret void
303}
304
; Non-kernel function that receives an addrspace(5) float pointer %arg and
; stores the float constant 10.0 to the element one past it (%arg + 1). The
; expected output for both targets is a single scratch_store_dword that uses v0
; as the vector address, the immediate 0x41200000 as data, and offset:4 for the
; one-element displacement.
305define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
306; GFX9-LABEL: private_ptr_foo:
307; GFX9:       ; %bb.0:
308; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
310; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
311; GFX9-NEXT:    s_waitcnt vmcnt(0)
312; GFX9-NEXT:    s_setpc_b64 s[30:31]
313;
314; GFX10-LABEL: private_ptr_foo:
315; GFX10:       ; %bb.0:
316; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
318; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
319; GFX10-NEXT:    ; implicit-def: $vcc_hi
320; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
321; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
322; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; Address of the float element at index 1 relative to %arg.
323  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  ; Plain (non-volatile) store of 10.0.
324  store float 1.000000e+01, float addrspace(5)* %gep, align 4
325  ret void
326}
327
; Kernel that declares a [64 x i32] %padding alloca ahead of a 64-byte
; [32 x i16] array, does one volatile load from %padding, and then zero-fills
; the array with llvm.memset. The expected output is a scratch_load_dword at
; offset:4 followed by sixteen scratch_store_dword instructions whose offsets
; span 272 through 332, emitted in descending groups of four. The gfx900
; sequence re-issues "s_mov_b32 vcc_hi, 0" before every scratch access; the
; gfx1030 sequence uses "off" for the scalar address operand.
328define amdgpu_kernel void @zero_init_small_offset_kernel() {
329; GFX9-LABEL: zero_init_small_offset_kernel:
330; GFX9:       ; %bb.0:
331; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
332; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
333; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
334; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
335; GFX9-NEXT:    s_waitcnt vmcnt(0)
336; GFX9-NEXT:    v_mov_b32_e32 v0, 0
337; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
338; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:284
339; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
340; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:280
341; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
342; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:276
343; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
344; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:272
345; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
346; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:300
347; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
348; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:296
349; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
350; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:292
351; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
352; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:288
353; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
354; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:316
355; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
356; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:312
357; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
358; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:308
359; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
360; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:304
361; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
362; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:332
363; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
364; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:328
365; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
366; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:324
367; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
368; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:320
369; GFX9-NEXT:    s_endpgm
370;
371; GFX10-LABEL: zero_init_small_offset_kernel:
372; GFX10:       ; %bb.0:
373; GFX10-NEXT:    s_add_u32 s0, s0, s3
374; GFX10-NEXT:    s_addc_u32 s1, s1, 0
375; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
376; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
377; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
378; GFX10-NEXT:    s_waitcnt vmcnt(0)
379; GFX10-NEXT:    v_mov_b32_e32 v0, 0
380; GFX10-NEXT:    ; implicit-def: $vcc_hi
381; GFX10-NEXT:    scratch_store_dword off, v0, off offset:284
382; GFX10-NEXT:    scratch_store_dword off, v0, off offset:280
383; GFX10-NEXT:    scratch_store_dword off, v0, off offset:276
384; GFX10-NEXT:    scratch_store_dword off, v0, off offset:272
385; GFX10-NEXT:    scratch_store_dword off, v0, off offset:300
386; GFX10-NEXT:    scratch_store_dword off, v0, off offset:296
387; GFX10-NEXT:    scratch_store_dword off, v0, off offset:292
388; GFX10-NEXT:    scratch_store_dword off, v0, off offset:288
389; GFX10-NEXT:    scratch_store_dword off, v0, off offset:316
390; GFX10-NEXT:    scratch_store_dword off, v0, off offset:312
391; GFX10-NEXT:    scratch_store_dword off, v0, off offset:308
392; GFX10-NEXT:    scratch_store_dword off, v0, off offset:304
393; GFX10-NEXT:    scratch_store_dword off, v0, off offset:332
394; GFX10-NEXT:    scratch_store_dword off, v0, off offset:328
395; GFX10-NEXT:    scratch_store_dword off, v0, off offset:324
396; GFX10-NEXT:    scratch_store_dword off, v0, off offset:320
397; GFX10-NEXT:    s_endpgm
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
398  %padding = alloca [64 x i32], align 4, addrspace(5)
399  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and the array ends up at a non-zero offset.
400  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
401  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
402  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile (i1 false) memset of 64 zero bytes over the whole array.
403  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
404  ret void
405}
406
; Non-kernel function (default calling convention) that declares a [64 x i32]
; %padding alloca ahead of a 64-byte [32 x i16] array, does one volatile load
; from %padding, and then zero-fills the array with llvm.memset. The expected
; output is a scratch_load_dword relative to s32 followed by sixteen
; scratch_store_dword instructions relative to s32 whose offsets span 256
; through 316, emitted in descending groups of four.
407define void @zero_init_small_offset_foo() {
408; GFX9-LABEL: zero_init_small_offset_foo:
409; GFX9:       ; %bb.0:
410; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411; GFX9-NEXT:    scratch_load_dword v0, off, s32
412; GFX9-NEXT:    s_waitcnt vmcnt(0)
413; GFX9-NEXT:    v_mov_b32_e32 v0, 0
414; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:268
415; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:264
416; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:260
417; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:256
418; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:284
419; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:280
420; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:276
421; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:272
422; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:300
423; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:296
424; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:292
425; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:288
426; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:316
427; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:312
428; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:308
429; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:304
430; GFX9-NEXT:    s_waitcnt vmcnt(0)
431; GFX9-NEXT:    s_setpc_b64 s[30:31]
432;
433; GFX10-LABEL: zero_init_small_offset_foo:
434; GFX10:       ; %bb.0:
435; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
437; GFX10-NEXT:    scratch_load_dword v0, off, s32
438; GFX10-NEXT:    s_waitcnt vmcnt(0)
439; GFX10-NEXT:    v_mov_b32_e32 v0, 0
440; GFX10-NEXT:    ; implicit-def: $vcc_hi
441; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:268
442; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:264
443; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:260
444; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:256
445; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:284
446; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:280
447; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:276
448; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:272
449; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:300
450; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:296
451; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:292
452; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:288
453; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:316
454; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:312
455; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:308
456; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:304
457; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
458; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
459  %padding = alloca [64 x i32], align 4, addrspace(5)
460  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and the array ends up at a non-zero offset.
461  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
462  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
463  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; Non-volatile (i1 false) memset of 64 zero bytes over the whole array.
464  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
465  ret void
466}
467
; Kernel taking an i32 argument %idx. A [64 x i32] %padding alloca is declared
; ahead of a [32 x float] array and read once with a volatile load; the kernel
; then does a volatile store of 15 to element %idx of the array and a volatile
; load from element (%idx & 15). In the expected output the padding load uses
; offset:4, and both array addresses are computed with scalar instructions
; (shift left by 2, add 0x104) and used as the scalar address operand of
; scratch_store_dword / scratch_load_dword.
468define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
469; GFX9-LABEL: store_load_sindex_small_offset_kernel:
470; GFX9:       ; %bb.0: ; %bb
471; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
472; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
473; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
474; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
475; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
476; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
478; GFX9-NEXT:    s_and_b32 s0, s0, 15
479; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
480; GFX9-NEXT:    s_waitcnt vmcnt(0)
481; GFX9-NEXT:    v_mov_b32_e32 v0, 15
482; GFX9-NEXT:    s_add_u32 s1, 0x104, s1
483; GFX9-NEXT:    scratch_store_dword off, v0, s1
484; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
485; GFX9-NEXT:    scratch_load_dword v0, off, s0
486; GFX9-NEXT:    s_endpgm
487;
488; GFX10-LABEL: store_load_sindex_small_offset_kernel:
489; GFX10:       ; %bb.0: ; %bb
490; GFX10-NEXT:    s_add_u32 s2, s2, s5
491; GFX10-NEXT:    s_addc_u32 s3, s3, 0
492; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
493; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
494; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
495; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
496; GFX10-NEXT:    s_waitcnt vmcnt(0)
497; GFX10-NEXT:    v_mov_b32_e32 v0, 15
498; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX10-NEXT:    s_and_b32 s1, s0, 15
500; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
501; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
502; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
503; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
504; GFX10-NEXT:    scratch_store_dword off, v0, s0
505; GFX10-NEXT:    scratch_load_dword v0, off, s1
506; GFX10-NEXT:    s_endpgm
507bb:
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
508  %padding = alloca [64 x i32], align 4, addrspace(5)
509  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and %i ends up at a non-zero offset.
510  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
511  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is never referenced again in this function.
512  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
513  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
514  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
515  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
516  %i9 = and i32 %idx, 15
517  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
518  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
519  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
520  ret void
521}
522
; amdgpu_ps function taking an "inreg" i32 %idx. A [64 x i32] %padding alloca is
; declared ahead of a [32 x float] array and read once with a volatile load; the
; function then does a volatile store of 15 to element %idx of the array and a
; volatile load from element (%idx & 15). In the expected output the padding
; load uses offset:4, the index is read from s2, and both array addresses are
; computed with scalar instructions (shift left by 2, add 0x104) and used as the
; scalar address operand of scratch_store_dword / scratch_load_dword.
523define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
524; GFX9-LABEL: store_load_sindex_small_offset_foo:
525; GFX9:       ; %bb.0: ; %bb
526; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
527; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
528; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
529; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
530; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
531; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
532; GFX9-NEXT:    s_waitcnt vmcnt(0)
533; GFX9-NEXT:    v_mov_b32_e32 v0, 15
534; GFX9-NEXT:    scratch_store_dword off, v0, s0
535; GFX9-NEXT:    s_and_b32 s0, s2, 15
536; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
537; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
538; GFX9-NEXT:    scratch_load_dword v0, off, s0
539; GFX9-NEXT:    s_endpgm
540;
541; GFX10-LABEL: store_load_sindex_small_offset_foo:
542; GFX10:       ; %bb.0: ; %bb
543; GFX10-NEXT:    s_add_u32 s0, s0, s3
544; GFX10-NEXT:    s_addc_u32 s1, s1, 0
545; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
546; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
547; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
548; GFX10-NEXT:    s_and_b32 s0, s2, 15
549; GFX10-NEXT:    s_waitcnt vmcnt(0)
550; GFX10-NEXT:    v_mov_b32_e32 v0, 15
551; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
552; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
553; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
554; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
555; GFX10-NEXT:    scratch_store_dword off, v0, s1
556; GFX10-NEXT:    scratch_load_dword v0, off, s0
557; GFX10-NEXT:    s_endpgm
558bb:
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
559  %padding = alloca [64 x i32], align 4, addrspace(5)
560  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and %i ends up at a non-zero offset.
561  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
562  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is never referenced again in this function.
563  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
564  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
565  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
566  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
567  %i9 = and i32 %idx, 15
568  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
569  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
570  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
571  ret void
572}
573
; Kernel that declares a [64 x i32] %padding alloca ahead of a [32 x float]
; array, reads %padding once with a volatile load, and then indexes the array
; with the result of llvm.amdgcn.workitem.id.x: a volatile store of 15 to
; element id and a volatile load from element (31 - id). In the expected output
; the padding load uses offset:4, the array addresses are held in vector
; registers (v_add / v_sub of the shifted v0 against the constant 0x104), and
; the final load carries an immediate offset:124 (presumably 31 elements * 4
; bytes folded into the instruction offset).
574define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
575; GFX9-LABEL: store_load_vindex_small_offset_kernel:
576; GFX9:       ; %bb.0: ; %bb
577; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
578; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
579; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
580; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
581; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
582; GFX9-NEXT:    s_waitcnt vmcnt(0)
583; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
584; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
585; GFX9-NEXT:    v_mov_b32_e32 v3, 15
586; GFX9-NEXT:    scratch_store_dword v2, v3, off
587; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
588; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
589; GFX9-NEXT:    s_endpgm
590;
591; GFX10-LABEL: store_load_vindex_small_offset_kernel:
592; GFX10:       ; %bb.0: ; %bb
593; GFX10-NEXT:    s_add_u32 s0, s0, s3
594; GFX10-NEXT:    s_addc_u32 s1, s1, 0
595; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
596; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
597; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
598; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
599; GFX10-NEXT:    v_mov_b32_e32 v3, 15
600; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
601; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
602; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
603; GFX10-NEXT:    scratch_store_dword v2, v3, off
604; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
605; GFX10-NEXT:    s_endpgm
606bb:
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
607  %padding = alloca [64 x i32], align 4, addrspace(5)
608  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and %i ends up at a non-zero offset.
609  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
610  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is never referenced again in this function.
611  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
612  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): %i3 is never referenced again in this function.
613  %i3 = zext i32 %i2 to i64
  ; Store address: element %i2 (the workitem id) of %i, accessed as i32.
614  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
615  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
616  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (31 - %i2) of %i, accessed as i32.
617  %i9 = sub nsw i32 31, %i2
618  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
619  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
620  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
621  ret void
622}
623
; Non-kernel function (default calling convention) taking an i32 %idx. A
; [64 x i32] %padding alloca is declared ahead of a [32 x float] array and read
; once with a volatile load; the function then does a volatile store of 15 to
; element %idx of the array and a volatile load from element (%idx & 15). In the
; expected output the array base is formed as s32 + 0x100 with s_add_u32 (into
; vcc_hi on gfx900, vcc_lo on gfx1030), copied to a vector register, and
; combined with the shifted index by v_lshl_add_u32; the padding load is a
; scratch_load_dword relative to s32.
624define void @store_load_vindex_small_offset_foo(i32 %idx) {
625; GFX9-LABEL: store_load_vindex_small_offset_foo:
626; GFX9:       ; %bb.0: ; %bb
627; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX9-NEXT:    scratch_load_dword v1, off, s32
629; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x100
630; GFX9-NEXT:    s_waitcnt vmcnt(0)
631; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
632; GFX9-NEXT:    v_mov_b32_e32 v3, 15
633; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
634; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
635; GFX9-NEXT:    scratch_store_dword v2, v3, off
636; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
637; GFX9-NEXT:    scratch_load_dword v0, v0, off
638; GFX9-NEXT:    s_waitcnt vmcnt(0)
639; GFX9-NEXT:    s_setpc_b64 s[30:31]
640;
641; GFX10-LABEL: store_load_vindex_small_offset_foo:
642; GFX10:       ; %bb.0: ; %bb
643; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
644; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
645; GFX10-NEXT:    v_mov_b32_e32 v1, 15
646; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x100
647; GFX10-NEXT:    ; implicit-def: $vcc_hi
648; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
649; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
650; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
651; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
652; GFX10-NEXT:    scratch_load_dword v3, off, s32
653; GFX10-NEXT:    scratch_store_dword v0, v1, off
654; GFX10-NEXT:    scratch_load_dword v0, v2, off
655; GFX10-NEXT:    s_waitcnt vmcnt(0)
656; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
657; GFX10-NEXT:    s_setpc_b64 s[30:31]
658bb:
  ; 64 x i32 = 256 bytes declared ahead of the array under test.
659  %padding = alloca [64 x i32], align 4, addrspace(5)
660  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably present so the
  ; padding alloca stays in use and %i ends up at a non-zero offset.
661  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
662  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is never referenced again in this function.
663  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store address: element %idx of %i, accessed as i32.
664  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
665  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
666  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load address: element (%idx & 15) of %i, accessed as i32.
667  %i9 = and i32 %idx, 15
668  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
669  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  ; The loaded value %i12 is unused; the load is volatile.
670  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
671  ret void
672}
673
; Kernel variant of the zero-init test with a large frame offset: a 64-byte
; memset to zero of a [32 x i16] alloca that follows a 4096 x i32 padding
; alloca. The checks expect 16 dword stores of zero, each preceded by an
; s_movk_i32 that reloads the base 0x4010 (vcc_hi in the GFX9 checks, vcc_lo in
; the GFX10 checks), with the remaining displacement in the immediate offset.
674define amdgpu_kernel void @zero_init_large_offset_kernel() {
675; GFX9-LABEL: zero_init_large_offset_kernel:
676; GFX9:       ; %bb.0:
677; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
678; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
679; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
680; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
681; GFX9-NEXT:    s_waitcnt vmcnt(0)
682; GFX9-NEXT:    v_mov_b32_e32 v0, 0
683; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
684; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:12
685; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
686; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:8
687; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
688; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
689; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
690; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi
691; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
692; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
693; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
694; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
695; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
696; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
697; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
698; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
699; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
700; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
701; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
702; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
703; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
704; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
705; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
706; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
707; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
708; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
709; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
710; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
711; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
712; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
713; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
714; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
715; GFX9-NEXT:    s_endpgm
716;
717; GFX10-LABEL: zero_init_large_offset_kernel:
718; GFX10:       ; %bb.0:
719; GFX10-NEXT:    s_add_u32 s0, s0, s3
720; GFX10-NEXT:    s_addc_u32 s1, s1, 0
721; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
722; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
723; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
724; GFX10-NEXT:    s_waitcnt vmcnt(0)
725; GFX10-NEXT:    v_mov_b32_e32 v0, 0
726; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
727; GFX10-NEXT:    ; implicit-def: $vcc_hi
728; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:12
729; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
730; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:8
731; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
732; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
733; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
734; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo
735; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
736; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:28
737; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
738; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:24
739; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
740; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:20
741; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
742; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:16
743; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
744; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:44
745; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
746; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:40
747; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
748; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:36
749; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
750; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:32
751; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
752; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:60
753; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
754; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:56
755; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
756; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:52
757; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
758; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:48
759; GFX10-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 (0x4000) bytes of padding ahead of the memset target.
760  %padding = alloca [4096 x i32], align 4, addrspace(5)
761  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
762  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
763  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset.
764  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
765  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
766  ret void
767}
768
; Non-kernel variant of the large-offset zero-init test: the same 64-byte
; memset of a [32 x i16] alloca placed after a 4096 x i32 padding alloca. The
; checks expect 16 dword stores of zero, each preceded by an s_add_u32 that
; recomputes s32 + 0x4000 (into vcc_hi in the GFX9 checks, vcc_lo in the GFX10
; checks), with the remaining displacement in the immediate offset.
769define void @zero_init_large_offset_foo() {
770; GFX9-LABEL: zero_init_large_offset_foo:
771; GFX9:       ; %bb.0:
772; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773; GFX9-NEXT:    scratch_load_dword v0, off, s32
774; GFX9-NEXT:    s_waitcnt vmcnt(0)
775; GFX9-NEXT:    v_mov_b32_e32 v0, 0
776; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
777; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:12
778; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
779; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:8
780; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
781; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
782; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
783; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi
784; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
785; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
786; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
787; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
788; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
789; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
790; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
791; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
792; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
793; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
794; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
795; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
796; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
797; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
798; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
799; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
800; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
801; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
802; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
803; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
804; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
805; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
806; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
807; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
808; GFX9-NEXT:    s_waitcnt vmcnt(0)
809; GFX9-NEXT:    s_setpc_b64 s[30:31]
810;
811; GFX10-LABEL: zero_init_large_offset_foo:
812; GFX10:       ; %bb.0:
813; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
814; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
815; GFX10-NEXT:    scratch_load_dword v0, off, s32
816; GFX10-NEXT:    s_waitcnt vmcnt(0)
817; GFX10-NEXT:    v_mov_b32_e32 v0, 0
818; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
819; GFX10-NEXT:    ; implicit-def: $vcc_hi
820; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:12
821; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
822; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:8
823; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
824; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
825; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
826; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo
827; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
828; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:28
829; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
830; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:24
831; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
832; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:20
833; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
834; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:16
835; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
836; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:44
837; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
838; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:40
839; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
840; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:36
841; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
842; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:32
843; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
844; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:60
845; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
846; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:56
847; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
848; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:52
849; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
850; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:48
851; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
852; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 = 16384 (0x4000) bytes of padding ahead of the memset target.
853  %padding = alloca [4096 x i32], align 4, addrspace(5)
854  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
855  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
856  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset.
857  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
858  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
859  ret void
860}
861
; Kernel with a scalar index and a large frame offset: a volatile store of 15
; to %i[%idx] and a volatile load of %i[%idx & 15], where %i follows a
; 4096 x i32 padding alloca. The checks expect %idx to be fetched with
; s_load_dword and both addresses to be computed in SGPRs as
; 0x4004 + (index << 2), then used as the SGPR operand of the scratch access.
862define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
863; GFX9-LABEL: store_load_sindex_large_offset_kernel:
864; GFX9:       ; %bb.0: ; %bb
865; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
866; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
867; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
868; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
869; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
870; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
872; GFX9-NEXT:    s_and_b32 s0, s0, 15
873; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
874; GFX9-NEXT:    s_waitcnt vmcnt(0)
875; GFX9-NEXT:    v_mov_b32_e32 v0, 15
876; GFX9-NEXT:    s_add_u32 s1, 0x4004, s1
877; GFX9-NEXT:    scratch_store_dword off, v0, s1
878; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
879; GFX9-NEXT:    scratch_load_dword v0, off, s0
880; GFX9-NEXT:    s_endpgm
881;
882; GFX10-LABEL: store_load_sindex_large_offset_kernel:
883; GFX10:       ; %bb.0: ; %bb
884; GFX10-NEXT:    s_add_u32 s2, s2, s5
885; GFX10-NEXT:    s_addc_u32 s3, s3, 0
886; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
887; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
888; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
889; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
890; GFX10-NEXT:    s_waitcnt vmcnt(0)
891; GFX10-NEXT:    v_mov_b32_e32 v0, 15
892; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX10-NEXT:    s_and_b32 s1, s0, 15
894; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
895; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
896; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
897; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
898; GFX10-NEXT:    scratch_store_dword off, v0, s0
899; GFX10-NEXT:    scratch_load_dword v0, off, s1
900; GFX10-NEXT:    s_endpgm
901bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding ahead of %i.
902  %padding = alloca [4096 x i32], align 4, addrspace(5)
903  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
904  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
905  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no uses in this function.
906  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx].
907  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
908  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
909  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
910  %i9 = and i32 %idx, 15
911  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
912  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
913  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
914  ret void
915}
916
; amdgpu_ps variant of the scalar-index, large-offset case: %idx is an inreg
; argument (the checks read it from s2). A volatile store of 15 goes to
; %i[%idx] and a volatile load reads %i[%idx & 15], with %i placed after a
; 4096 x i32 padding alloca. The checks expect both addresses to be computed
; in SGPRs as 0x4004 + (index << 2).
917define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
918; GFX9-LABEL: store_load_sindex_large_offset_foo:
919; GFX9:       ; %bb.0: ; %bb
920; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
921; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
922; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
923; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
924; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
925; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
926; GFX9-NEXT:    s_waitcnt vmcnt(0)
927; GFX9-NEXT:    v_mov_b32_e32 v0, 15
928; GFX9-NEXT:    scratch_store_dword off, v0, s0
929; GFX9-NEXT:    s_and_b32 s0, s2, 15
930; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
931; GFX9-NEXT:    s_add_u32 s0, 0x4004, s0
932; GFX9-NEXT:    scratch_load_dword v0, off, s0
933; GFX9-NEXT:    s_endpgm
934;
935; GFX10-LABEL: store_load_sindex_large_offset_foo:
936; GFX10:       ; %bb.0: ; %bb
937; GFX10-NEXT:    s_add_u32 s0, s0, s3
938; GFX10-NEXT:    s_addc_u32 s1, s1, 0
939; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
940; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
941; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
942; GFX10-NEXT:    s_and_b32 s0, s2, 15
943; GFX10-NEXT:    s_waitcnt vmcnt(0)
944; GFX10-NEXT:    v_mov_b32_e32 v0, 15
945; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
946; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
947; GFX10-NEXT:    s_add_u32 s1, 0x4004, s1
948; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
949; GFX10-NEXT:    scratch_store_dword off, v0, s1
950; GFX10-NEXT:    scratch_load_dword v0, off, s0
951; GFX10-NEXT:    s_endpgm
952bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding ahead of %i.
953  %padding = alloca [4096 x i32], align 4, addrspace(5)
954  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
955  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
956  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no uses in this function.
957  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx].
958  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
959  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
960  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
961  %i9 = and i32 %idx, 15
962  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
963  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
964  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
965  ret void
966}
967
; Kernel with a per-lane index and a large frame offset: the index is
; llvm.amdgcn.workitem.id.x. A volatile store of 15 goes to %i[id] and a
; volatile load reads %i[31 - id], with %i placed after a 4096 x i32 padding
; alloca. The checks expect the base 0x4004 in a VGPR, the store address as
; base + (id << 2), and the load address as base - (id << 2) with the constant
; 31 * 4 folded into the immediate "offset:124".
968define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
969; GFX9-LABEL: store_load_vindex_large_offset_kernel:
970; GFX9:       ; %bb.0: ; %bb
971; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
972; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
973; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
974; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
975; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
976; GFX9-NEXT:    s_waitcnt vmcnt(0)
977; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
978; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
979; GFX9-NEXT:    v_mov_b32_e32 v3, 15
980; GFX9-NEXT:    scratch_store_dword v2, v3, off
981; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
982; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
983; GFX9-NEXT:    s_endpgm
984;
985; GFX10-LABEL: store_load_vindex_large_offset_kernel:
986; GFX10:       ; %bb.0: ; %bb
987; GFX10-NEXT:    s_add_u32 s0, s0, s3
988; GFX10-NEXT:    s_addc_u32 s1, s1, 0
989; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
990; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
991; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
992; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
993; GFX10-NEXT:    v_mov_b32_e32 v3, 15
994; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
995; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
996; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
997; GFX10-NEXT:    scratch_store_dword v2, v3, off
998; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
999; GFX10-NEXT:    s_endpgm
1000bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding ahead of %i.
1001  %padding = alloca [4096 x i32], align 4, addrspace(5)
1002  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
1003  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1004  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 and %i3 have no uses in this function.
1005  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1006  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1007  %i3 = zext i32 %i2 to i64
  ; Store 15 to %i[workitem id].
1008  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1009  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1010  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from the mirrored element %i[31 - workitem id].
1011  %i9 = sub nsw i32 31, %i2
1012  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1013  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1014  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1015  ret void
1016}
1017
; Non-kernel variant of the "vindex, large offset" case: a volatile store of 15
; to %i[%idx] followed by a volatile load of %i[%idx & 15], where %i is
; allocated after a 4096 x i32 (16384-byte) padding alloca. The checks expect
; the base of %i to be formed as s32 + 0x4000 and copied into a VGPR, with both
; accesses using a VGPR address and "off" as the SGPR operand.
1018define void @store_load_vindex_large_offset_foo(i32 %idx) {
1019; GFX9-LABEL: store_load_vindex_large_offset_foo:
1020; GFX9:       ; %bb.0: ; %bb
1021; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1022; GFX9-NEXT:    scratch_load_dword v1, off, s32
1023; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
1024; GFX9-NEXT:    s_waitcnt vmcnt(0)
1025; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1026; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1027; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1028; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
1029; GFX9-NEXT:    scratch_store_dword v2, v3, off
1030; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1031; GFX9-NEXT:    scratch_load_dword v0, v0, off
1032; GFX9-NEXT:    s_waitcnt vmcnt(0)
1033; GFX9-NEXT:    s_setpc_b64 s[30:31]
1034;
1035; GFX10-LABEL: store_load_vindex_large_offset_foo:
1036; GFX10:       ; %bb.0: ; %bb
1037; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1039; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1040; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
1041; GFX10-NEXT:    ; implicit-def: $vcc_hi
1042; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
1043; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
1044; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
1045; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
1046; GFX10-NEXT:    scratch_load_dword v3, off, s32
1047; GFX10-NEXT:    scratch_store_dword v0, v1, off
1048; GFX10-NEXT:    scratch_load_dword v0, v2, off
1049; GFX10-NEXT:    s_waitcnt vmcnt(0)
1050; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1051; GFX10-NEXT:    s_setpc_b64 s[30:31]
1052bb:
  ; %padding precedes %i, so %i starts 4096 * 4 = 16384 (0x4000) bytes into the frame.
1053  %padding = alloca [4096 x i32], align 4, addrspace(5)
1054  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from the padding at an undef index (presumably there to keep
  ; the padding alloca from being dropped -- confirm).
1055  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1056  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no uses in this function.
1057  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Store 15 to %i[%idx].
1058  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1059  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1060  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from %i[%idx & 15].
1061  %i9 = and i32 %idx, 15
1062  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1063  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1064  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1065  ret void
1066}
1067
; Kernel accessing a constant element far into a single 4096 x i32 alloca:
; a volatile store of 13 at an undef index, then a volatile store of 15 and a
; volatile load at element 4000 (byte offset 16000). The checks expect the
; 16000-byte displacement to be split between an SGPR and the instruction's
; immediate offset: 0x3000 + 3712 in the GFX9 checks and 0x3800 + 1664 in the
; GFX10 checks, with 4 added to the SGPR part in both.
1068define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
1069; GFX9-LABEL: store_load_large_imm_offset_kernel:
1070; GFX9:       ; %bb.0: ; %bb
1071; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1072; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1073; GFX9-NEXT:    s_movk_i32 s0, 0x3000
1074; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1075; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1076; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
1077; GFX9-NEXT:    s_add_u32 s0, 4, s0
1078; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1079; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
1080; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
1081; GFX9-NEXT:    s_endpgm
1082;
1083; GFX10-LABEL: store_load_large_imm_offset_kernel:
1084; GFX10:       ; %bb.0: ; %bb
1085; GFX10-NEXT:    s_add_u32 s0, s0, s3
1086; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1087; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1088; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1089; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1090; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1091; GFX10-NEXT:    s_movk_i32 s0, 0x3800
1092; GFX10-NEXT:    s_add_u32 s0, 4, s0
1093; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
1094; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
1095; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
1096; GFX10-NEXT:    s_endpgm
1097bb:
1098  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 to an element selected by an undef index.
1099  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1100  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000 (4000 * 4 = 16000 bytes into %i).
1101  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1102  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element 4000.
1103  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1104  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1105  ret void
1106}
1107
; Non-kernel variant of the large-immediate-offset case: a volatile store of 13
; at an undef index, then a volatile store of 15 and a volatile load at element
; 4000 (byte offset 16000) of a 4096 x i32 alloca. The checks expect the
; displacement to be split as s32 + 0x3000 with offset:3712 in the GFX9 checks
; and s32 + 0x3800 with offset:1664 in the GFX10 checks.
1108define void @store_load_large_imm_offset_foo() {
1109; GFX9-LABEL: store_load_large_imm_offset_foo:
1110; GFX9:       ; %bb.0: ; %bb
1111; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; GFX9-NEXT:    s_movk_i32 s4, 0x3000
1113; GFX9-NEXT:    v_mov_b32_e32 v0, 13
1114; GFX9-NEXT:    scratch_store_dword off, v0, s32
1115; GFX9-NEXT:    s_add_u32 s4, s32, s4
1116; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1117; GFX9-NEXT:    scratch_store_dword off, v0, s4 offset:3712
1118; GFX9-NEXT:    scratch_load_dword v0, off, s4 offset:3712
1119; GFX9-NEXT:    s_waitcnt vmcnt(0)
1120; GFX9-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX10-LABEL: store_load_large_imm_offset_foo:
1123; GFX10:       ; %bb.0: ; %bb
1124; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1126; GFX10-NEXT:    v_mov_b32_e32 v0, 13
1127; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1128; GFX10-NEXT:    s_movk_i32 s4, 0x3800
1129; GFX10-NEXT:    ; implicit-def: $vcc_hi
1130; GFX10-NEXT:    s_add_u32 s4, s32, s4
1131; GFX10-NEXT:    scratch_store_dword off, v0, s32
1132; GFX10-NEXT:    scratch_store_dword off, v1, s4 offset:1664
1133; GFX10-NEXT:    scratch_load_dword v0, off, s4 offset:1664
1134; GFX10-NEXT:    s_waitcnt vmcnt(0)
1135; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1136; GFX10-NEXT:    s_setpc_b64 s[30:31]
1137bb:
1138  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 to an element selected by an undef index.
1139  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
1140  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  ; Volatile store of 15 to element 4000 (4000 * 4 = 16000 bytes into %i).
1141  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1142  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  ; Volatile load back from the same element 4000.
1143  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
1144  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
1145  ret void
1146}
1147
; Kernel combining a scalar index, a per-lane index and a constant: the element
; index is %sidx + workitem.id.x + 256. A volatile store of 15 and a volatile
; load both go through the same GEP. The checks expect the scalar and vector
; parts to be added into a VGPR, scaled by 4 and offset by 4, and the constant
; 256 * 4 to be folded into the immediate "offset:1024".
1148define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
1149; GFX9-LABEL: store_load_vidx_sidx_offset:
1150; GFX9:       ; %bb.0: ; %bb
1151; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1152; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1153; GFX9-NEXT:    v_mov_b32_e32 v1, 4
1154; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1155; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1156; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
1157; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1158; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1159; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
1160; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024
1161; GFX9-NEXT:    s_endpgm
1162;
1163; GFX10-LABEL: store_load_vidx_sidx_offset:
1164; GFX10:       ; %bb.0: ; %bb
1165; GFX10-NEXT:    s_add_u32 s2, s2, s5
1166; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1167; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1168; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1169; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1170; GFX10-NEXT:    v_mov_b32_e32 v1, 15
1171; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1172; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1173; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
1174; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
1175; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024
1176; GFX10-NEXT:    s_endpgm
1177bb:
1178  %alloca = alloca [32 x i32], align 4, addrspace(5)
1179  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; index = %sidx + workitem id + 256
1180  %add1 = add nsw i32 %sidx, %vidx
1181  %add2 = add nsw i32 %add1, 256
  ; NOTE(review): %alloca has only 32 elements, so this inbounds index is out of
  ; range unless %add1 is negative -- presumably deliberate to exercise the
  ; immediate-offset folding; confirm.
1182  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  ; Volatile store of 15 followed by a volatile load through the same pointer.
1183  store volatile i32 15, i32 addrspace(5)* %gep, align 4
1184  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
1185  ret void
1186}
1187
1188; FIXME: Multi-dword scratch accesses should be supported.
; Volatile i64 store of 15 and volatile i64 load through a private
; (addrspace(5)) pointer argument with 8-byte alignment. The checks currently
; expect each i64 access to be split into two dword scratch instructions at
; offsets 0 and 4.
1189define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
1190; GFX9-LABEL: store_load_i64_aligned:
1191; GFX9:       ; %bb.0: ; %bb
1192; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1194; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
1195; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1196; GFX9-NEXT:    scratch_store_dword v0, v1, off
1197; GFX9-NEXT:    scratch_load_dword v1, v0, off offset:4
1198; GFX9-NEXT:    scratch_load_dword v0, v0, off
1199; GFX9-NEXT:    s_waitcnt vmcnt(0)
1200; GFX9-NEXT:    s_setpc_b64 s[30:31]
1201;
1202; GFX10-LABEL: store_load_i64_aligned:
1203; GFX10:       ; %bb.0: ; %bb
1204; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1205; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1206; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1207; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1208; GFX10-NEXT:    ; implicit-def: $vcc_hi
1209; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
1210; GFX10-NEXT:    scratch_store_dword v0, v2, off
1211; GFX10-NEXT:    s_clause 0x1
1212; GFX10-NEXT:    scratch_load_dword v1, v0, off offset:4
1213; GFX10-NEXT:    scratch_load_dword v0, v0, off
1214; GFX10-NEXT:    s_waitcnt vmcnt(0)
1215; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1216; GFX10-NEXT:    s_setpc_b64 s[30:31]
1217bb:
  ; Both accesses are volatile and 8-byte aligned.
1218  store volatile i64 15, i64 addrspace(5)* %arg, align 8
1219  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
1220  ret void
1221}
1222
1223; FIXME: Unaligned multi-dword scratch accesses should be supported.
; Volatile i64 store of 15 and volatile i64 load through a private
; (addrspace(5)) pointer argument with 1-byte alignment. The checks currently
; expect each i64 access to be split into eight byte-sized scratch
; instructions (scratch_store_byte / scratch_load_ubyte) at offsets 0 to 7.
1224define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
1225; GFX9-LABEL: store_load_i64_unaligned:
1226; GFX9:       ; %bb.0: ; %bb
1227; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1228; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1229; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:7
1230; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:6
1231; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:5
1232; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:4
1233; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:3
1234; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:2
1235; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:1
1236; GFX9-NEXT:    v_mov_b32_e32 v1, 15
1237; GFX9-NEXT:    scratch_store_byte v0, v1, off
1238; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:6
1239; GFX9-NEXT:    s_waitcnt vmcnt(0)
1240; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:7
1241; GFX9-NEXT:    s_waitcnt vmcnt(0)
1242; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:4
1243; GFX9-NEXT:    s_waitcnt vmcnt(0)
1244; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:5
1245; GFX9-NEXT:    s_waitcnt vmcnt(0)
1246; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:2
1247; GFX9-NEXT:    s_waitcnt vmcnt(0)
1248; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:3
1249; GFX9-NEXT:    s_waitcnt vmcnt(0)
1250; GFX9-NEXT:    scratch_load_ubyte v1, v0, off
1251; GFX9-NEXT:    scratch_load_ubyte v0, v0, off offset:1
1252; GFX9-NEXT:    s_waitcnt vmcnt(0)
1253; GFX9-NEXT:    s_setpc_b64 s[30:31]
1254;
1255; GFX10-LABEL: store_load_i64_unaligned:
1256; GFX10:       ; %bb.0: ; %bb
1257; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1258; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1259; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1260; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1261; GFX10-NEXT:    ; implicit-def: $vcc_hi
1262; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:7
1263; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:6
1264; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:5
1265; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:4
1266; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:3
1267; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:2
1268; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:1
1269; GFX10-NEXT:    scratch_store_byte v0, v2, off
1270; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:6
1271; GFX10-NEXT:    s_waitcnt vmcnt(0)
1272; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:7
1273; GFX10-NEXT:    s_waitcnt vmcnt(0)
1274; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:4
1275; GFX10-NEXT:    s_waitcnt vmcnt(0)
1276; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:5
1277; GFX10-NEXT:    s_waitcnt vmcnt(0)
1278; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:2
1279; GFX10-NEXT:    s_waitcnt vmcnt(0)
1280; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:3
1281; GFX10-NEXT:    s_waitcnt vmcnt(0)
1282; GFX10-NEXT:    s_clause 0x1
1283; GFX10-NEXT:    scratch_load_ubyte v1, v0, off
1284; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
1285; GFX10-NEXT:    s_waitcnt vmcnt(0)
1286; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1287; GFX10-NEXT:    s_setpc_b64 s[30:31]
1288bb:
  ; Both accesses are volatile and only 1-byte aligned.
1289  store volatile i64 15, i64 addrspace(5)* %arg, align 1
1290  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
1291  ret void
1292}
1293
; Intrinsics used by the tests in this file: memset on an addrspace(5) pointer
; with an i64 length (zero_init_* tests), and the workitem id in x (the tests
; that index scratch by workitem id).
1294declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
1295declare i32 @llvm.amdgcn.workitem.id.x()
1296