; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
8
; Zero-fills a 64-byte addrspace(5) alloca ([32 x i16]) with llvm.memset from
; an amdgpu_kernel entry point. Every run configuration checks the resulting
; sequence of four scratch_store_dwordx4 instructions.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
154
; Same 64-byte llvm.memset zero-fill of an addrspace(5) alloca as the kernel
; variant, but in a plain (non-kernel) function. The checks expect the stores
; to be addressed relative to s32 and the function to return via s_setpc_b64.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
265
; Volatile store of 15 to element %idx of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (%idx & 15). %idx is a kernel
; argument, and the checks expect both addresses to be formed in SGPRs.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
394
; Volatile store of 15 to element %idx of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (%idx & 15). Here %idx is an inreg
; i32 argument of an amdgpu_ps function rather than a kernel argument.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
512
; Volatile store of 15 to element id of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (31 - id), where id is the result
; of llvm.amdgcn.workitem.id.x. The checks expect VGPR-based addressing.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
618
; Volatile store of 15 to element %idx of a [32 x float] addrspace(5) alloca,
; followed by a volatile load from element (%idx & 15). %idx is an ordinary
; i32 argument of a non-kernel function; the checks expect the address to be
; built from v0 together with s32.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
712
; Stores the float constant 1.000000e+01 to element 1 (byte offset 4) of the
; addrspace(5) pointer argument. Each set of assertions below expects a single
; scratch_store_dword with offset:4 whose data register was loaded with
; 0x41200000, followed by the wait and return sequence for that target.
713define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
714; GFX9-LABEL: private_ptr_foo:
715; GFX9:       ; %bb.0:
716; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
718; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
719; GFX9-NEXT:    s_waitcnt vmcnt(0)
720; GFX9-NEXT:    s_setpc_b64 s[30:31]
721;
722; GFX10-LABEL: private_ptr_foo:
723; GFX10:       ; %bb.0:
724; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
726; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
727; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
728; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
729; GFX10-NEXT:    s_setpc_b64 s[30:31]
730;
731; GFX9-PAL-LABEL: private_ptr_foo:
732; GFX9-PAL:       ; %bb.0:
733; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
734; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
735; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
736; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
737; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
738;
739; GFX940-LABEL: private_ptr_foo:
740; GFX940:       ; %bb.0:
741; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
743; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
744; GFX940-NEXT:    s_waitcnt vmcnt(0)
745; GFX940-NEXT:    s_setpc_b64 s[30:31]
746;
747; GFX10-PAL-LABEL: private_ptr_foo:
748; GFX10-PAL:       ; %bb.0:
749; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
750; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
751; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
752; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
753; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
754; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; Address of the second float behind %arg (index 1).
762  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
763  store float 1.000000e+01, float addrspace(5)* %gep, align 4
764  ret void
765}
766
; Kernel that allocates a [64 x i32] padding array (256 bytes) and a
; [32 x i16] array in addrspace(5), performs one volatile load from the
; padding array, and then zero-fills the 64 bytes of the second array with
; llvm.memset. The assertions below expect the memset to be emitted as four
; scratch_store_dwordx4 instructions at offsets 272, 288, 304 and 320, after a
; scratch_load_dword at offset:4 for the volatile padding load.
767define amdgpu_kernel void @zero_init_small_offset_kernel() {
768; GFX9-LABEL: zero_init_small_offset_kernel:
769; GFX9:       ; %bb.0:
770; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
771; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
772; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
773; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
774; GFX9-NEXT:    s_waitcnt vmcnt(0)
775; GFX9-NEXT:    s_mov_b32 s0, 0
776; GFX9-NEXT:    s_mov_b32 s1, s0
777; GFX9-NEXT:    s_mov_b32 s2, s0
778; GFX9-NEXT:    s_mov_b32 s3, s0
779; GFX9-NEXT:    v_mov_b32_e32 v0, s0
780; GFX9-NEXT:    v_mov_b32_e32 v1, s1
781; GFX9-NEXT:    v_mov_b32_e32 v2, s2
782; GFX9-NEXT:    v_mov_b32_e32 v3, s3
783; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
784; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
785; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
786; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
787; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
788; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
789; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
790; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
791; GFX9-NEXT:    s_endpgm
792;
793; GFX10-LABEL: zero_init_small_offset_kernel:
794; GFX10:       ; %bb.0:
795; GFX10-NEXT:    s_add_u32 s0, s0, s3
796; GFX10-NEXT:    s_addc_u32 s1, s1, 0
797; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
798; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
799; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
800; GFX10-NEXT:    s_waitcnt vmcnt(0)
801; GFX10-NEXT:    s_mov_b32 s0, 0
802; GFX10-NEXT:    s_mov_b32 s1, s0
803; GFX10-NEXT:    s_mov_b32 s2, s0
804; GFX10-NEXT:    s_mov_b32 s3, s0
805; GFX10-NEXT:    v_mov_b32_e32 v0, s0
806; GFX10-NEXT:    v_mov_b32_e32 v1, s1
807; GFX10-NEXT:    v_mov_b32_e32 v2, s2
808; GFX10-NEXT:    v_mov_b32_e32 v3, s3
809; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
810; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
811; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
812; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
813; GFX10-NEXT:    s_endpgm
814;
815; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
816; GFX9-PAL:       ; %bb.0:
817; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
818; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
819; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
820; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
821; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
822; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
824; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
825; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
826; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
827; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
828; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
829; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
830; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
831; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
832; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
833; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
834; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
835; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
836; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
837; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
838; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
839; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
840; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
841; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
842; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
843; GFX9-PAL-NEXT:    s_endpgm
844;
845; GFX940-LABEL: zero_init_small_offset_kernel:
846; GFX940:       ; %bb.0:
847; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
848; GFX940-NEXT:    s_waitcnt vmcnt(0)
849; GFX940-NEXT:    s_mov_b32 s0, 0
850; GFX940-NEXT:    s_mov_b32 s1, s0
851; GFX940-NEXT:    s_mov_b32 s2, s0
852; GFX940-NEXT:    s_mov_b32 s3, s0
853; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
854; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
855; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
856; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
857; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
858; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
859; GFX940-NEXT:    s_endpgm
860;
861; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
862; GFX1010-PAL:       ; %bb.0:
863; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
864; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
865; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
866; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
868; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
869; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
870; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
871; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
872; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
873; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
874; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
875; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
876; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
877; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
878; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
879; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
880; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
881; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
882; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
883; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
884; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
885; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
886; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
887; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
888; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
889; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
890; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
891; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
892; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
893; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
894; GFX1010-PAL-NEXT:    s_endpgm
895;
896; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
897; GFX1030-PAL:       ; %bb.0:
898; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
899; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
900; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
901; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
903; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
904; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
905; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
906; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
907; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
908; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
909; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
910; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
911; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
912; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
913; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
914; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
915; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
916; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
917; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
918; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
919; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
920; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
921; GFX1030-PAL-NEXT:    s_endpgm
  ; 64 x i32 = 256 bytes of padding; presumably this is what pushes %alloca to
  ; the non-zero offsets seen in the assertions (TODO confirm frame layout).
922  %padding = alloca [64 x i32], align 4, addrspace(5)
923  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load with an undef index: the access to %padding cannot be
  ; dropped, and the assertions show it as a scratch_load_dword at offset:4.
924  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
925  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes (32 x i16) of %alloca.
926  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
927  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
928  ret void
929}
930
; Non-kernel variant of the small-offset zero-init test: allocates a
; [64 x i32] padding array (256 bytes) and a [32 x i16] array in addrspace(5),
; performs one volatile load from the padding array, and then zero-fills the
; 64 bytes of the second array with llvm.memset. The assertions below expect
; the accesses to be addressed relative to s32: a scratch_load_dword with no
; immediate offset, then four scratch_store_dwordx4 at offsets 256, 272, 288
; and 304.
931define void @zero_init_small_offset_foo() {
932; GFX9-LABEL: zero_init_small_offset_foo:
933; GFX9:       ; %bb.0:
934; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
935; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
936; GFX9-NEXT:    s_waitcnt vmcnt(0)
937; GFX9-NEXT:    s_mov_b32 s0, 0
938; GFX9-NEXT:    s_mov_b32 s1, s0
939; GFX9-NEXT:    s_mov_b32 s2, s0
940; GFX9-NEXT:    s_mov_b32 s3, s0
941; GFX9-NEXT:    v_mov_b32_e32 v0, s0
942; GFX9-NEXT:    v_mov_b32_e32 v1, s1
943; GFX9-NEXT:    v_mov_b32_e32 v2, s2
944; GFX9-NEXT:    v_mov_b32_e32 v3, s3
945; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
946; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
947; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
948; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
949; GFX9-NEXT:    s_waitcnt vmcnt(0)
950; GFX9-NEXT:    s_setpc_b64 s[30:31]
951;
952; GFX10-LABEL: zero_init_small_offset_foo:
953; GFX10:       ; %bb.0:
954; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
955; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
956; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
957; GFX10-NEXT:    s_waitcnt vmcnt(0)
958; GFX10-NEXT:    s_mov_b32 s0, 0
959; GFX10-NEXT:    s_mov_b32 s1, s0
960; GFX10-NEXT:    s_mov_b32 s2, s0
961; GFX10-NEXT:    s_mov_b32 s3, s0
962; GFX10-NEXT:    v_mov_b32_e32 v0, s0
963; GFX10-NEXT:    v_mov_b32_e32 v1, s1
964; GFX10-NEXT:    v_mov_b32_e32 v2, s2
965; GFX10-NEXT:    v_mov_b32_e32 v3, s3
966; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
967; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
968; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
969; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
970; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
971; GFX10-NEXT:    s_setpc_b64 s[30:31]
972;
973; GFX9-PAL-LABEL: zero_init_small_offset_foo:
974; GFX9-PAL:       ; %bb.0:
975; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
976; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
977; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
978; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
979; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
980; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
981; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
982; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
983; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
984; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
985; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
986; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
987; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
988; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
989; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
990; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
991; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
992;
993; GFX940-LABEL: zero_init_small_offset_foo:
994; GFX940:       ; %bb.0:
995; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
996; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
997; GFX940-NEXT:    s_waitcnt vmcnt(0)
998; GFX940-NEXT:    s_mov_b32 s0, 0
999; GFX940-NEXT:    s_mov_b32 s1, s0
1000; GFX940-NEXT:    s_mov_b32 s2, s0
1001; GFX940-NEXT:    s_mov_b32 s3, s0
1002; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1003; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1004; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1005; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1006; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1007; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1008; GFX940-NEXT:    s_waitcnt vmcnt(0)
1009; GFX940-NEXT:    s_setpc_b64 s[30:31]
1010;
1011; GFX10-PAL-LABEL: zero_init_small_offset_foo:
1012; GFX10-PAL:       ; %bb.0:
1013; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1014; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1015; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1016; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1017; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1018; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1019; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1020; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1021; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1022; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1023; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1024; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1025; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1026; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1027; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1028; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1029; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1030; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 64 x i32 = 256 bytes of padding; the stores to %alloca are asserted at
  ; offset 256 and up from s32.
1048  %padding = alloca [64 x i32], align 4, addrspace(5)
1049  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile load with an undef index: the access to %padding cannot be dropped.
1050  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1051  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes (32 x i16) of %alloca.
1052  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1053  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1054  ret void
1055}
1056
; Kernel taking an i32 index argument. It allocates a [64 x i32] padding
; array and a [32 x float] array in addrspace(5), performs one volatile load
; from the padding array, volatile-stores the i32 value 15 to element %idx of
; the float array, and then volatile-loads element (%idx & 15) of the same
; array. In the assertions below the index is fetched with s_load_dword, both
; addresses are computed with scalar instructions (s_lshl_b32 by 2 followed by
; s_addk_i32 0x104), and the accesses use the resulting SGPR as the scratch
; address operand.
1057define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
1058; GFX9-LABEL: store_load_sindex_small_offset_kernel:
1059; GFX9:       ; %bb.0: ; %bb
1060; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1061; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1062; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1063; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1064; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1065; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1066; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1067; GFX9-NEXT:    s_and_b32 s0, s0, 15
1068; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1069; GFX9-NEXT:    s_addk_i32 s1, 0x104
1070; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1071; GFX9-NEXT:    scratch_store_dword off, v0, s1
1072; GFX9-NEXT:    s_waitcnt vmcnt(0)
1073; GFX9-NEXT:    s_addk_i32 s0, 0x104
1074; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1075; GFX9-NEXT:    s_waitcnt vmcnt(0)
1076; GFX9-NEXT:    s_endpgm
1077;
1078; GFX10-LABEL: store_load_sindex_small_offset_kernel:
1079; GFX10:       ; %bb.0: ; %bb
1080; GFX10-NEXT:    s_add_u32 s2, s2, s5
1081; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1082; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1083; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1084; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1085; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1086; GFX10-NEXT:    s_waitcnt vmcnt(0)
1087; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1088; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1089; GFX10-NEXT:    s_and_b32 s1, s0, 15
1090; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1091; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1092; GFX10-NEXT:    s_addk_i32 s0, 0x104
1093; GFX10-NEXT:    s_addk_i32 s1, 0x104
1094; GFX10-NEXT:    scratch_store_dword off, v0, s0
1095; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1096; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1097; GFX10-NEXT:    s_waitcnt vmcnt(0)
1098; GFX10-NEXT:    s_endpgm
1099;
1100; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
1101; GFX9-PAL:       ; %bb.0: ; %bb
1102; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1103; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1104; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1105; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1106; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1107; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1109; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1110; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1111; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1112; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1113; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1114; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1115; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1116; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
1117; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1118; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1119; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1120; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
1121; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1122; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1123; GFX9-PAL-NEXT:    s_endpgm
1124;
1125; GFX940-LABEL: store_load_sindex_small_offset_kernel:
1126; GFX940:       ; %bb.0: ; %bb
1127; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
1128; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
1129; GFX940-NEXT:    s_waitcnt vmcnt(0)
1130; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1131; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1133; GFX940-NEXT:    s_and_b32 s0, s0, 15
1134; GFX940-NEXT:    s_addk_i32 s1, 0x104
1135; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1136; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1137; GFX940-NEXT:    s_waitcnt vmcnt(0)
1138; GFX940-NEXT:    s_addk_i32 s0, 0x104
1139; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1140; GFX940-NEXT:    s_waitcnt vmcnt(0)
1141; GFX940-NEXT:    s_endpgm
1142;
1143; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
1144; GFX1010-PAL:       ; %bb.0: ; %bb
1145; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
1146; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
1147; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1148; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1150; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
1151; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
1152; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1153; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1154; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1155; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1156; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1157; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1158; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1159; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1161; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1162; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1163; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
1164; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
1165; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1166; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1167; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1168; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1169; GFX1010-PAL-NEXT:    s_endpgm
1170;
1171; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
1172; GFX1030-PAL:       ; %bb.0: ; %bb
1173; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
1174; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
1175; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1176; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1178; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
1179; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
1180; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1181; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1182; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1183; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1184; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1185; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1186; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1188; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1189; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1190; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
1191; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
1192; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1193; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1194; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1195; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1196; GFX1030-PAL-NEXT:    s_endpgm
1197bb:
  ; 64 x i32 = 256 bytes of padding ahead of the array under test (%i).
1198  %padding = alloca [64 x i32], align 4, addrspace(5)
1199  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load with an undef index: the access to %padding cannot be dropped.
1200  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1201  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is not referenced by any later instruction in this function.
1202  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
1203  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1204  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1205  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1206  %i9 = and i32 %idx, 15
1207  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1208  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1209  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1210  ret void
1211}
1212
; amdgpu_ps variant of the scalar-index small-offset test; the index is passed
; as an inreg i32 argument. It allocates a [64 x i32] padding array and a
; [32 x float] array in addrspace(5), performs one volatile load from the
; padding array, volatile-stores the i32 value 15 to element %idx of the float
; array, and then volatile-loads element (%idx & 15) of the same array. In the
; assertions below both addresses are computed with scalar instructions
; (s_lshl_b32 by 2 followed by s_addk_i32 0x104) and the accesses use the
; resulting SGPR as the scratch address operand.
1213define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
1214; GFX9-LABEL: store_load_sindex_small_offset_foo:
1215; GFX9:       ; %bb.0: ; %bb
1216; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1217; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1218; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1219; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1220; GFX9-NEXT:    s_waitcnt vmcnt(0)
1221; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1222; GFX9-NEXT:    s_addk_i32 s0, 0x104
1223; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1224; GFX9-NEXT:    scratch_store_dword off, v0, s0
1225; GFX9-NEXT:    s_waitcnt vmcnt(0)
1226; GFX9-NEXT:    s_and_b32 s0, s2, 15
1227; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1228; GFX9-NEXT:    s_addk_i32 s0, 0x104
1229; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1230; GFX9-NEXT:    s_waitcnt vmcnt(0)
1231; GFX9-NEXT:    s_endpgm
1232;
1233; GFX10-LABEL: store_load_sindex_small_offset_foo:
1234; GFX10:       ; %bb.0: ; %bb
1235; GFX10-NEXT:    s_add_u32 s0, s0, s3
1236; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1237; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1238; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1239; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1240; GFX10-NEXT:    s_waitcnt vmcnt(0)
1241; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1242; GFX10-NEXT:    s_and_b32 s0, s2, 15
1243; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1244; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1245; GFX10-NEXT:    s_addk_i32 s1, 0x104
1246; GFX10-NEXT:    s_addk_i32 s0, 0x104
1247; GFX10-NEXT:    scratch_store_dword off, v0, s1
1248; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1249; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1250; GFX10-NEXT:    s_waitcnt vmcnt(0)
1251; GFX10-NEXT:    s_endpgm
1252;
1253; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
1254; GFX9-PAL:       ; %bb.0: ; %bb
1255; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1256; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1257; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1258; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1259; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1260; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1261; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1262; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1263; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1264; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1265; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1266; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1267; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
1268; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1269; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1270; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1271; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1272; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
1273; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1274; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1275; GFX9-PAL-NEXT:    s_endpgm
1276;
1277; GFX940-LABEL: store_load_sindex_small_offset_foo:
1278; GFX940:       ; %bb.0: ; %bb
1279; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
1280; GFX940-NEXT:    s_waitcnt vmcnt(0)
1281; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1282; GFX940-NEXT:    s_and_b32 s0, s0, 15
1283; GFX940-NEXT:    s_addk_i32 s1, 0x104
1284; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1285; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1286; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1287; GFX940-NEXT:    s_waitcnt vmcnt(0)
1288; GFX940-NEXT:    s_addk_i32 s0, 0x104
1289; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1290; GFX940-NEXT:    s_waitcnt vmcnt(0)
1291; GFX940-NEXT:    s_endpgm
1292;
1293; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
1294; GFX1010-PAL:       ; %bb.0: ; %bb
1295; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1296; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1297; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1298; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1299; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1300; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1301; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1302; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1303; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1304; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1305; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1306; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
1307; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1308; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1309; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1310; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1311; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
1312; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
1313; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1314; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1315; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1316; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1317; GFX1010-PAL-NEXT:    s_endpgm
1318;
1319; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
1320; GFX1030-PAL:       ; %bb.0: ; %bb
1321; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1322; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1323; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1324; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1326; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1327; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1328; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1329; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1330; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1331; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1332; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1333; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1334; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1335; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1336; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
1337; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
1338; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1339; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1340; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1341; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1342; GFX1030-PAL-NEXT:    s_endpgm
1343bb:
  ; 64 x i32 = 256 bytes of padding ahead of the array under test (%i).
1344  %padding = alloca [64 x i32], align 4, addrspace(5)
1345  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load with an undef index: the access to %padding cannot be dropped.
1346  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1347  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 is not referenced by any later instruction in this function.
1348  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of 15 to element %idx of %i.
1349  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1350  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1351  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1352  %i9 = and i32 %idx, 15
1353  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1354  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1355  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1356  ret void
1357}
1358
; Kernel test: volatile store to and volatile load from a [32 x float] private
; (addrspace(5)) array indexed by the workitem id, where the array is declared
; after a [64 x i32] (256-byte) padding alloca. The store targets element
; id, the load targets element (31 - id).
1359define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
1360; GFX9-LABEL: store_load_vindex_small_offset_kernel:
1361; GFX9:       ; %bb.0: ; %bb
1362; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1363; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1364; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1365; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1366; GFX9-NEXT:    s_waitcnt vmcnt(0)
1367; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1368; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
1369; GFX9-NEXT:    v_mov_b32_e32 v2, 15
1370; GFX9-NEXT:    scratch_store_dword v1, v2, off
1371; GFX9-NEXT:    s_waitcnt vmcnt(0)
1372; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1373; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1374; GFX9-NEXT:    s_waitcnt vmcnt(0)
1375; GFX9-NEXT:    s_endpgm
1376;
1377; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1378; GFX10:       ; %bb.0: ; %bb
1379; GFX10-NEXT:    s_add_u32 s0, s0, s3
1380; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1381; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1382; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1383; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1384; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1385; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
1386; GFX10-NEXT:    s_waitcnt vmcnt(0)
1387; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1388; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1389; GFX10-NEXT:    scratch_store_dword v1, v2, off
1390; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1391; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1392; GFX10-NEXT:    s_waitcnt vmcnt(0)
1393; GFX10-NEXT:    s_endpgm
1394;
1395; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1396; GFX9-PAL:       ; %bb.0: ; %bb
1397; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1398; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1399; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1400; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1401; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1402; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
1403; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1404; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1405; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1406; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1407; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
1408; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1409; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
1410; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
1411; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1412; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1413; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1414; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1415; GFX9-PAL-NEXT:    s_endpgm
1416;
1417; GFX940-LABEL: store_load_vindex_small_offset_kernel:
1418; GFX940:       ; %bb.0: ; %bb
1419; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
1420; GFX940-NEXT:    s_waitcnt vmcnt(0)
1421; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1422; GFX940-NEXT:    v_mov_b32_e32 v1, 15
1423; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
1424; GFX940-NEXT:    s_waitcnt vmcnt(0)
1425; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
1426; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
1427; GFX940-NEXT:    s_waitcnt vmcnt(0)
1428; GFX940-NEXT:    s_endpgm
1429;
1430; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
1431; GFX1010-PAL:       ; %bb.0: ; %bb
1432; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1433; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1434; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1435; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1436; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1437; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1438; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1439; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1440; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1441; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1442; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
1443; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1444; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
1445; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1446; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1447; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1448; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
1449; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1450; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1451; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1452; GFX1010-PAL-NEXT:    s_endpgm
1453;
1454; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
1455; GFX1030-PAL:       ; %bb.0: ; %bb
1456; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1457; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1458; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1459; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1460; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1461; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1462; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1463; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1464; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1465; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1466; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
1467; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
1468; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1469; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
1470; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
1471; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
1472; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1473; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1474; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1475; GFX1030-PAL-NEXT:    s_endpgm
1476bb:
  ; %padding is referenced only by the volatile load below (undef index).
  ; The expected code above addresses %i starting at 0x104 (260).
1477  %padding = alloca [64 x i32], align 4, addrspace(5)
1478  %i = alloca [32 x float], align 4, addrspace(5)
1479  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1480  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 and %i3 have no users in this function.
1481  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
1482  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
1483  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to %i[id], through an i32 view.
1484  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
1485  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1486  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[31 - id]; the expected code carries 31 * 4 = 124 as
  ; the instruction offset.
1487  %i9 = sub nsw i32 31, %i2
1488  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1489  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1490  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1491  ret void
1492}
1493
; Non-kernel test: volatile store to and volatile load from a [32 x float]
; private (addrspace(5)) array declared after a [64 x i32] (256-byte) padding
; alloca. The index arrives as the i32 argument %idx: the store targets
; element %idx, the load targets element (%idx & 15).
1494define void @store_load_vindex_small_offset_foo(i32 %idx) {
1495; GFX9-LABEL: store_load_vindex_small_offset_foo:
1496; GFX9:       ; %bb.0: ; %bb
1497; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1498; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
1499; GFX9-NEXT:    s_waitcnt vmcnt(0)
1500; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1501; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
1502; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1503; GFX9-NEXT:    v_mov_b32_e32 v3, 15
1504; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
1505; GFX9-NEXT:    scratch_store_dword v2, v3, off
1506; GFX9-NEXT:    s_waitcnt vmcnt(0)
1507; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1508; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
1509; GFX9-NEXT:    s_waitcnt vmcnt(0)
1510; GFX9-NEXT:    s_setpc_b64 s[30:31]
1511;
1512; GFX10-LABEL: store_load_vindex_small_offset_foo:
1513; GFX10:       ; %bb.0: ; %bb
1514; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1515; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1516; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
1517; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1518; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1519; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
1520; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1521; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1522; GFX10-NEXT:    s_waitcnt vmcnt(0)
1523; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
1524; GFX10-NEXT:    scratch_store_dword v0, v2, off
1525; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1526; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
1527; GFX10-NEXT:    s_waitcnt vmcnt(0)
1528; GFX10-NEXT:    s_setpc_b64 s[30:31]
1529;
1530; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1531; GFX9-PAL:       ; %bb.0: ; %bb
1532; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1533; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
1534; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1535; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
1536; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
1537; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
1538; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
1539; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
1540; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
1541; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1542; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
1543; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
1544; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1545; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1546;
1547; GFX940-LABEL: store_load_vindex_small_offset_foo:
1548; GFX940:       ; %bb.0: ; %bb
1549; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1550; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
1551; GFX940-NEXT:    s_waitcnt vmcnt(0)
1552; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1553; GFX940-NEXT:    v_mov_b32_e32 v2, 15
1554; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
1555; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
1556; GFX940-NEXT:    s_waitcnt vmcnt(0)
1557; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1558; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
1559; GFX940-NEXT:    s_waitcnt vmcnt(0)
1560; GFX940-NEXT:    s_setpc_b64 s[30:31]
1561;
1562; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1563; GFX10-PAL:       ; %bb.0: ; %bb
1564; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1565; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1566; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
1567; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1568; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
1569; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
1570; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
1571; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
1572; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1573; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
1574; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
1575; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1576; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
1577; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1578; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1593bb:
  ; %padding is referenced only by the volatile load below (undef index).
  ; The expected code above addresses %i at s32 + 0x100 (256).
1594  %padding = alloca [64 x i32], align 4, addrspace(5)
1595  %i = alloca [32 x float], align 4, addrspace(5)
1596  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1597  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; NOTE(review): %i1 has no users in this function.
1598  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to %i[%idx], through an i32 view.
1599  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
1600  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
1601  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Volatile load from %i[%idx & 15].
1602  %i9 = and i32 %idx, 15
1603  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
1604  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
1605  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
1606  ret void
1607}
1608
; Kernel test: zero-fill (memset of 64 bytes) a [32 x i16] private
; (addrspace(5)) alloca that is declared after a [4096 x i32] (16384-byte)
; padding alloca.
1609define amdgpu_kernel void @zero_init_large_offset_kernel() {
1610; GFX9-LABEL: zero_init_large_offset_kernel:
1611; GFX9:       ; %bb.0:
1612; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1613; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1614; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1615; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1616; GFX9-NEXT:    s_waitcnt vmcnt(0)
1617; GFX9-NEXT:    s_mov_b32 s0, 0
1618; GFX9-NEXT:    s_mov_b32 s1, s0
1619; GFX9-NEXT:    s_mov_b32 s2, s0
1620; GFX9-NEXT:    s_mov_b32 s3, s0
1621; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1622; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1623; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1624; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1625; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1626; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1627; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1628; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1629; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1630; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1631; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
1632; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1633; GFX9-NEXT:    s_endpgm
1634;
1635; GFX10-LABEL: zero_init_large_offset_kernel:
1636; GFX10:       ; %bb.0:
1637; GFX10-NEXT:    s_add_u32 s0, s0, s3
1638; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1639; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1640; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1641; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1642; GFX10-NEXT:    s_waitcnt vmcnt(0)
1643; GFX10-NEXT:    s_mov_b32 s0, 0
1644; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1645; GFX10-NEXT:    s_mov_b32 s1, s0
1646; GFX10-NEXT:    s_mov_b32 s2, s0
1647; GFX10-NEXT:    s_mov_b32 s3, s0
1648; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1649; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1650; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1651; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1652; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1653; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1654; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1655; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1656; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1657; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
1658; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1659; GFX10-NEXT:    s_endpgm
1660;
1661; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1662; GFX9-PAL:       ; %bb.0:
1663; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1664; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1665; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1666; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1667; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1668; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1670; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1671; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1672; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
1673; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1674; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1675; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1676; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1677; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1678; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1679; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1680; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1681; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1682; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1683; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1684; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1685; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1686; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1687; GFX9-PAL-NEXT:    s_movk_i32 vcc_hi, 0x4010
1688; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1689; GFX9-PAL-NEXT:    s_endpgm
1690;
1691; GFX940-LABEL: zero_init_large_offset_kernel:
1692; GFX940:       ; %bb.0:
1693; GFX940-NEXT:    scratch_load_dword v0, off, off offset:16 sc0 sc1
1694; GFX940-NEXT:    s_waitcnt vmcnt(0)
1695; GFX940-NEXT:    s_mov_b32 s0, 0
1696; GFX940-NEXT:    s_mov_b32 s1, s0
1697; GFX940-NEXT:    s_mov_b32 s2, s0
1698; GFX940-NEXT:    s_mov_b32 s3, s0
1699; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1700; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1701; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1702; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1703; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1704; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1705; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1706; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1707; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4010
1708; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1709; GFX940-NEXT:    s_endpgm
1710;
1711; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
1712; GFX1010-PAL:       ; %bb.0:
1713; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1714; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1715; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1716; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1717; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1718; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1719; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1720; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1721; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1722; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
1723; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1724; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
1725; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1726; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1727; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1728; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1729; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1730; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1731; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1732; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1733; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1734; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1735; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1736; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1737; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1738; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1739; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1740; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1741; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1742; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1743; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1744; GFX1010-PAL-NEXT:    s_endpgm
1745;
1746; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
1747; GFX1030-PAL:       ; %bb.0:
1748; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1749; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1750; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1751; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1753; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1754; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1755; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1756; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1757; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
1758; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1759; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1760; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1761; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1762; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1763; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1764; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1765; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1766; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1767; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1768; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1769; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1770; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1771; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1772; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1773; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
1774; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1775; GFX1030-PAL-NEXT:    s_endpgm
  ; %padding is referenced only by the volatile load below (undef index).
  ; The expected code above loads it at offset 16 and writes %alloca through
  ; a base of 0x4010.
1776  %padding = alloca [4096 x i32], align 4, addrspace(5)
1777  %alloca = alloca [32 x i16], align 2, addrspace(5)
1778  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1779  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset; the
  ; expected code above does this with four 16-byte stores.
1780  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1781  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1782  ret void
1783}
1784
; Non-kernel test: zero-fill (memset of 64 bytes) a [32 x i16] private
; (addrspace(5)) alloca that is declared after a [4096 x i32] (16384-byte)
; padding alloca. The expected code addresses everything relative to s32 and
; returns with s_setpc_b64.
1785define void @zero_init_large_offset_foo() {
1786; GFX9-LABEL: zero_init_large_offset_foo:
1787; GFX9:       ; %bb.0:
1788; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1789; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
1790; GFX9-NEXT:    s_waitcnt vmcnt(0)
1791; GFX9-NEXT:    s_mov_b32 s0, 0
1792; GFX9-NEXT:    s_mov_b32 s1, s0
1793; GFX9-NEXT:    s_mov_b32 s2, s0
1794; GFX9-NEXT:    s_mov_b32 s3, s0
1795; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1796; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1797; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1798; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1799; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1800; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1801; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1802; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1803; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1804; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1805; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1806; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1807; GFX9-NEXT:    s_waitcnt vmcnt(0)
1808; GFX9-NEXT:    s_setpc_b64 s[30:31]
1809;
1810; GFX10-LABEL: zero_init_large_offset_foo:
1811; GFX10:       ; %bb.0:
1812; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1813; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1814; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1815; GFX10-NEXT:    s_waitcnt vmcnt(0)
1816; GFX10-NEXT:    s_mov_b32 s0, 0
1817; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1818; GFX10-NEXT:    s_mov_b32 s1, s0
1819; GFX10-NEXT:    s_mov_b32 s2, s0
1820; GFX10-NEXT:    s_mov_b32 s3, s0
1821; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1822; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1823; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1824; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1825; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1826; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1827; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1828; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1829; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1830; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1831; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1832; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1833; GFX10-NEXT:    s_setpc_b64 s[30:31]
1834;
1835; GFX9-PAL-LABEL: zero_init_large_offset_foo:
1836; GFX9-PAL:       ; %bb.0:
1837; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1838; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc
1839; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1840; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1841; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1842; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1843; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1844; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1845; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1846; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1847; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1848; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1849; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1850; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1851; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1852; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1853; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1854; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1855; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1856; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1857; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1858;
1859; GFX940-LABEL: zero_init_large_offset_foo:
1860; GFX940:       ; %bb.0:
1861; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1862; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:16 sc0 sc1
1863; GFX940-NEXT:    s_waitcnt vmcnt(0)
1864; GFX940-NEXT:    s_mov_b32 s0, 0
1865; GFX940-NEXT:    s_mov_b32 s1, s0
1866; GFX940-NEXT:    s_mov_b32 s2, s0
1867; GFX940-NEXT:    s_mov_b32 s3, s0
1868; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1869; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1870; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1871; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
1872; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1873; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
1874; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1875; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
1876; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4010
1877; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
1878; GFX940-NEXT:    s_waitcnt vmcnt(0)
1879; GFX940-NEXT:    s_setpc_b64 s[30:31]
1880;
1881; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
1882; GFX1010-PAL:       ; %bb.0:
1883; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1884; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1885; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1886; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1887; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1888; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1889; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1890; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1891; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1892; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1893; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1894; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1895; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1896; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1897; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1898; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1899; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1900; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1901; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1902; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1903; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
1904; GFX1010-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1905; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1906; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1907; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
1908;
1909; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
1910; GFX1030-PAL:       ; %bb.0:
1911; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1912; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1913; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:16 glc dlc
1914; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1915; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1916; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1917; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1918; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1919; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1920; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1921; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1922; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1923; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1924; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
1925; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1926; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
1927; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1928; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
1929; GFX1030-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4010
1930; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
1931; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1932; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; %padding is referenced only by the volatile load below (undef index).
  ; The expected code above loads it at s32 + 16 and writes %alloca through
  ; a base of s32 + 0x4010.
1933  %padding = alloca [4096 x i32], align 4, addrspace(5)
1934  %alloca = alloca [32 x i16], align 2, addrspace(5)
1935  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
1936  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Zero all 64 bytes of %alloca (32 x i16) with a non-volatile memset; the
  ; expected code above does this with four 16-byte stores.
1937  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1938  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1939  ret void
1940}
1941
1942define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
1943; GFX9-LABEL: store_load_sindex_large_offset_kernel:
1944; GFX9:       ; %bb.0: ; %bb
1945; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
1946; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
1947; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1948; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
1949; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1950; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1951; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1952; GFX9-NEXT:    s_and_b32 s0, s0, 15
1953; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1954; GFX9-NEXT:    s_addk_i32 s1, 0x4004
1955; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1956; GFX9-NEXT:    scratch_store_dword off, v0, s1
1957; GFX9-NEXT:    s_waitcnt vmcnt(0)
1958; GFX9-NEXT:    s_addk_i32 s0, 0x4004
1959; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1960; GFX9-NEXT:    s_waitcnt vmcnt(0)
1961; GFX9-NEXT:    s_endpgm
1962;
1963; GFX10-LABEL: store_load_sindex_large_offset_kernel:
1964; GFX10:       ; %bb.0: ; %bb
1965; GFX10-NEXT:    s_add_u32 s2, s2, s5
1966; GFX10-NEXT:    s_addc_u32 s3, s3, 0
1967; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1968; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1969; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
1970; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
1971; GFX10-NEXT:    s_waitcnt vmcnt(0)
1972; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1973; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1974; GFX10-NEXT:    s_and_b32 s1, s0, 15
1975; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1976; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1977; GFX10-NEXT:    s_addk_i32 s0, 0x4004
1978; GFX10-NEXT:    s_addk_i32 s1, 0x4004
1979; GFX10-NEXT:    scratch_store_dword off, v0, s0
1980; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1981; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1982; GFX10-NEXT:    s_waitcnt vmcnt(0)
1983; GFX10-NEXT:    s_endpgm
1984;
1985; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
1986; GFX9-PAL:       ; %bb.0: ; %bb
1987; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
1988; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
1989; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1990; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
1991; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
1992; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1993; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
1994; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
1995; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
1996; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
1997; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1998; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1999; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2000; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2001; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2002; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2003; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2004; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2005; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2006; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2007; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2008; GFX9-PAL-NEXT:    s_endpgm
2009;
2010; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2011; GFX940:       ; %bb.0: ; %bb
2012; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
2013; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2014; GFX940-NEXT:    s_waitcnt vmcnt(0)
2015; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2016; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2017; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2018; GFX940-NEXT:    s_and_b32 s0, s0, 15
2019; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2020; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2021; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2022; GFX940-NEXT:    s_waitcnt vmcnt(0)
2023; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2024; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2025; GFX940-NEXT:    s_waitcnt vmcnt(0)
2026; GFX940-NEXT:    s_endpgm
2027;
2028; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2029; GFX1010-PAL:       ; %bb.0: ; %bb
2030; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
2031; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
2032; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2033; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2034; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2035; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
2036; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
2037; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2038; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2039; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2040; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
2041; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
2042; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2043; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2044; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2045; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2046; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2047; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2048; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2049; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2050; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2051; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2052; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2053; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2054; GFX1010-PAL-NEXT:    s_endpgm
2055;
2056; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2057; GFX1030-PAL:       ; %bb.0: ; %bb
2058; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
2059; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
2060; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
2061; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2062; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
2063; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
2064; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
2065; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2066; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2067; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
2068; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2069; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2070; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2071; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2072; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2073; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2074; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2075; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2076; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2077; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2078; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2079; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2080; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2081; GFX1030-PAL-NEXT:    s_endpgm
2082bb:
2083  %padding = alloca [4096 x i32], align 4, addrspace(5)
2084  %i = alloca [32 x float], align 4, addrspace(5)
2085  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
2086  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
2087  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
2088  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
2089  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
2090  store volatile i32 15, i32 addrspace(5)* %i8, align 4
2091  %i9 = and i32 %idx, 15
2092  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
2093  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
2094  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
2095  ret void
2096}
2097
; Scratch store/load in an amdgpu_ps shader whose element index is passed
; inreg, with the indexed array placed behind a 16 KiB padding alloca
; (4096 x i32 = 0x4000 bytes). The expected code below adds 0x4004 to the
; scaled index to form each scratch address.
define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x4004
; GFX10-NEXT:    s_addk_i32 s0, 0x4004
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x4004
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  ; 16 KiB of padding allocated ahead of the array under test.
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding; the index is undef and the expected code
  ; above reads it at offset:4.
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to element %idx, addressed through an i32 view of the float slot.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from element (%idx & 15).
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2243
; Scratch store/load in a kernel where the element index is the workitem id,
; with the indexed array placed behind a 16 KiB padding alloca
; (4096 x i32 = 0x4000 bytes). The store targets element id and the load
; targets element (31 - id); the expected code below forms the load address
; as 0x4004 minus the scaled id with an immediate offset of 124 (31 * 4).
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  ; 16 KiB of padding allocated ahead of the array under test.
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding; the index is undef and the expected code
  ; above reads it at offset:4.
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to the element selected by the workitem id.
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from the mirrored element (31 - id).
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2379
; Scratch store/load in a callable (non-kernel) function whose element index
; is an ordinary i32 argument, with the indexed array placed behind a 16 KiB
; padding alloca (4096 x i32 = 0x4000 bytes). The expected code below
; addresses the array relative to s32 plus 0x4004.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; 16 KiB of padding allocated ahead of the array under test.
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of the padding; the index is undef and the expected code
  ; above reads it at offset:4 from s32.
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  ; Store 15 to element %idx, addressed through an i32 view of the float slot.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; Load back from element (%idx & 15).
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2498
; Scratch store/load in a kernel at a constant element index of a 4096 x i32
; array. Element 4000 is 16000 (0x3e80) bytes into the array, and the expected
; code below splits that distance between a register base and an immediate
; offset (for example 0x3000 + 3712, or 0x3800 + 1664).
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %buf = alloca [4096 x i32], align 4, addrspace(5)
  ; First volatile store goes to an undef element index; the expected code
  ; above writes it at offset:4.
  %slot.undef = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %slot.undef, align 4
  ; Volatile store to element 4000.
  %slot4000.st = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %slot4000.st, align 4
  ; Volatile reload of the same element through a separately computed address.
  %slot4000.ld = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 4000
  %reload = load volatile i32, i32 addrspace(5)* %slot4000.ld, align 4
  ret void
}
2626
; Callable-function variant of the large immediate offset test.
; A 4096 x i32 private array is written once through an undef index and then
; written and volatilely re-read at element 4000 (byte offset 16000), which is
; too large for a single scratch immediate, so the checks below show the
; offset being split between a register and the instruction's offset field.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}
2730
; Kernel that addresses a 32 x i32 private array with
; %sidx + workitem.id.x + 256, stores 15 there and volatilely reloads it.
; The checks below show the constant part of the index ending up in the
; instruction's immediate offset while the variable part is computed in a VGPR.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}
2839
; Volatile i64 store of 15 followed by a volatile i64 load through the same
; private (addrspace 5) pointer, both with natural 8-byte alignment.
; The checks below expect one dwordx2 store and one dwordx2 load per target.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}
2912
; Same volatile i64 store/load pair as the aligned variant, but the accesses
; are only declared "align 1". The checks below still expect a single dwordx2
; store and a single dwordx2 load, i.e. the access is not split into pieces.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}
2985
; Volatile "align 1" store of <1, 2, 3> followed by a volatile reload of the
; same <3 x i32> private location. The checks below expect a single dwordx3
; store and a single dwordx3 load on every target.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}
3064
; Volatile "align 1" store of <1, 2, 3, 4> followed by a volatile reload of the
; same <4 x i32> private location. The checks below expect a single dwordx4
; store and a single dwordx4 load on every target.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}
3149
; Byte-sized volatile store and reload at %arg - 1 (despite "i32" in the name,
; the accessed type is i8). The checks below show the -1 either added to the
; address VGPR up front or folded into the instruction as "offset:-1",
; depending on the target.
define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %addr, align 1
  %readback = load volatile i8, i8 addrspace(5)* %addr, align 1
  ret void
}
3223
; Byte-sized volatile store and reload at %arg - 4225 (the accessed type is i8
; even though the name says "i32"). The checks below show how each target
; materialises the displacement: entirely in an add or an SGPR, or split
; between an add and a negative immediate offset on the instruction.
define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_large_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_large_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_large_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  store volatile i8 1, i8 addrspace(5)* %addr, align 1
  %readback = load volatile i8, i8 addrspace(5)* %addr, align 1
  ret void
}
3299
3300define amdgpu_ps void @large_offset() {
3301; GFX9-LABEL: large_offset:
3302; GFX9:       ; %bb.0: ; %bb
3303; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
3304; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3305; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
3306; GFX9-NEXT:    v_mov_b32_e32 v1, v0
3307; GFX9-NEXT:    v_mov_b32_e32 v2, v0
3308; GFX9-NEXT:    v_mov_b32_e32 v3, v0
3309; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3310; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
3311; GFX9-NEXT:    s_waitcnt vmcnt(0)
3312; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
3313; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
3314; GFX9-NEXT:    s_waitcnt vmcnt(0)
3315; GFX9-NEXT:    v_mov_b32_e32 v0, 16
3316; GFX9-NEXT:    ;;#ASMSTART
3317; GFX9-NEXT:    ; use v0
3318; GFX9-NEXT:    ;;#ASMEND
3319; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
3320; GFX9-NEXT:    ;;#ASMSTART
3321; GFX9-NEXT:    ; use v0
3322; GFX9-NEXT:    ;;#ASMEND
3323; GFX9-NEXT:    s_endpgm
3324;
3325; GFX10-LABEL: large_offset:
3326; GFX10:       ; %bb.0: ; %bb
3327; GFX10-NEXT:    s_add_u32 s0, s0, s2
3328; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3329; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
3330; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
3331; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3332; GFX10-NEXT:    s_movk_i32 s0, 0x810
3333; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
3334; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3335; GFX10-NEXT:    v_mov_b32_e32 v2, v0
3336; GFX10-NEXT:    v_mov_b32_e32 v3, v0
3337; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
3338; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3339; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3340; GFX10-NEXT:    s_waitcnt vmcnt(0)
3341; GFX10-NEXT:    v_mov_b32_e32 v0, 16
3342; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
3343; GFX10-NEXT:    ;;#ASMSTART
3344; GFX10-NEXT:    ; use v0
3345; GFX10-NEXT:    ;;#ASMEND
3346; GFX10-NEXT:    ;;#ASMSTART
3347; GFX10-NEXT:    ; use v1
3348; GFX10-NEXT:    ;;#ASMEND
3349; GFX10-NEXT:    s_endpgm
3350;
3351; GFX9-PAL-LABEL: large_offset:
3352; GFX9-PAL:       ; %bb.0: ; %bb
3353; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
3354; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
3355; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3356; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
3357; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
3358; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
3359; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
3360; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3361; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3362; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
3363; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3364; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3365; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
3366; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3367; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
3368; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
3369; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3370; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
3371; GFX9-PAL-NEXT:    ;;#ASMSTART
3372; GFX9-PAL-NEXT:    ; use v0
3373; GFX9-PAL-NEXT:    ;;#ASMEND
3374; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
3375; GFX9-PAL-NEXT:    ;;#ASMSTART
3376; GFX9-PAL-NEXT:    ; use v0
3377; GFX9-PAL-NEXT:    ;;#ASMEND
3378; GFX9-PAL-NEXT:    s_endpgm
3379;
3380; GFX940-LABEL: large_offset:
3381; GFX940:       ; %bb.0: ; %bb
3382; GFX940-NEXT:    v_mov_b32_e32 v0, 0
3383; GFX940-NEXT:    v_mov_b32_e32 v1, v0
3384; GFX940-NEXT:    v_mov_b32_e32 v2, v0
3385; GFX940-NEXT:    v_mov_b32_e32 v3, v0
3386; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
3387; GFX940-NEXT:    s_waitcnt vmcnt(0)
3388; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
3389; GFX940-NEXT:    s_waitcnt vmcnt(0)
3390; GFX940-NEXT:    v_mov_b32_e32 v0, 16
3391; GFX940-NEXT:    ;;#ASMSTART
3392; GFX940-NEXT:    ; use v0
3393; GFX940-NEXT:    ;;#ASMEND
3394; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
3395; GFX940-NEXT:    ;;#ASMSTART
3396; GFX940-NEXT:    ; use v0
3397; GFX940-NEXT:    ;;#ASMEND
3398; GFX940-NEXT:    s_endpgm
3399;
3400; GFX10-PAL-LABEL: large_offset:
3401; GFX10-PAL:       ; %bb.0: ; %bb
3402; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
3403; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
3404; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3405; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3406; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3407; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
3408; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
3409; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3410; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3411; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
3412; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
3413; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
3414; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
3415; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
3416; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
3417; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
3418; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3419; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3420; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3421; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
3422; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
3423; GFX10-PAL-NEXT:    ;;#ASMSTART
3424; GFX10-PAL-NEXT:    ; use v0
3425; GFX10-PAL-NEXT:    ;;#ASMEND
3426; GFX10-PAL-NEXT:    ;;#ASMSTART
3427; GFX10-PAL-NEXT:    ; use v1
3428; GFX10-PAL-NEXT:    ;;#ASMEND
3429; GFX10-PAL-NEXT:    s_endpgm
3430bb:
3431  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
3432  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
3433  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
3434  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
3435  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
3436  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
3437  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
3438  ret void
3439}
3440
; Declarations of the intrinsics this test file refers to.
; NOTE(review): the call sites are outside this part of the file - presumably
; the earlier kernels/functions use them; confirm before removing either one.
;
; memset overload whose destination pointer is in addrspace(5) (the same
; address space as the allocas in this file) and whose length is an i64.
; The trailing i1 is marked immarg, so callers must pass a constant.
3441declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
; Takes no arguments and returns an i32.
3442declare i32 @llvm.amdgcn.workitem.id.x()
3443