1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
9
10; FIXME: Why is the constant moved into the intermediate register and
11; not just directly into the vector component?
12define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
13; SI-LABEL: insertelement_v4f32_0:
14; SI:       ; %bb.0:
15; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
16; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
17; SI-NEXT:    s_waitcnt lgkmcnt(0)
18; SI-NEXT:    s_mov_b32 s4, 0x40a00000
19; SI-NEXT:    s_mov_b32 s3, 0x100f000
20; SI-NEXT:    s_mov_b32 s2, -1
21; SI-NEXT:    v_mov_b32_e32 v0, s4
22; SI-NEXT:    v_mov_b32_e32 v1, s5
23; SI-NEXT:    v_mov_b32_e32 v2, s6
24; SI-NEXT:    v_mov_b32_e32 v3, s7
25; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: insertelement_v4f32_0:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
31; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
32; VI-NEXT:    s_waitcnt lgkmcnt(0)
33; VI-NEXT:    s_mov_b32 s4, 0x40a00000
34; VI-NEXT:    s_mov_b32 s3, 0x1100f000
35; VI-NEXT:    s_mov_b32 s2, -1
36; VI-NEXT:    v_mov_b32_e32 v0, s4
37; VI-NEXT:    v_mov_b32_e32 v1, s5
38; VI-NEXT:    v_mov_b32_e32 v2, s6
39; VI-NEXT:    v_mov_b32_e32 v3, s7
40; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
41; VI-NEXT:    s_endpgm
; Insert the constant 5.0 (0x40a00000) at constant lane 0: per the checks
; above, only s4 is overwritten before the vector is copied to VGPRs and
; stored as a single 128-bit buffer_store_dwordx4.
42  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
43  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
44  ret void
45}
46
47define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
48; SI-LABEL: insertelement_v4f32_1:
49; SI:       ; %bb.0:
50; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
51; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
52; SI-NEXT:    s_waitcnt lgkmcnt(0)
53; SI-NEXT:    s_mov_b32 s5, 0x40a00000
54; SI-NEXT:    s_mov_b32 s3, 0x100f000
55; SI-NEXT:    s_mov_b32 s2, -1
56; SI-NEXT:    v_mov_b32_e32 v0, s4
57; SI-NEXT:    v_mov_b32_e32 v1, s5
58; SI-NEXT:    v_mov_b32_e32 v2, s6
59; SI-NEXT:    v_mov_b32_e32 v3, s7
60; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
61; SI-NEXT:    s_endpgm
62;
63; VI-LABEL: insertelement_v4f32_1:
64; VI:       ; %bb.0:
65; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
66; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
67; VI-NEXT:    s_waitcnt lgkmcnt(0)
68; VI-NEXT:    s_mov_b32 s5, 0x40a00000
69; VI-NEXT:    s_mov_b32 s3, 0x1100f000
70; VI-NEXT:    s_mov_b32 s2, -1
71; VI-NEXT:    v_mov_b32_e32 v0, s4
72; VI-NEXT:    v_mov_b32_e32 v1, s5
73; VI-NEXT:    v_mov_b32_e32 v2, s6
74; VI-NEXT:    v_mov_b32_e32 v3, s7
75; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
76; VI-NEXT:    s_endpgm
; Same as the index-0 test, but lane 1: the checks show only s5 being
; overwritten with 0x40a00000 before the 128-bit store.
77  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
78  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
79  ret void
80}
81
82define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
83; SI-LABEL: insertelement_v4f32_2:
84; SI:       ; %bb.0:
85; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
86; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
87; SI-NEXT:    s_waitcnt lgkmcnt(0)
88; SI-NEXT:    s_mov_b32 s6, 0x40a00000
89; SI-NEXT:    s_mov_b32 s3, 0x100f000
90; SI-NEXT:    s_mov_b32 s2, -1
91; SI-NEXT:    v_mov_b32_e32 v0, s4
92; SI-NEXT:    v_mov_b32_e32 v1, s5
93; SI-NEXT:    v_mov_b32_e32 v2, s6
94; SI-NEXT:    v_mov_b32_e32 v3, s7
95; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
96; SI-NEXT:    s_endpgm
97;
98; VI-LABEL: insertelement_v4f32_2:
99; VI:       ; %bb.0:
100; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
101; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
102; VI-NEXT:    s_waitcnt lgkmcnt(0)
103; VI-NEXT:    s_mov_b32 s6, 0x40a00000
104; VI-NEXT:    s_mov_b32 s3, 0x1100f000
105; VI-NEXT:    s_mov_b32 s2, -1
106; VI-NEXT:    v_mov_b32_e32 v0, s4
107; VI-NEXT:    v_mov_b32_e32 v1, s5
108; VI-NEXT:    v_mov_b32_e32 v2, s6
109; VI-NEXT:    v_mov_b32_e32 v3, s7
110; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
111; VI-NEXT:    s_endpgm
; Constant-index insert at lane 2: only s6 is overwritten with 5.0 before
; the full vector is stored.
112  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
113  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
114  ret void
115}
116
117define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
118; SI-LABEL: insertelement_v4f32_3:
119; SI:       ; %bb.0:
120; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
121; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
122; SI-NEXT:    s_waitcnt lgkmcnt(0)
123; SI-NEXT:    s_mov_b32 s7, 0x40a00000
124; SI-NEXT:    s_mov_b32 s3, 0x100f000
125; SI-NEXT:    s_mov_b32 s2, -1
126; SI-NEXT:    v_mov_b32_e32 v0, s4
127; SI-NEXT:    v_mov_b32_e32 v1, s5
128; SI-NEXT:    v_mov_b32_e32 v2, s6
129; SI-NEXT:    v_mov_b32_e32 v3, s7
130; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
131; SI-NEXT:    s_endpgm
132;
133; VI-LABEL: insertelement_v4f32_3:
134; VI:       ; %bb.0:
135; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
136; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
137; VI-NEXT:    s_waitcnt lgkmcnt(0)
138; VI-NEXT:    s_mov_b32 s7, 0x40a00000
139; VI-NEXT:    s_mov_b32 s3, 0x1100f000
140; VI-NEXT:    s_mov_b32 s2, -1
141; VI-NEXT:    v_mov_b32_e32 v0, s4
142; VI-NEXT:    v_mov_b32_e32 v1, s5
143; VI-NEXT:    v_mov_b32_e32 v2, s6
144; VI-NEXT:    v_mov_b32_e32 v3, s7
145; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
146; VI-NEXT:    s_endpgm
; Constant-index insert at the last lane (3): only s7 is overwritten with
; 5.0 before the 128-bit store.
147  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
148  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
149  ret void
150}
151
152define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
153; SI-LABEL: insertelement_v4i32_0:
154; SI:       ; %bb.0:
155; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
156; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
157; SI-NEXT:    s_waitcnt lgkmcnt(0)
158; SI-NEXT:    s_movk_i32 s4, 0x3e7
159; SI-NEXT:    s_mov_b32 s3, 0x100f000
160; SI-NEXT:    s_mov_b32 s2, -1
161; SI-NEXT:    v_mov_b32_e32 v0, s4
162; SI-NEXT:    v_mov_b32_e32 v1, s5
163; SI-NEXT:    v_mov_b32_e32 v2, s6
164; SI-NEXT:    v_mov_b32_e32 v3, s7
165; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
166; SI-NEXT:    s_endpgm
167;
168; VI-LABEL: insertelement_v4i32_0:
169; VI:       ; %bb.0:
170; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
171; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    s_movk_i32 s4, 0x3e7
174; VI-NEXT:    s_mov_b32 s3, 0x1100f000
175; VI-NEXT:    s_mov_b32 s2, -1
176; VI-NEXT:    v_mov_b32_e32 v0, s4
177; VI-NEXT:    v_mov_b32_e32 v1, s5
178; VI-NEXT:    v_mov_b32_e32 v2, s6
179; VI-NEXT:    v_mov_b32_e32 v3, s7
180; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
181; VI-NEXT:    s_endpgm
; Integer variant: insert 999 (0x3e7, materialized with s_movk_i32) at
; lane 0 of the <4 x i32> before the 128-bit store.
182  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
183  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
184  ret void
185}
186
187define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
188; SI-LABEL: insertelement_v3f32_1:
189; SI:       ; %bb.0:
190; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
191; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
192; SI-NEXT:    s_mov_b32 s3, 0x100f000
193; SI-NEXT:    s_mov_b32 s2, -1
194; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
195; SI-NEXT:    s_waitcnt lgkmcnt(0)
196; SI-NEXT:    v_mov_b32_e32 v0, s4
197; SI-NEXT:    v_mov_b32_e32 v2, s6
198; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
199; SI-NEXT:    s_endpgm
200;
201; VI-LABEL: insertelement_v3f32_1:
202; VI:       ; %bb.0:
203; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
204; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
205; VI-NEXT:    s_mov_b32 s3, 0x1100f000
206; VI-NEXT:    s_mov_b32 s2, -1
207; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
208; VI-NEXT:    s_waitcnt lgkmcnt(0)
209; VI-NEXT:    v_mov_b32_e32 v0, s4
210; VI-NEXT:    v_mov_b32_e32 v2, s6
211; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
212; VI-NEXT:    s_endpgm
; <3 x float> insert at lane 1: the replaced lane is materialized directly
; into v1 (no SGPR copy for it) and the result is stored with a 96-bit
; buffer_store_dwordx3.
213  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
214  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
215  ret void
216}
217
218define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
219; SI-LABEL: insertelement_v3f32_2:
220; SI:       ; %bb.0:
221; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
222; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
223; SI-NEXT:    s_mov_b32 s3, 0x100f000
224; SI-NEXT:    s_mov_b32 s2, -1
225; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
226; SI-NEXT:    s_waitcnt lgkmcnt(0)
227; SI-NEXT:    v_mov_b32_e32 v0, s4
228; SI-NEXT:    v_mov_b32_e32 v1, s5
229; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
230; SI-NEXT:    s_endpgm
231;
232; VI-LABEL: insertelement_v3f32_2:
233; VI:       ; %bb.0:
234; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
235; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
236; VI-NEXT:    s_mov_b32 s3, 0x1100f000
237; VI-NEXT:    s_mov_b32 s2, -1
238; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
239; VI-NEXT:    s_waitcnt lgkmcnt(0)
240; VI-NEXT:    v_mov_b32_e32 v0, s4
241; VI-NEXT:    v_mov_b32_e32 v1, s5
242; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
243; VI-NEXT:    s_endpgm
; <3 x float> insert at the last lane (2): 5.0 goes straight into v2 and
; the 96-bit dwordx3 store writes the result.
244  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
245  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
246  ret void
247}
248
249define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
250; GCN-LABEL: insertelement_v3f32_3:
251; GCN-NEXT:    s_endpgm
; Index 3 is out of bounds for <3 x float>, so the insert result is poison
; and the whole computation (including the store) is folded away: the
; checks show only the program end.
253  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
254  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
255  ret void
256}
257
258define <4 x float> @insertelement_to_sgpr() nounwind {
259; GCN-LABEL: insertelement_to_sgpr:
260; GCN:       ; %bb.0:
261; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
263; GCN-NEXT:    s_waitcnt lgkmcnt(0)
264; GCN-NEXT:    s_mov_b32 s12, 0
265; GCN-NEXT:    s_mov_b32 s4, s12
266; GCN-NEXT:    s_mov_b32 s5, s12
267; GCN-NEXT:    s_mov_b32 s6, s12
268; GCN-NEXT:    s_mov_b32 s7, s12
269; GCN-NEXT:    s_mov_b32 s8, s12
270; GCN-NEXT:    s_mov_b32 s9, s12
271; GCN-NEXT:    s_mov_b32 s10, s12
272; GCN-NEXT:    s_mov_b32 s11, s12
273; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
274; GCN-NEXT:    s_waitcnt vmcnt(0)
275; GCN-NEXT:    s_setpc_b64 s[30:31]
; The inserted vector is consumed by an image intrinsic whose sampler
; operand must live in SGPRs, so the insert (zeroing element 0) is done
; with scalar moves instead of going through VGPRs.
276  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
277  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
278  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
279  ret <4 x float> %tmp2
280}
281
282define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
283; SI-LABEL: dynamic_insertelement_v2f32:
284; SI:       ; %bb.0:
285; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
286; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
287; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
288; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
289; SI-NEXT:    s_mov_b32 s3, 0x100f000
290; SI-NEXT:    s_mov_b32 s2, -1
291; SI-NEXT:    s_waitcnt lgkmcnt(0)
292; SI-NEXT:    v_mov_b32_e32 v1, s7
293; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
294; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
295; SI-NEXT:    v_mov_b32_e32 v2, s6
296; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
297; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
298; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
299; SI-NEXT:    s_endpgm
300;
301; VI-LABEL: dynamic_insertelement_v2f32:
302; VI:       ; %bb.0:
303; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
304; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
305; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
306; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
307; VI-NEXT:    s_mov_b32 s3, 0x1100f000
308; VI-NEXT:    s_mov_b32 s2, -1
309; VI-NEXT:    s_waitcnt lgkmcnt(0)
310; VI-NEXT:    v_mov_b32_e32 v1, s7
311; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
312; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
313; VI-NEXT:    v_mov_b32_e32 v2, s6
314; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
315; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
316; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
317; VI-NEXT:    s_endpgm
; Variable index %b: each lane is selected with a compare-against-index
; (v_cmp_ne_u32) plus v_cndmask between the original element and 5.0,
; instead of a movrel/indexed write.
318  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
319  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
320  ret void
321}
322
323define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
324; SI-LABEL: dynamic_insertelement_v3f32:
325; SI:       ; %bb.0:
326; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
327; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
328; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
329; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
330; SI-NEXT:    s_mov_b32 s3, 0x100f000
331; SI-NEXT:    s_mov_b32 s2, -1
332; SI-NEXT:    s_waitcnt lgkmcnt(0)
333; SI-NEXT:    v_mov_b32_e32 v1, s10
334; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
335; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
336; SI-NEXT:    v_mov_b32_e32 v1, s9
337; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
338; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
339; SI-NEXT:    v_mov_b32_e32 v3, s8
340; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
341; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
342; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
343; SI-NEXT:    s_endpgm
344;
345; VI-LABEL: dynamic_insertelement_v3f32:
346; VI:       ; %bb.0:
347; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
348; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
349; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
350; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
351; VI-NEXT:    s_mov_b32 s3, 0x1100f000
352; VI-NEXT:    s_mov_b32 s2, -1
353; VI-NEXT:    s_waitcnt lgkmcnt(0)
354; VI-NEXT:    v_mov_b32_e32 v1, s10
355; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
356; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
357; VI-NEXT:    v_mov_b32_e32 v1, s9
358; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
359; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
360; VI-NEXT:    v_mov_b32_e32 v3, s8
361; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
362; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
363; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
364; VI-NEXT:    s_endpgm
; Variable index into <3 x float>: three compare+cndmask selects (one per
; lane, highest first), then a 96-bit dwordx3 store.
365  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
366  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
367  ret void
368}
369
370define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
371; SI-LABEL: dynamic_insertelement_v4f32:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
374; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
375; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
376; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
377; SI-NEXT:    s_mov_b32 s3, 0x100f000
378; SI-NEXT:    s_mov_b32 s2, -1
379; SI-NEXT:    s_waitcnt lgkmcnt(0)
380; SI-NEXT:    v_mov_b32_e32 v1, s11
381; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
382; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
383; SI-NEXT:    v_mov_b32_e32 v1, s10
384; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
385; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
386; SI-NEXT:    v_mov_b32_e32 v1, s9
387; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
388; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
389; SI-NEXT:    v_mov_b32_e32 v4, s8
390; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
391; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
392; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
393; SI-NEXT:    s_endpgm
394;
395; VI-LABEL: dynamic_insertelement_v4f32:
396; VI:       ; %bb.0:
397; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
398; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
399; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
400; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
401; VI-NEXT:    s_mov_b32 s3, 0x1100f000
402; VI-NEXT:    s_mov_b32 s2, -1
403; VI-NEXT:    s_waitcnt lgkmcnt(0)
404; VI-NEXT:    v_mov_b32_e32 v1, s11
405; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
406; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
407; VI-NEXT:    v_mov_b32_e32 v1, s10
408; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
409; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
410; VI-NEXT:    v_mov_b32_e32 v1, s9
411; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
412; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
413; VI-NEXT:    v_mov_b32_e32 v4, s8
414; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
415; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
416; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
417; VI-NEXT:    s_endpgm
; Variable index into <4 x float>: four compare+cndmask selects (lane 3
; down to lane 0), then a single 128-bit store.
418  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
419  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
420  ret void
421}
422
423define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
424; SI-LABEL: dynamic_insertelement_v8f32:
425; SI:       ; %bb.0:
426; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
427; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
428; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
429; SI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
430; SI-NEXT:    s_mov_b32 s3, 0x100f000
431; SI-NEXT:    s_mov_b32 s2, -1
432; SI-NEXT:    s_waitcnt lgkmcnt(0)
433; SI-NEXT:    v_mov_b32_e32 v0, s11
434; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
435; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
436; SI-NEXT:    v_mov_b32_e32 v0, s10
437; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
438; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
439; SI-NEXT:    v_mov_b32_e32 v0, s9
440; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
441; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
442; SI-NEXT:    v_mov_b32_e32 v0, s8
443; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
444; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
445; SI-NEXT:    v_mov_b32_e32 v5, s15
446; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
447; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
448; SI-NEXT:    v_mov_b32_e32 v5, s14
449; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
450; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
451; SI-NEXT:    v_mov_b32_e32 v5, s13
452; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
453; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
454; SI-NEXT:    v_mov_b32_e32 v8, s12
455; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
456; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
457; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
458; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
459; SI-NEXT:    s_endpgm
460;
461; VI-LABEL: dynamic_insertelement_v8f32:
462; VI:       ; %bb.0:
463; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
464; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
465; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
466; VI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
467; VI-NEXT:    s_mov_b32 s3, 0x1100f000
468; VI-NEXT:    s_mov_b32 s2, -1
469; VI-NEXT:    s_waitcnt lgkmcnt(0)
470; VI-NEXT:    v_mov_b32_e32 v0, s11
471; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
472; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
473; VI-NEXT:    v_mov_b32_e32 v0, s10
474; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
475; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
476; VI-NEXT:    v_mov_b32_e32 v0, s9
477; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
478; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
479; VI-NEXT:    v_mov_b32_e32 v0, s8
480; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
481; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
482; VI-NEXT:    v_mov_b32_e32 v5, s15
483; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
484; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
485; VI-NEXT:    v_mov_b32_e32 v5, s14
486; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
487; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
488; VI-NEXT:    v_mov_b32_e32 v5, s13
489; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
490; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
491; VI-NEXT:    v_mov_b32_e32 v8, s12
492; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
493; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
494; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
495; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
496; VI-NEXT:    s_endpgm
; Variable index into <8 x float>: eight compare+cndmask selects, result
; written back as two 128-bit stores (high half at offset:16).
497  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
498  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
499  ret void
500}
501
502define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
503; SI-LABEL: dynamic_insertelement_v16f32:
504; SI:       ; %bb.0:
505; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
506; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
507; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
508; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
509; SI-NEXT:    s_mov_b32 s3, 0x100f000
510; SI-NEXT:    s_mov_b32 s2, -1
511; SI-NEXT:    s_waitcnt lgkmcnt(0)
512; SI-NEXT:    v_mov_b32_e32 v0, s8
513; SI-NEXT:    v_mov_b32_e32 v1, s9
514; SI-NEXT:    v_mov_b32_e32 v2, s10
515; SI-NEXT:    v_mov_b32_e32 v3, s11
516; SI-NEXT:    v_mov_b32_e32 v4, s12
517; SI-NEXT:    v_mov_b32_e32 v5, s13
518; SI-NEXT:    v_mov_b32_e32 v6, s14
519; SI-NEXT:    v_mov_b32_e32 v7, s15
520; SI-NEXT:    v_mov_b32_e32 v8, s16
521; SI-NEXT:    v_mov_b32_e32 v9, s17
522; SI-NEXT:    v_mov_b32_e32 v10, s18
523; SI-NEXT:    v_mov_b32_e32 v11, s19
524; SI-NEXT:    v_mov_b32_e32 v12, s20
525; SI-NEXT:    v_mov_b32_e32 v13, s21
526; SI-NEXT:    v_mov_b32_e32 v14, s22
527; SI-NEXT:    v_mov_b32_e32 v15, s23
528; SI-NEXT:    s_mov_b32 m0, s4
529; SI-NEXT:    v_movreld_b32_e32 v0, v16
530; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
531; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
532; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
533; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
534; SI-NEXT:    s_endpgm
535;
536; VI-LABEL: dynamic_insertelement_v16f32:
537; VI:       ; %bb.0:
538; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
539; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
540; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
541; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
542; VI-NEXT:    s_mov_b32 s3, 0x1100f000
543; VI-NEXT:    s_mov_b32 s2, -1
544; VI-NEXT:    s_waitcnt lgkmcnt(0)
545; VI-NEXT:    v_mov_b32_e32 v0, s8
546; VI-NEXT:    v_mov_b32_e32 v1, s9
547; VI-NEXT:    v_mov_b32_e32 v2, s10
548; VI-NEXT:    v_mov_b32_e32 v3, s11
549; VI-NEXT:    v_mov_b32_e32 v4, s12
550; VI-NEXT:    v_mov_b32_e32 v5, s13
551; VI-NEXT:    v_mov_b32_e32 v6, s14
552; VI-NEXT:    v_mov_b32_e32 v7, s15
553; VI-NEXT:    v_mov_b32_e32 v8, s16
554; VI-NEXT:    v_mov_b32_e32 v9, s17
555; VI-NEXT:    v_mov_b32_e32 v10, s18
556; VI-NEXT:    v_mov_b32_e32 v11, s19
557; VI-NEXT:    v_mov_b32_e32 v12, s20
558; VI-NEXT:    v_mov_b32_e32 v13, s21
559; VI-NEXT:    v_mov_b32_e32 v14, s22
560; VI-NEXT:    v_mov_b32_e32 v15, s23
561; VI-NEXT:    s_mov_b32 m0, s4
562; VI-NEXT:    v_movreld_b32_e32 v0, v16
563; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
564; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
565; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
566; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
567; VI-NEXT:    s_endpgm
; At 16 elements the cndmask-per-lane strategy is abandoned: the vector is
; copied to v0-v15, the index goes into m0, and a single v_movreld_b32
; performs the indexed register write before four 128-bit stores.
568  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
569  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
570  ret void
571}
572
573define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
574; SI-LABEL: dynamic_insertelement_v2i32:
575; SI:       ; %bb.0:
576; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
577; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
578; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
579; SI-NEXT:    s_mov_b32 s3, 0x100f000
580; SI-NEXT:    s_mov_b32 s2, -1
581; SI-NEXT:    s_waitcnt lgkmcnt(0)
582; SI-NEXT:    v_mov_b32_e32 v0, s7
583; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
584; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
585; SI-NEXT:    v_mov_b32_e32 v0, s6
586; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
587; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
588; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
589; SI-NEXT:    s_endpgm
590;
591; VI-LABEL: dynamic_insertelement_v2i32:
592; VI:       ; %bb.0:
593; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
594; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
595; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
596; VI-NEXT:    s_mov_b32 s3, 0x1100f000
597; VI-NEXT:    s_mov_b32 s2, -1
598; VI-NEXT:    s_waitcnt lgkmcnt(0)
599; VI-NEXT:    v_mov_b32_e32 v0, s7
600; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
601; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
602; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
604; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
605; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
606; VI-NEXT:    s_endpgm
; Integer variant of the dynamic v2 insert: the value 5 fits in an inline
; immediate, so the cndmask uses the literal directly (no scratch VGPR
; holding the constant, unlike the f32 tests).
607  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
608  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
609  ret void
610}
611
612define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
613; SI-LABEL: dynamic_insertelement_v3i32:
614; SI:       ; %bb.0:
615; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
616; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
617; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
618; SI-NEXT:    s_mov_b32 s3, 0x100f000
619; SI-NEXT:    s_mov_b32 s2, -1
620; SI-NEXT:    s_waitcnt lgkmcnt(0)
621; SI-NEXT:    v_mov_b32_e32 v0, s10
622; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
623; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
624; SI-NEXT:    v_mov_b32_e32 v0, s9
625; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
626; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
627; SI-NEXT:    v_mov_b32_e32 v0, s8
628; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
629; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
630; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
631; SI-NEXT:    s_endpgm
632;
633; VI-LABEL: dynamic_insertelement_v3i32:
634; VI:       ; %bb.0:
635; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
636; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
637; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
638; VI-NEXT:    s_mov_b32 s3, 0x1100f000
639; VI-NEXT:    s_mov_b32 s2, -1
640; VI-NEXT:    s_waitcnt lgkmcnt(0)
641; VI-NEXT:    v_mov_b32_e32 v0, s10
642; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
643; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
644; VI-NEXT:    v_mov_b32_e32 v0, s9
645; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
646; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
647; VI-NEXT:    v_mov_b32_e32 v0, s8
648; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
649; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
650; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
651; VI-NEXT:    s_endpgm
; Dynamic insert of inline-immediate 5 into <3 x i32>: three per-lane
; compare+cndmask selects, then a 96-bit dwordx3 store.
652  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
653  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
654  ret void
655}
656
657define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
658; SI-LABEL: dynamic_insertelement_v4i32:
659; SI:       ; %bb.0:
660; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
661; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
662; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
663; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
664; SI-NEXT:    s_mov_b32 s3, 0x100f000
665; SI-NEXT:    s_mov_b32 s2, -1
666; SI-NEXT:    s_waitcnt lgkmcnt(0)
667; SI-NEXT:    v_mov_b32_e32 v0, s11
668; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
669; SI-NEXT:    v_mov_b32_e32 v4, s4
670; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
671; SI-NEXT:    v_mov_b32_e32 v0, s10
672; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
673; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
674; SI-NEXT:    v_mov_b32_e32 v0, s9
675; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
676; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
677; SI-NEXT:    v_mov_b32_e32 v0, s8
678; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
679; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
680; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
681; SI-NEXT:    s_endpgm
682;
683; VI-LABEL: dynamic_insertelement_v4i32:
684; VI:       ; %bb.0:
685; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
686; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
687; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
688; VI-NEXT:    s_load_dword s4, s[4:5], 0x44
689; VI-NEXT:    s_mov_b32 s3, 0x1100f000
690; VI-NEXT:    s_mov_b32 s2, -1
691; VI-NEXT:    s_waitcnt lgkmcnt(0)
692; VI-NEXT:    v_mov_b32_e32 v0, s11
693; VI-NEXT:    v_mov_b32_e32 v4, s4
694; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
695; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
696; VI-NEXT:    v_mov_b32_e32 v0, s10
697; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
698; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
699; VI-NEXT:    v_mov_b32_e32 v0, s9
700; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
701; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
702; VI-NEXT:    v_mov_b32_e32 v0, s8
703; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
704; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
705; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
706; VI-NEXT:    s_endpgm
; Both the index (%b) and the inserted value (%val) are variable; the
; [8 x i32] padding argument pushes %val to a distinct kernarg offset.
; The value is copied once into v4 and each lane selects it with
; v_cmp_eq + cndmask (operand order flipped vs. the constant-value tests).
707  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
708  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
709  ret void
710}
711
712define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
713; SI-LABEL: dynamic_insertelement_v8i32:
714; SI:       ; %bb.0:
715; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
716; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
717; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
718; SI-NEXT:    s_mov_b32 s3, 0x100f000
719; SI-NEXT:    s_mov_b32 s2, -1
720; SI-NEXT:    s_waitcnt lgkmcnt(0)
721; SI-NEXT:    v_mov_b32_e32 v0, s11
722; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
723; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
724; SI-NEXT:    v_mov_b32_e32 v0, s10
725; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
726; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
727; SI-NEXT:    v_mov_b32_e32 v0, s9
728; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
729; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
730; SI-NEXT:    v_mov_b32_e32 v0, s8
731; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
732; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
733; SI-NEXT:    v_mov_b32_e32 v4, s15
734; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
735; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
736; SI-NEXT:    v_mov_b32_e32 v4, s14
737; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
738; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
739; SI-NEXT:    v_mov_b32_e32 v4, s13
740; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
741; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
742; SI-NEXT:    v_mov_b32_e32 v4, s12
743; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
744; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
745; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
746; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
747; SI-NEXT:    s_endpgm
748;
749; VI-LABEL: dynamic_insertelement_v8i32:
750; VI:       ; %bb.0:
751; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
752; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
753; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
754; VI-NEXT:    s_mov_b32 s3, 0x1100f000
755; VI-NEXT:    s_mov_b32 s2, -1
756; VI-NEXT:    s_waitcnt lgkmcnt(0)
757; VI-NEXT:    v_mov_b32_e32 v0, s11
758; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
759; VI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
760; VI-NEXT:    v_mov_b32_e32 v0, s10
761; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
762; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
763; VI-NEXT:    v_mov_b32_e32 v0, s9
764; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
765; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
766; VI-NEXT:    v_mov_b32_e32 v0, s8
767; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
768; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
769; VI-NEXT:    v_mov_b32_e32 v4, s15
770; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
771; VI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
772; VI-NEXT:    v_mov_b32_e32 v4, s14
773; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
774; VI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
775; VI-NEXT:    v_mov_b32_e32 v4, s13
776; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
777; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
778; VI-NEXT:    v_mov_b32_e32 v4, s12
779; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
780; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
781; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
782; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
783; VI-NEXT:    s_endpgm
; Dynamic insert of inline-immediate 5 into <8 x i32>: eight per-lane
; compare+cndmask selects, result written as two 128-bit stores (high
; half first at offset:16).
784  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
785  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
786  ret void
787}
788
; Dynamic insert of constant 5 into <16 x i32>. Unlike the v8i32 case above,
; the checks show this uses the indirect-register write path: the vector is
; moved into v0-v15 and a single v_movreld_b32 indexed by m0 does the insert.
define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 5
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 5
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}
857
; Dynamic insert of i16 5 into <2 x i16>. The vector fits in one dword, so the
; checks show a bitfield insert: a 0xffff mask shifted into position by the
; index (scaled by 16 via s_lshl_b32 ... 4) feeds v_bfi_b32 against the
; splatted value 0x50005, followed by a single dword store.
define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x3
; SI-NEXT:    v_mov_b32_e32 v0, 0x50005
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_lshl_b32 s4, s4, 4
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0xc
; VI-NEXT:    v_mov_b32_e32 v0, 0x50005
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_lshl_b32 s4, s4, 4
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}
894
; Dynamic insert of i16 5 into <3 x i16> (48 bits spanning two dwords). The
; target-specific codegen differs: SI builds a 64-bit mask with s_lshl_b64 and
; does the insert with scalar andn2/or, while VI does two 32-bit v_bfi_b32
; selects. Both store the result as a short (offset:4) plus a dword.
define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s5, 0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s8, s4, 4
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT:    s_mov_b32 s8, 0x50005
; SI-NEXT:    s_and_b32 s9, s5, s8
; SI-NEXT:    s_and_b32 s8, s4, s8
; SI-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s5, 0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    s_lshl_b32 s8, s4, 4
; VI-NEXT:    s_mov_b32 s4, 0xffff
; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT:    s_mov_b32 s8, 0x50005
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_bfi_b32 v0, s5, v0, v1
; VI-NEXT:    v_mov_b32_e32 v1, s8
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_bfi_b32 v1, s4, v1, v2
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}
945
; Dynamic insert of i8 5 into <2 x i8>. The [8 x i32] padding fields push the
; real arguments apart in the kernarg segment (hence the 0xa/0x13 dword
; offsets on SI). SI uses v_bfi_b32 with a shifted all-ones mask; VI uses
; 16-bit shift/and/xor/or ops. Result is a single 16-bit store.
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    v_mov_b32_e32 v0, 0x505
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, -1, s4
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
; VI-NEXT:    v_and_b32_e32 v1, 0x505, v0
; VI-NEXT:    v_xor_b32_e32 v0, -1, v0
; VI-NEXT:    v_and_b32_e32 v0, s6, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}
983
984; FIXME: post legalize i16 and i32 shifts aren't merged because of
985; isTypeDesirableForOp in SimplifyDemandedBits
; Dynamic insert of i8 5 into <3 x i8> (24 bits in one dword). Both targets
; use v_bfi_b32 with a shifted mask against the splat 0x5050505, then split
; the store into a short plus the high byte (offset:2).
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    v_mov_b32_e32 v0, 0x5050505
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    v_mov_b32_e32 v0, 0x5050505
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
1026
; Dynamic insert of i8 5 into <4 x i8> (one full dword). Same v_bfi_b32
; shifted-mask pattern as the v3i8 case, but here the result is a single
; aligned dword store on both targets.
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
; SI-NEXT:    v_mov_b32_e32 v0, 0x5050505
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
; VI-NEXT:    v_mov_b32_e32 v0, 0x5050505
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}
1063
; Dynamic insert of i8 5 into a <8 x i8> loaded from a constant (addrspace 4)
; pointer, so the whole vector lives in SGPRs. The insert stays scalar: a
; 64-bit mask built with s_lshl_b64 is applied via s_andn2_b64/s_or_b64
; against the splat 0x5050505, then stored as a single dwordx2.
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s7, 0
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_lshl_b32 s8, s6, 3
; SI-NEXT:    s_mov_b32 s6, 0xffff
; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT:    s_mov_b32 s8, 0x5050505
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_and_b32 s9, s7, s8
; SI-NEXT:    s_and_b32 s8, s6, s8
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s7, 0
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_lshl_b32 s8, s6, 3
; VI-NEXT:    s_mov_b32 s6, 0xffff
; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT:    s_mov_b32 s8, 0x5050505
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    s_and_b32 s9, s7, s8
; VI-NEXT:    s_and_b32 s8, s6, s8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}
1119
; Dynamic insert of i8 5 into <16 x i8>. The vector is fully scalarized: each
; byte is extracted with s_lshr_b32, conditionally replaced via
; v_cmp_ne_u32 + v_cndmask, and the bytes are repacked (shift/and/or on SI,
; v_or_b32_sdwa on VI) into four dwords for one dwordx4 store. This is the
; per-element expansion the FIXME at the top of the file refers to.
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s5, s11, 24
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
; SI-NEXT:    s_lshr_b32 s5, s11, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    s_movk_i32 s5, 0xff
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s6, s11, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    s_mov_b32 s6, 0xffff
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s10, 24
; SI-NEXT:    v_or_b32_e32 v3, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
; SI-NEXT:    s_lshr_b32 s7, s10, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s7, s10, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v2, s5, v2
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s9, 24
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    s_lshr_b32 s7, s9, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s5, v1
; SI-NEXT:    s_lshr_b32 s7, s9, 8
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, s6, v1
; SI-NEXT:    s_lshr_b32 s7, s8, 24
; SI-NEXT:    v_or_b32_e32 v1, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    s_lshr_b32 s7, s8, 16
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v4, s5, v4
; SI-NEXT:    s_lshr_b32 s7, s8, 8
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT:    v_and_b32_e32 v5, s5, v5
; SI-NEXT:    v_or_b32_e32 v4, v5, v4
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v4, s6, v4
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s11, 24
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
; VI-NEXT:    s_lshr_b32 s5, s11, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s11, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s10, 24
; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
; VI-NEXT:    s_lshr_b32 s5, s10, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s10, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s9, 24
; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT:    s_lshr_b32 s5, s9, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s9, 8
; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    s_lshr_b32 s5, s8, 24
; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    s_lshr_b32 s5, s8, 16
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT:    s_lshr_b32 s5, s8, 8
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}
1324
1325; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1326; the compiler doesn't crash.
; Insertelement whose second element comes from a phi across an if/else
; diamond. Primarily a no-crash test for INSERT_SUBREG handling in
; SIFixSGPRCopies (see comment above); the checks pin the branchy scalar
; lowering with s_cbranch and a dwordx2 store in the join block.
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
; SI-LABEL: insert_split_bb:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s0, s[4:5], 0x4
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s0, 0
; SI-NEXT:    s_cbranch_scc0 BB26_2
; SI-NEXT:  ; %bb.1: ; %else
; SI-NEXT:    s_load_dword s1, s[6:7], 0x1
; SI-NEXT:    s_mov_b64 s[2:3], 0
; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 vcc, vcc
; SI-NEXT:    s_cbranch_vccz BB26_3
; SI-NEXT:    s_branch BB26_4
; SI-NEXT:  BB26_2:
; SI-NEXT:    s_mov_b64 s[2:3], -1
; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT:    s_cbranch_vccnz BB26_4
; SI-NEXT:  BB26_3: ; %if
; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
; SI-NEXT:  BB26_4: ; %endif
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    s_mov_b32 s7, 0x100f000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insert_split_bb:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s0, s[4:5], 0x10
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cbranch_scc0 BB26_2
; VI-NEXT:  ; %bb.1: ; %else
; VI-NEXT:    s_load_dword s1, s[6:7], 0x4
; VI-NEXT:    s_mov_b64 s[2:3], 0
; VI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT:    s_cbranch_vccz BB26_3
; VI-NEXT:    s_branch BB26_4
; VI-NEXT:  BB26_2:
; VI-NEXT:    s_mov_b64 s[2:3], -1
; VI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT:    s_cbranch_vccnz BB26_4
; VI-NEXT:  BB26_3: ; %if
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
; VI-NEXT:  BB26_4: ; %endif
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_mov_b32 s7, 0x1100f000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}
1407
; Dynamic insert of double 8.0 into <2 x double>. Each 64-bit element is
; handled as two 32-bit halves: v_cmp_eq_u32 on the index selects between the
; original half and the constant (0x40200000 high / 0 low) via v_cndmask,
; stored with a single dwordx4.
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xc
; SI-NEXT:    s_load_dword s4, s[4:5], 0x18
; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x30
; VI-NEXT:    s_load_dword s4, s[4:5], 0x60
; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}
1456
; Dynamic insert of i64 5 into <2 x i64>. Same split-into-dwords pattern as
; the v2f64 case, but the compare results live in an SGPR pair (s[4:5]) via
; the _e64 forms and the inserted halves are the inline constants 5 (low)
; and 0 (high).
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
1503
; Insert the constant i64 5 at a variable index into a <3 x i64>.  Same
; cndmask-per-dword expansion as the v2i64 case, but with three compares
; (indices 2, 1, 0).  The 24-byte result is stored as a dwordx2 (elements 2)
; plus a dwordx4 (elements 0-1).
; NOTE: the CHECK lines below are autogenerated by update_llc_test_checks.py;
; do not edit them by hand.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 2
; SI-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s13
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 2
; VI-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b ; variable index -> cndmask expansion per dword
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
1562
; Insert the constant double 8.0 at a variable index into a <4 x double>.
; 8.0 is 0x4020000000000000, so for a matching lane the high dword selects
; v4 = 0x40200000 (via cndmask_e32/vcc) and the low dword selects the inline
; immediate 0 (via cndmask_e64).  Four compares cover indices 1, 0, 3, 2, and
; the result is written with two dwordx4 stores.
; NOTE: the CHECK lines below are autogenerated by update_llc_test_checks.py;
; do not edit them by hand.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b ; 8.0 = 0x4020000000000000
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
1633
; Insert the constant double 8.0 at a variable index into an <8 x double>.
; Unlike the smaller vectors above, 8 elements switches codegen from a
; cndmask chain to indirect register writes: the index is shifted left by 1
; (dwords per double), moved into m0, and two v_movreld writes store the low
; dword (inline 0) and the high dword (v16 = 0x40200000) of 8.0 into the
; selected lane.  The 64-byte result is written with four dwordx4 stores.
; NOTE: the CHECK lines below are autogenerated by update_llc_test_checks.py;
; do not edit them by hand.
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_lshl_b32 s4, s4, 1
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, 0
; SI-NEXT:    v_movreld_b32_e32 v1, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_lshl_b32 s4, s4, 1
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, 0
; VI-NEXT:    v_movreld_b32_e32 v1, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b ; large vector -> m0 + v_movreld indirect write
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
1708
1709declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1710
1711attributes #0 = { nounwind }
1712attributes #1 = { nounwind readnone }
1713