1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8 and 16 vectors are being stored as
7; individual elements instead of 128-bit stores.
8
9
10; FIXME: Why is the constant moved into the intermediate register and
11; not just directly into the vector component?
; Insert the immediate 5.0 (0x40a00000) into element 0 of a <4 x float> kernel
; argument, then store the vector with a single 128-bit buffer_store_dwordx4.
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s8, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s8, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
48
; Same as insertelement_v4f32_0 but inserting 5.0 into element 1 (v1 is
; overwritten with s8 before the store).
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s8, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v1, s8
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s8, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_mov_b32_e32 v1, s8
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
85
; Insert 5.0 into element 2 of a <4 x float>; v2 is overwritten with s8
; before the 128-bit store.
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s8, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v2, s8
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s8, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_mov_b32_e32 v2, s8
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
122
; Insert 5.0 into element 3 of a <4 x float>; v3 is overwritten with s8
; before the 128-bit store.
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s8, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s8
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4f32_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s8, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s8
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
159
; Integer variant: insert i32 999 (0x3e7, materialized with s_movk_i32) into
; element 0 of a <4 x i32> and store with one buffer_store_dwordx4.
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0x3e7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v4i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0x3e7
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
194
; Insert 5.0 into element 1 of a <3 x float>; the constant goes straight into
; v1 and the result is stored with a single buffer_store_dwordx3.
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
225
; Insert 5.0 into element 2 of a <3 x float>; the constant goes straight into
; v2 and the result is stored with a single buffer_store_dwordx3.
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v3f32_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
256
; Index 3 is out of bounds for a <3 x float>, so the insertelement result is
; undefined; codegen collapses the whole kernel to a bare s_endpgm.
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
265
; Non-kernel function: insert a constant 0 into element 0 of a loaded <4 x i32>
; that must stay in SGPRs because it feeds the sampler operand of
; image_gather4_lz (s12 is overwritten with 0 after the load).
define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s12, 0
; GCN-NEXT:    s_mov_b32 s4, s12
; GCN-NEXT:    s_mov_b32 s5, s12
; GCN-NEXT:    s_mov_b32 s6, s12
; GCN-NEXT:    s_mov_b32 s7, s12
; GCN-NEXT:    s_mov_b32 s8, s12
; GCN-NEXT:    s_mov_b32 s9, s12
; GCN-NEXT:    s_mov_b32 s10, s12
; GCN-NEXT:    s_mov_b32 s11, s12
; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}
289
; Dynamic (variable-index) insert of 5.0 into a <2 x float>: each lane is
; selected with v_cmp_ne_u32 against the index %b plus v_cndmask.
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}
330
; Dynamic insert of 5.0 into a <3 x float>: one cmp_ne/cndmask pair per lane
; against the index %b, then a single buffer_store_dwordx3.
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    v_mov_b32_e32 v3, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    v_mov_b32_e32 v3, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
377
; Dynamic insert of 5.0 into a <4 x float>: four cmp_ne/cndmask pairs (lanes
; 3..0) against the index %b, then one buffer_store_dwordx4.
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
430
; Dynamic insert of 5.0 into a <8 x float>: eight cmp_ne/cndmask pairs against
; the index %b, result written back as two 128-bit stores (offset:16 and 0).
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v8, s12
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40a00000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v8, s12
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}
509
; Dynamic insert of 5.0 into a <16 x float>: at this width codegen switches
; from per-lane cndmask to indexed register write (s_mov_b32 m0 = index,
; v_movreld_b32), then four 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    s_mov_b32 m0, s4
; SI-NEXT:    v_movreld_b32_e32 v0, v16
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    s_mov_b32 m0, s4
; VI-NEXT:    v_movreld_b32_e32 v0, v16
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}
580
; Dynamic insert of i32 5 into a <2 x i32>: the small constant fits in a
; cndmask inline operand, so no scratch register is needed for it.
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
619
; Dynamic insert of i32 5 into a <3 x i32>: cmp_ne/cndmask per lane with the
; inline-immediate 5, then a single buffer_store_dwordx3.
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
  ret void
}
664
; Dynamic insert of a non-constant value %val (separated from %b by [8 x i32]
; of kernarg padding, hence the extra s_load_dword at 0x11/0x44) into a
; <4 x i32>; note the inverted cmp_eq + cndmask operand order vs the
; constant-insert tests above.
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
719
; Dynamic insert of i32 5 into a <8 x i32>: eight cmp_ne/cndmask pairs with
; the inline-immediate 5, result written back as two 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s15
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s14
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s13
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s15
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s14
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s13
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}
796
797define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
798; SI-LABEL: dynamic_insertelement_v16i32:
799; SI:       ; %bb.0:
800; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
801; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
802; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
803; SI-NEXT:    s_mov_b32 s3, 0x100f000
804; SI-NEXT:    s_mov_b32 s2, -1
805; SI-NEXT:    s_waitcnt lgkmcnt(0)
806; SI-NEXT:    v_mov_b32_e32 v0, s8
807; SI-NEXT:    v_mov_b32_e32 v1, s9
808; SI-NEXT:    v_mov_b32_e32 v2, s10
809; SI-NEXT:    v_mov_b32_e32 v3, s11
810; SI-NEXT:    v_mov_b32_e32 v4, s12
811; SI-NEXT:    v_mov_b32_e32 v5, s13
812; SI-NEXT:    v_mov_b32_e32 v6, s14
813; SI-NEXT:    v_mov_b32_e32 v7, s15
814; SI-NEXT:    v_mov_b32_e32 v8, s16
815; SI-NEXT:    v_mov_b32_e32 v9, s17
816; SI-NEXT:    v_mov_b32_e32 v10, s18
817; SI-NEXT:    v_mov_b32_e32 v11, s19
818; SI-NEXT:    v_mov_b32_e32 v12, s20
819; SI-NEXT:    v_mov_b32_e32 v13, s21
820; SI-NEXT:    v_mov_b32_e32 v14, s22
821; SI-NEXT:    v_mov_b32_e32 v15, s23
822; SI-NEXT:    s_mov_b32 m0, s4
823; SI-NEXT:    v_movreld_b32_e32 v0, 5
824; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
825; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
826; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
827; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
828; SI-NEXT:    s_endpgm
829;
830; VI-LABEL: dynamic_insertelement_v16i32:
831; VI:       ; %bb.0:
832; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
833; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
834; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
835; VI-NEXT:    s_mov_b32 s3, 0x1100f000
836; VI-NEXT:    s_mov_b32 s2, -1
837; VI-NEXT:    s_waitcnt lgkmcnt(0)
838; VI-NEXT:    v_mov_b32_e32 v0, s8
839; VI-NEXT:    v_mov_b32_e32 v1, s9
840; VI-NEXT:    v_mov_b32_e32 v2, s10
841; VI-NEXT:    v_mov_b32_e32 v3, s11
842; VI-NEXT:    v_mov_b32_e32 v4, s12
843; VI-NEXT:    v_mov_b32_e32 v5, s13
844; VI-NEXT:    v_mov_b32_e32 v6, s14
845; VI-NEXT:    v_mov_b32_e32 v7, s15
846; VI-NEXT:    v_mov_b32_e32 v8, s16
847; VI-NEXT:    v_mov_b32_e32 v9, s17
848; VI-NEXT:    v_mov_b32_e32 v10, s18
849; VI-NEXT:    v_mov_b32_e32 v11, s19
850; VI-NEXT:    v_mov_b32_e32 v12, s20
851; VI-NEXT:    v_mov_b32_e32 v13, s21
852; VI-NEXT:    v_mov_b32_e32 v14, s22
853; VI-NEXT:    v_mov_b32_e32 v15, s23
854; VI-NEXT:    s_mov_b32 m0, s4
855; VI-NEXT:    v_movreld_b32_e32 v0, 5
856; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
857; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
858; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
859; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
860; VI-NEXT:    s_endpgm
861  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
862  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
863  ret void
864}
865
866define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
867; SI-LABEL: dynamic_insertelement_v2i16:
868; SI:       ; %bb.0:
869; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
870; SI-NEXT:    s_load_dword s6, s[4:5], 0x2
871; SI-NEXT:    s_load_dword s4, s[4:5], 0x3
872; SI-NEXT:    v_mov_b32_e32 v0, 0x50005
873; SI-NEXT:    s_mov_b32 s3, 0x100f000
874; SI-NEXT:    s_mov_b32 s2, -1
875; SI-NEXT:    s_waitcnt lgkmcnt(0)
876; SI-NEXT:    v_mov_b32_e32 v1, s6
877; SI-NEXT:    s_lshl_b32 s4, s4, 4
878; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
879; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
880; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
881; SI-NEXT:    s_endpgm
882;
883; VI-LABEL: dynamic_insertelement_v2i16:
884; VI:       ; %bb.0:
885; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
886; VI-NEXT:    s_load_dword s6, s[4:5], 0x8
887; VI-NEXT:    s_load_dword s4, s[4:5], 0xc
888; VI-NEXT:    v_mov_b32_e32 v0, 0x50005
889; VI-NEXT:    s_mov_b32 s3, 0x1100f000
890; VI-NEXT:    s_mov_b32 s2, -1
891; VI-NEXT:    s_waitcnt lgkmcnt(0)
892; VI-NEXT:    v_mov_b32_e32 v1, s6
893; VI-NEXT:    s_lshl_b32 s4, s4, 4
894; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
895; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
896; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
897; VI-NEXT:    s_endpgm
898  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
899  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
900  ret void
901}
902
903define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
904; SI-LABEL: dynamic_insertelement_v3i16:
905; SI:       ; %bb.0:
906; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
907; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2
908; SI-NEXT:    s_load_dword s4, s[4:5], 0x4
909; SI-NEXT:    s_mov_b32 s5, 0
910; SI-NEXT:    s_mov_b32 s3, 0x100f000
911; SI-NEXT:    s_mov_b32 s2, -1
912; SI-NEXT:    s_waitcnt lgkmcnt(0)
913; SI-NEXT:    s_lshl_b32 s8, s4, 4
914; SI-NEXT:    s_mov_b32 s4, 0xffff
915; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
916; SI-NEXT:    s_mov_b32 s8, 0x50005
917; SI-NEXT:    s_and_b32 s9, s5, s8
918; SI-NEXT:    s_and_b32 s8, s4, s8
919; SI-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
920; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
921; SI-NEXT:    v_mov_b32_e32 v0, s5
922; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
923; SI-NEXT:    v_mov_b32_e32 v0, s4
924; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
925; SI-NEXT:    s_endpgm
926;
927; VI-LABEL: dynamic_insertelement_v3i16:
928; VI:       ; %bb.0:
929; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
930; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
931; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
932; VI-NEXT:    s_mov_b32 s5, 0
933; VI-NEXT:    s_mov_b32 s3, 0x1100f000
934; VI-NEXT:    s_mov_b32 s2, -1
935; VI-NEXT:    s_waitcnt lgkmcnt(0)
936; VI-NEXT:    v_mov_b32_e32 v1, s7
937; VI-NEXT:    s_lshl_b32 s8, s4, 4
938; VI-NEXT:    s_mov_b32 s4, 0xffff
939; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
940; VI-NEXT:    s_mov_b32 s8, 0x50005
941; VI-NEXT:    v_mov_b32_e32 v0, s8
942; VI-NEXT:    v_bfi_b32 v0, s5, v0, v1
943; VI-NEXT:    v_mov_b32_e32 v1, s8
944; VI-NEXT:    v_mov_b32_e32 v2, s6
945; VI-NEXT:    v_bfi_b32 v1, s4, v1, v2
946; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
947; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
948; VI-NEXT:    s_endpgm
949  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
950  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
951  ret void
952}
953
954define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
955; SI-LABEL: dynamic_insertelement_v2i8:
956; SI:       ; %bb.0:
957; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
958; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
959; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
960; SI-NEXT:    v_mov_b32_e32 v0, 0x505
961; SI-NEXT:    s_mov_b32 s3, 0x100f000
962; SI-NEXT:    s_mov_b32 s2, -1
963; SI-NEXT:    s_waitcnt lgkmcnt(0)
964; SI-NEXT:    v_mov_b32_e32 v1, s6
965; SI-NEXT:    s_lshl_b32 s4, s4, 3
966; SI-NEXT:    s_lshl_b32 s4, -1, s4
967; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
968; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
969; SI-NEXT:    s_endpgm
970;
971; VI-LABEL: dynamic_insertelement_v2i8:
972; VI:       ; %bb.0:
973; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
974; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
975; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
976; VI-NEXT:    s_mov_b32 s3, 0x1100f000
977; VI-NEXT:    s_mov_b32 s2, -1
978; VI-NEXT:    s_waitcnt lgkmcnt(0)
979; VI-NEXT:    s_lshl_b32 s4, s4, 3
980; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
981; VI-NEXT:    v_and_b32_e32 v1, 0x505, v0
982; VI-NEXT:    v_xor_b32_e32 v0, -1, v0
983; VI-NEXT:    v_and_b32_e32 v0, s6, v0
984; VI-NEXT:    v_or_b32_e32 v0, v1, v0
985; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
986; VI-NEXT:    s_endpgm
987  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
988  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
989  ret void
990}
991
992; FIXME: post legalize i16 and i32 shifts aren't merged because of
993; isTypeDesirableForOp in SimplifyDemandedBits
994define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
995; SI-LABEL: dynamic_insertelement_v3i8:
996; SI:       ; %bb.0:
997; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
998; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
999; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
1000; SI-NEXT:    v_mov_b32_e32 v0, 0x5050505
1001; SI-NEXT:    s_mov_b32 s3, 0x100f000
1002; SI-NEXT:    s_mov_b32 s2, -1
1003; SI-NEXT:    s_waitcnt lgkmcnt(0)
1004; SI-NEXT:    v_mov_b32_e32 v1, s6
1005; SI-NEXT:    s_lshl_b32 s4, s4, 3
1006; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
1007; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
1008; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1009; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1010; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
1011; SI-NEXT:    s_endpgm
1012;
1013; VI-LABEL: dynamic_insertelement_v3i8:
1014; VI:       ; %bb.0:
1015; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1016; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
1017; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
1018; VI-NEXT:    v_mov_b32_e32 v0, 0x5050505
1019; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1020; VI-NEXT:    s_mov_b32 s2, -1
1021; VI-NEXT:    s_waitcnt lgkmcnt(0)
1022; VI-NEXT:    v_mov_b32_e32 v1, s6
1023; VI-NEXT:    s_lshl_b32 s4, s4, 3
1024; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
1025; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
1026; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1027; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1028; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
1029; VI-NEXT:    s_endpgm
1030  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1031  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
1032  ret void
1033}
1034
1035define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1036; SI-LABEL: dynamic_insertelement_v4i8:
1037; SI:       ; %bb.0:
1038; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1039; SI-NEXT:    s_load_dword s6, s[4:5], 0xa
1040; SI-NEXT:    s_load_dword s4, s[4:5], 0x13
1041; SI-NEXT:    v_mov_b32_e32 v0, 0x5050505
1042; SI-NEXT:    s_mov_b32 s3, 0x100f000
1043; SI-NEXT:    s_mov_b32 s2, -1
1044; SI-NEXT:    s_waitcnt lgkmcnt(0)
1045; SI-NEXT:    v_mov_b32_e32 v1, s6
1046; SI-NEXT:    s_lshl_b32 s4, s4, 3
1047; SI-NEXT:    s_lshl_b32 s4, 0xffff, s4
1048; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
1049; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1050; SI-NEXT:    s_endpgm
1051;
1052; VI-LABEL: dynamic_insertelement_v4i8:
1053; VI:       ; %bb.0:
1054; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1055; VI-NEXT:    s_load_dword s6, s[4:5], 0x28
1056; VI-NEXT:    s_load_dword s4, s[4:5], 0x4c
1057; VI-NEXT:    v_mov_b32_e32 v0, 0x5050505
1058; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1059; VI-NEXT:    s_mov_b32 s2, -1
1060; VI-NEXT:    s_waitcnt lgkmcnt(0)
1061; VI-NEXT:    v_mov_b32_e32 v1, s6
1062; VI-NEXT:    s_lshl_b32 s4, s4, 3
1063; VI-NEXT:    s_lshl_b32 s4, 0xffff, s4
1064; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
1065; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1066; VI-NEXT:    s_endpgm
1067  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1068  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
1069  ret void
1070}
1071
1072define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1073; SI-LABEL: s_dynamic_insertelement_v8i8:
1074; SI:       ; %bb.0:
1075; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
1076; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
1077; SI-NEXT:    s_mov_b32 s7, 0
1078; SI-NEXT:    s_mov_b32 s3, 0x100f000
1079; SI-NEXT:    s_mov_b32 s2, -1
1080; SI-NEXT:    s_waitcnt lgkmcnt(0)
1081; SI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
1082; SI-NEXT:    s_mov_b32 s0, s8
1083; SI-NEXT:    s_lshl_b32 s8, s6, 3
1084; SI-NEXT:    s_mov_b32 s6, 0xffff
1085; SI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
1086; SI-NEXT:    s_mov_b32 s8, 0x5050505
1087; SI-NEXT:    s_mov_b32 s1, s9
1088; SI-NEXT:    s_and_b32 s9, s7, s8
1089; SI-NEXT:    s_and_b32 s8, s6, s8
1090; SI-NEXT:    s_waitcnt lgkmcnt(0)
1091; SI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
1092; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
1093; SI-NEXT:    v_mov_b32_e32 v0, s4
1094; SI-NEXT:    v_mov_b32_e32 v1, s5
1095; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1096; SI-NEXT:    s_endpgm
1097;
1098; VI-LABEL: s_dynamic_insertelement_v8i8:
1099; VI:       ; %bb.0:
1100; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
1101; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
1102; VI-NEXT:    s_mov_b32 s7, 0
1103; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1104; VI-NEXT:    s_mov_b32 s2, -1
1105; VI-NEXT:    s_waitcnt lgkmcnt(0)
1106; VI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
1107; VI-NEXT:    s_mov_b32 s0, s8
1108; VI-NEXT:    s_lshl_b32 s8, s6, 3
1109; VI-NEXT:    s_mov_b32 s6, 0xffff
1110; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s8
1111; VI-NEXT:    s_mov_b32 s8, 0x5050505
1112; VI-NEXT:    s_mov_b32 s1, s9
1113; VI-NEXT:    s_and_b32 s9, s7, s8
1114; VI-NEXT:    s_and_b32 s8, s6, s8
1115; VI-NEXT:    s_waitcnt lgkmcnt(0)
1116; VI-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
1117; VI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
1118; VI-NEXT:    v_mov_b32_e32 v0, s4
1119; VI-NEXT:    v_mov_b32_e32 v1, s5
1120; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1121; VI-NEXT:    s_endpgm
1122  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1123  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1124  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
1125  ret void
1126}
1127
1128define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1129; SI-LABEL: dynamic_insertelement_v16i8:
1130; SI:       ; %bb.0:
1131; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1132; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
1133; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
1134; SI-NEXT:    s_mov_b32 s3, 0x100f000
1135; SI-NEXT:    s_mov_b32 s2, -1
1136; SI-NEXT:    s_waitcnt lgkmcnt(0)
1137; SI-NEXT:    s_lshr_b32 s5, s11, 24
1138; SI-NEXT:    v_mov_b32_e32 v0, s5
1139; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
1140; SI-NEXT:    s_lshr_b32 s5, s11, 16
1141; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1142; SI-NEXT:    v_mov_b32_e32 v1, s5
1143; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
1144; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1145; SI-NEXT:    s_movk_i32 s5, 0xff
1146; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1147; SI-NEXT:    v_and_b32_e32 v1, s5, v1
1148; SI-NEXT:    s_lshr_b32 s6, s11, 8
1149; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1150; SI-NEXT:    v_mov_b32_e32 v1, s6
1151; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
1152; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1153; SI-NEXT:    v_mov_b32_e32 v2, s11
1154; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
1155; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1156; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1157; SI-NEXT:    v_and_b32_e32 v2, s5, v2
1158; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1159; SI-NEXT:    s_mov_b32 s6, 0xffff
1160; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1161; SI-NEXT:    v_and_b32_e32 v1, s6, v1
1162; SI-NEXT:    s_lshr_b32 s7, s10, 24
1163; SI-NEXT:    v_or_b32_e32 v3, v1, v0
1164; SI-NEXT:    v_mov_b32_e32 v0, s7
1165; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
1166; SI-NEXT:    s_lshr_b32 s7, s10, 16
1167; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1168; SI-NEXT:    v_mov_b32_e32 v1, s7
1169; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
1170; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1171; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1172; SI-NEXT:    v_and_b32_e32 v1, s5, v1
1173; SI-NEXT:    s_lshr_b32 s7, s10, 8
1174; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1175; SI-NEXT:    v_mov_b32_e32 v1, s7
1176; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
1177; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1178; SI-NEXT:    v_mov_b32_e32 v2, s10
1179; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
1180; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1181; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1182; SI-NEXT:    v_and_b32_e32 v2, s5, v2
1183; SI-NEXT:    v_or_b32_e32 v1, v2, v1
1184; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1185; SI-NEXT:    v_and_b32_e32 v1, s6, v1
1186; SI-NEXT:    s_lshr_b32 s7, s9, 24
1187; SI-NEXT:    v_or_b32_e32 v2, v1, v0
1188; SI-NEXT:    v_mov_b32_e32 v0, s7
1189; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
1190; SI-NEXT:    s_lshr_b32 s7, s9, 16
1191; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1192; SI-NEXT:    v_mov_b32_e32 v1, s7
1193; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
1194; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1195; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1196; SI-NEXT:    v_and_b32_e32 v1, s5, v1
1197; SI-NEXT:    s_lshr_b32 s7, s9, 8
1198; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1199; SI-NEXT:    v_mov_b32_e32 v1, s7
1200; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
1201; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1202; SI-NEXT:    v_mov_b32_e32 v4, s9
1203; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
1204; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1205; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1206; SI-NEXT:    v_and_b32_e32 v4, s5, v4
1207; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1208; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1209; SI-NEXT:    v_and_b32_e32 v1, s6, v1
1210; SI-NEXT:    s_lshr_b32 s7, s8, 24
1211; SI-NEXT:    v_or_b32_e32 v1, v1, v0
1212; SI-NEXT:    v_mov_b32_e32 v0, s7
1213; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
1214; SI-NEXT:    s_lshr_b32 s7, s8, 16
1215; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1216; SI-NEXT:    v_mov_b32_e32 v4, s7
1217; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
1218; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1219; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1220; SI-NEXT:    v_and_b32_e32 v4, s5, v4
1221; SI-NEXT:    s_lshr_b32 s7, s8, 8
1222; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1223; SI-NEXT:    v_mov_b32_e32 v4, s7
1224; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
1225; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1226; SI-NEXT:    v_mov_b32_e32 v5, s8
1227; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
1228; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1229; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1230; SI-NEXT:    v_and_b32_e32 v5, s5, v5
1231; SI-NEXT:    v_or_b32_e32 v4, v5, v4
1232; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1233; SI-NEXT:    v_and_b32_e32 v4, s6, v4
1234; SI-NEXT:    v_or_b32_e32 v0, v4, v0
1235; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1236; SI-NEXT:    s_endpgm
1237;
1238; VI-LABEL: dynamic_insertelement_v16i8:
1239; VI:       ; %bb.0:
1240; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1241; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1242; VI-NEXT:    s_load_dword s4, s[4:5], 0x20
1243; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1244; VI-NEXT:    s_mov_b32 s2, -1
1245; VI-NEXT:    s_waitcnt lgkmcnt(0)
1246; VI-NEXT:    s_lshr_b32 s5, s11, 24
1247; VI-NEXT:    v_mov_b32_e32 v0, s5
1248; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
1249; VI-NEXT:    s_lshr_b32 s5, s11, 16
1250; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1251; VI-NEXT:    v_mov_b32_e32 v1, s5
1252; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
1253; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1254; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1255; VI-NEXT:    s_lshr_b32 s5, s11, 8
1256; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1257; VI-NEXT:    v_mov_b32_e32 v1, s5
1258; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
1259; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1260; VI-NEXT:    v_mov_b32_e32 v2, s11
1261; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
1262; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1263; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1264; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1265; VI-NEXT:    s_lshr_b32 s5, s10, 24
1266; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1267; VI-NEXT:    v_mov_b32_e32 v0, s5
1268; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
1269; VI-NEXT:    s_lshr_b32 s5, s10, 16
1270; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1271; VI-NEXT:    v_mov_b32_e32 v1, s5
1272; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
1273; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1274; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1275; VI-NEXT:    s_lshr_b32 s5, s10, 8
1276; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1277; VI-NEXT:    v_mov_b32_e32 v1, s5
1278; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
1279; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1280; VI-NEXT:    v_mov_b32_e32 v2, s10
1281; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
1282; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
1283; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1284; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1285; VI-NEXT:    s_lshr_b32 s5, s9, 24
1286; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1287; VI-NEXT:    v_mov_b32_e32 v0, s5
1288; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
1289; VI-NEXT:    s_lshr_b32 s5, s9, 16
1290; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1291; VI-NEXT:    v_mov_b32_e32 v1, s5
1292; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
1293; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1294; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1295; VI-NEXT:    s_lshr_b32 s5, s9, 8
1296; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1297; VI-NEXT:    v_mov_b32_e32 v1, s5
1298; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
1299; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
1300; VI-NEXT:    v_mov_b32_e32 v4, s9
1301; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
1302; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1303; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1304; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1305; VI-NEXT:    s_lshr_b32 s5, s8, 24
1306; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1307; VI-NEXT:    v_mov_b32_e32 v0, s5
1308; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
1309; VI-NEXT:    s_lshr_b32 s5, s8, 16
1310; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
1311; VI-NEXT:    v_mov_b32_e32 v4, s5
1312; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
1313; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1314; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
1315; VI-NEXT:    s_lshr_b32 s5, s8, 8
1316; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1317; VI-NEXT:    v_mov_b32_e32 v4, s5
1318; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
1319; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
1320; VI-NEXT:    v_mov_b32_e32 v5, s8
1321; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
1322; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
1323; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
1324; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1325; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1326; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1327; VI-NEXT:    s_endpgm
1328  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1329  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1330  ret void
1331}
1332
1333; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1334; the compiler doesn't crash.
1335define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1336; SI-LABEL: insert_split_bb:
1337; SI:       ; %bb.0: ; %entry
1338; SI-NEXT:    s_load_dword s0, s[4:5], 0x4
1339; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
1340; SI-NEXT:    s_waitcnt lgkmcnt(0)
1341; SI-NEXT:    s_cmp_lg_u32 s0, 0
1342; SI-NEXT:    s_cbranch_scc0 BB26_2
1343; SI-NEXT:  ; %bb.1: ; %else
1344; SI-NEXT:    s_load_dword s1, s[6:7], 0x1
1345; SI-NEXT:    s_branch BB26_3
1346; SI-NEXT:  BB26_2: ; %if
1347; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
1348; SI-NEXT:  BB26_3: ; %endif
1349; SI-NEXT:    s_waitcnt lgkmcnt(0)
1350; SI-NEXT:    v_mov_b32_e32 v0, s0
1351; SI-NEXT:    s_mov_b32 s7, 0x100f000
1352; SI-NEXT:    s_mov_b32 s6, -1
1353; SI-NEXT:    v_mov_b32_e32 v1, s1
1354; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1355; SI-NEXT:    s_endpgm
1356;
1357; VI-LABEL: insert_split_bb:
1358; VI:       ; %bb.0: ; %entry
1359; VI-NEXT:    s_load_dword s0, s[4:5], 0x10
1360; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
1361; VI-NEXT:    s_waitcnt lgkmcnt(0)
1362; VI-NEXT:    s_cmp_lg_u32 s0, 0
1363; VI-NEXT:    s_cbranch_scc0 BB26_2
1364; VI-NEXT:  ; %bb.1: ; %else
1365; VI-NEXT:    s_load_dword s1, s[6:7], 0x4
1366; VI-NEXT:    s_branch BB26_3
1367; VI-NEXT:  BB26_2: ; %if
1368; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
1369; VI-NEXT:  BB26_3: ; %endif
1370; VI-NEXT:    s_waitcnt lgkmcnt(0)
1371; VI-NEXT:    v_mov_b32_e32 v0, s0
1372; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1373; VI-NEXT:    s_mov_b32 s6, -1
1374; VI-NEXT:    v_mov_b32_e32 v1, s1
1375; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1376; VI-NEXT:    s_endpgm
1377entry:
1378  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1379  %1 = icmp eq i32 %a, 0
1380  br i1 %1, label %if, label %else
1381
1382if:
1383  %2 = load i32, i32 addrspace(1)* %in
1384  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1385  br label %endif
1386
1387else:
1388  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1389  %5 = load i32, i32 addrspace(1)* %4
1390  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1391  br label %endif
1392
1393endif:
1394  %7 = phi <2 x i32> [%3, %if], [%6, %else]
1395  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
1396  ret void
1397}
1398
1399define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1400; SI-LABEL: dynamic_insertelement_v2f64:
1401; SI:       ; %bb.0:
1402; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1403; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xc
1404; SI-NEXT:    s_load_dword s4, s[4:5], 0x18
1405; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1406; SI-NEXT:    s_mov_b32 s3, 0x100f000
1407; SI-NEXT:    s_mov_b32 s2, -1
1408; SI-NEXT:    s_waitcnt lgkmcnt(0)
1409; SI-NEXT:    v_mov_b32_e32 v0, s11
1410; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1411; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1412; SI-NEXT:    v_mov_b32_e32 v0, s10
1413; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1414; SI-NEXT:    v_mov_b32_e32 v0, s9
1415; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
1416; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1417; SI-NEXT:    v_mov_b32_e32 v0, s8
1418; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1419; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1420; SI-NEXT:    s_endpgm
1421;
1422; VI-LABEL: dynamic_insertelement_v2f64:
1423; VI:       ; %bb.0:
1424; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1425; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x30
1426; VI-NEXT:    s_load_dword s4, s[4:5], 0x60
1427; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
1428; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1429; VI-NEXT:    s_mov_b32 s2, -1
1430; VI-NEXT:    s_waitcnt lgkmcnt(0)
1431; VI-NEXT:    v_mov_b32_e32 v0, s11
1432; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1433; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1434; VI-NEXT:    v_mov_b32_e32 v0, s10
1435; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
1436; VI-NEXT:    v_mov_b32_e32 v0, s9
1437; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
1438; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
1439; VI-NEXT:    v_mov_b32_e32 v0, s8
1440; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1441; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1442; VI-NEXT:    s_endpgm
1443  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1444  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
1445  ret void
1446}
1447
; Dynamic (variable-index) insert of the constant i64 5 into a <2 x i64>.
; With +max-private-element-size-16 the backend avoids a private-stack
; round-trip: each 64-bit lane is split into two 32-bit halves and selected
; with v_cmp_eq_u32 + v_cndmask against the index %b (5 for the low dword,
; 0 for the high dword of the inserted value), then stored as one dwordx4.
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
1494
; Same compare+select lowering for an odd-width <3 x i64>: three
; v_cmp_eq_u32 / six v_cndmask pairs. The 24-byte result is written as a
; dwordx2 at offset 16 (third element) plus a dwordx4 for the first two.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 2
; SI-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s13
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 2
; VI-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
1553
; Dynamic insert of double 8.0 (bit pattern 0x4020000000000000) into a
; <4 x double>. The high dword 0x40200000 is materialized once in v4 and
; selected per lane with v_cndmask; the low dword of 8.0 is 0, so those
; selects use the inline-constant 0 operand form. Result goes out as two
; dwordx4 stores (offset:16 then base).
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT:    s_load_dword s4, s[4:5], 0x10
; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s15
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s14
; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT:    s_load_dword s4, s[4:5], 0x40
; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s15
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s14
; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
1624
; At <8 x double> the vector is too large for the compare/select lowering:
; the whole vector is spilled to private scratch (four dwordx4 stores at
; offsets 64..112), 8.0 is written as a dwordx2 at the index-derived offset
; ((%b & 7) << 3, OR'd with the base 64), and the vector is reloaded and
; copied to the global output.
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
; SI-NEXT:    v_mov_b32_e32 v16, 64
; SI-NEXT:    s_mov_b32 s27, 0x100f000
; SI-NEXT:    s_mov_b32 s26, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_and_b32 s4, s4, 7
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v5, s13
; SI-NEXT:    v_mov_b32_e32 v6, s14
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_mov_b32_e32 v8, s16
; SI-NEXT:    v_mov_b32_e32 v9, s17
; SI-NEXT:    v_mov_b32_e32 v10, s18
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_mov_b32_e32 v12, s20
; SI-NEXT:    v_mov_b32_e32 v13, s21
; SI-NEXT:    v_mov_b32_e32 v14, s22
; SI-NEXT:    v_mov_b32_e32 v15, s23
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; SI-NEXT:    v_or_b32_e32 v16, s4, v16
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; SI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[24:27], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
; VI-NEXT:    v_mov_b32_e32 v16, 64
; VI-NEXT:    s_mov_b32 s27, 0x1100f000
; VI-NEXT:    s_mov_b32 s26, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_and_b32 s4, s4, 7
; VI-NEXT:    s_lshl_b32 s4, s4, 3
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    v_mov_b32_e32 v6, s14
; VI-NEXT:    v_mov_b32_e32 v7, s15
; VI-NEXT:    v_mov_b32_e32 v8, s16
; VI-NEXT:    v_mov_b32_e32 v9, s17
; VI-NEXT:    v_mov_b32_e32 v10, s18
; VI-NEXT:    v_mov_b32_e32 v11, s19
; VI-NEXT:    v_mov_b32_e32 v12, s20
; VI-NEXT:    v_mov_b32_e32 v13, s21
; VI-NEXT:    v_mov_b32_e32 v14, s22
; VI-NEXT:    v_mov_b32_e32 v15, s23
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; VI-NEXT:    v_or_b32_e32 v16, s4, v16
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
; VI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[24:27], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
1721
1722declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1723
1724attributes #0 = { nounwind }
1725attributes #1 = { nounwind readnone }
1726